xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_queue.c (revision c3a66015)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22a3f829aeSBill Moore  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26283b8460SGeorge.Wilson /*
27283b8460SGeorge.Wilson  * Copyright (c) 2012 by Delphix. All rights reserved.
28283b8460SGeorge.Wilson  */
29283b8460SGeorge.Wilson 
30fa9e4066Sahrens #include <sys/zfs_context.h>
31fa9e4066Sahrens #include <sys/vdev_impl.h>
32*c3a66015SMatthew Ahrens #include <sys/spa_impl.h>
33fa9e4066Sahrens #include <sys/zio.h>
34fa9e4066Sahrens #include <sys/avl.h>
35fa9e4066Sahrens 
/*
 * These tunables are for performance analysis.
 */
/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device.  zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * max_pending).
 *
 * vdev_queue_io() issues against the min_pending limit when an i/o is
 * first queued; vdev_queue_io_done() issues against the max_pending
 * limit each time an i/o completes.
 */
int zfs_vdev_max_pending = 10;
int zfs_vdev_min_pending = 4;

/*
 * deadline = pri + (ddi_get_lbolt64() >> time_shift)
 *
 * The shift is applied to the timestamp only; the priority is added
 * afterwards (see vdev_queue_io()).
 */
int zfs_vdev_time_shift = 6;

/*
 * Exponential I/O issue ramp-up rate: each completed i/o may cause up to
 * this many queued i/os to be issued (see vdev_queue_io_done()).
 */
int zfs_vdev_ramp_rate = 2;
53614409b5Sahrens 
54614409b5Sahrens /*
55f94275ceSAdam Leventhal  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
56f94275ceSAdam Leventhal  * For read I/Os, we also aggregate across small adjacency gaps; for writes
57f94275ceSAdam Leventhal  * we include spans of optional I/Os to aid aggregation at the disk even when
58f94275ceSAdam Leventhal  * they aren't able to help us aggregate at this level.
59614409b5Sahrens  */
60614409b5Sahrens int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
616f708f7cSJeff Bonwick int zfs_vdev_read_gap_limit = 32 << 10;
62f94275ceSAdam Leventhal int zfs_vdev_write_gap_limit = 4 << 10;
63614409b5Sahrens 
64fa9e4066Sahrens /*
65fa9e4066Sahrens  * Virtual device vector for disk I/O scheduling.
66fa9e4066Sahrens  */
67fa9e4066Sahrens int
68fa9e4066Sahrens vdev_queue_deadline_compare(const void *x1, const void *x2)
69fa9e4066Sahrens {
70fa9e4066Sahrens 	const zio_t *z1 = x1;
71fa9e4066Sahrens 	const zio_t *z2 = x2;
72fa9e4066Sahrens 
73fa9e4066Sahrens 	if (z1->io_deadline < z2->io_deadline)
74fa9e4066Sahrens 		return (-1);
75fa9e4066Sahrens 	if (z1->io_deadline > z2->io_deadline)
76fa9e4066Sahrens 		return (1);
77fa9e4066Sahrens 
78fa9e4066Sahrens 	if (z1->io_offset < z2->io_offset)
79fa9e4066Sahrens 		return (-1);
80fa9e4066Sahrens 	if (z1->io_offset > z2->io_offset)
81fa9e4066Sahrens 		return (1);
82fa9e4066Sahrens 
83fa9e4066Sahrens 	if (z1 < z2)
84fa9e4066Sahrens 		return (-1);
85fa9e4066Sahrens 	if (z1 > z2)
86fa9e4066Sahrens 		return (1);
87fa9e4066Sahrens 
88fa9e4066Sahrens 	return (0);
89fa9e4066Sahrens }
90fa9e4066Sahrens 
91fa9e4066Sahrens int
92fa9e4066Sahrens vdev_queue_offset_compare(const void *x1, const void *x2)
93fa9e4066Sahrens {
94fa9e4066Sahrens 	const zio_t *z1 = x1;
95fa9e4066Sahrens 	const zio_t *z2 = x2;
96fa9e4066Sahrens 
97fa9e4066Sahrens 	if (z1->io_offset < z2->io_offset)
98fa9e4066Sahrens 		return (-1);
99fa9e4066Sahrens 	if (z1->io_offset > z2->io_offset)
100fa9e4066Sahrens 		return (1);
101fa9e4066Sahrens 
102fa9e4066Sahrens 	if (z1 < z2)
103fa9e4066Sahrens 		return (-1);
104fa9e4066Sahrens 	if (z1 > z2)
105fa9e4066Sahrens 		return (1);
106fa9e4066Sahrens 
107fa9e4066Sahrens 	return (0);
108fa9e4066Sahrens }
109fa9e4066Sahrens 
110fa9e4066Sahrens void
111fa9e4066Sahrens vdev_queue_init(vdev_t *vd)
112fa9e4066Sahrens {
113fa9e4066Sahrens 	vdev_queue_t *vq = &vd->vdev_queue;
114fa9e4066Sahrens 
115fa9e4066Sahrens 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
116fa9e4066Sahrens 
117fa9e4066Sahrens 	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
118fa9e4066Sahrens 	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));
119fa9e4066Sahrens 
120fa9e4066Sahrens 	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
121fa9e4066Sahrens 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
122fa9e4066Sahrens 
123fa9e4066Sahrens 	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
124fa9e4066Sahrens 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
125fa9e4066Sahrens 
126fa9e4066Sahrens 	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
127fa9e4066Sahrens 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
128fa9e4066Sahrens }
129fa9e4066Sahrens 
130fa9e4066Sahrens void
131fa9e4066Sahrens vdev_queue_fini(vdev_t *vd)
132fa9e4066Sahrens {
133fa9e4066Sahrens 	vdev_queue_t *vq = &vd->vdev_queue;
134fa9e4066Sahrens 
135fa9e4066Sahrens 	avl_destroy(&vq->vq_deadline_tree);
136fa9e4066Sahrens 	avl_destroy(&vq->vq_read_tree);
137fa9e4066Sahrens 	avl_destroy(&vq->vq_write_tree);
138fa9e4066Sahrens 	avl_destroy(&vq->vq_pending_tree);
139fa9e4066Sahrens 
140fa9e4066Sahrens 	mutex_destroy(&vq->vq_lock);
141fa9e4066Sahrens }
142fa9e4066Sahrens 
143ea8dc4b6Seschrock static void
144ea8dc4b6Seschrock vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
145ea8dc4b6Seschrock {
146*c3a66015SMatthew Ahrens 	spa_t *spa = zio->io_spa;
147ea8dc4b6Seschrock 	avl_add(&vq->vq_deadline_tree, zio);
148ea8dc4b6Seschrock 	avl_add(zio->io_vdev_tree, zio);
149*c3a66015SMatthew Ahrens 
150*c3a66015SMatthew Ahrens 	if (spa->spa_iokstat != NULL) {
151*c3a66015SMatthew Ahrens 		mutex_enter(&spa->spa_iokstat_lock);
152*c3a66015SMatthew Ahrens 		kstat_waitq_enter(spa->spa_iokstat->ks_data);
153*c3a66015SMatthew Ahrens 		mutex_exit(&spa->spa_iokstat_lock);
154*c3a66015SMatthew Ahrens 	}
155ea8dc4b6Seschrock }
156ea8dc4b6Seschrock 
157ea8dc4b6Seschrock static void
158ea8dc4b6Seschrock vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
159ea8dc4b6Seschrock {
160*c3a66015SMatthew Ahrens 	spa_t *spa = zio->io_spa;
161ea8dc4b6Seschrock 	avl_remove(&vq->vq_deadline_tree, zio);
162ea8dc4b6Seschrock 	avl_remove(zio->io_vdev_tree, zio);
163*c3a66015SMatthew Ahrens 
164*c3a66015SMatthew Ahrens 	if (spa->spa_iokstat != NULL) {
165*c3a66015SMatthew Ahrens 		mutex_enter(&spa->spa_iokstat_lock);
166*c3a66015SMatthew Ahrens 		kstat_waitq_exit(spa->spa_iokstat->ks_data);
167*c3a66015SMatthew Ahrens 		mutex_exit(&spa->spa_iokstat_lock);
168*c3a66015SMatthew Ahrens 	}
169*c3a66015SMatthew Ahrens }
170*c3a66015SMatthew Ahrens 
171*c3a66015SMatthew Ahrens static void
172*c3a66015SMatthew Ahrens vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
173*c3a66015SMatthew Ahrens {
174*c3a66015SMatthew Ahrens 	spa_t *spa = zio->io_spa;
175*c3a66015SMatthew Ahrens 	avl_add(&vq->vq_pending_tree, zio);
176*c3a66015SMatthew Ahrens 	if (spa->spa_iokstat != NULL) {
177*c3a66015SMatthew Ahrens 		mutex_enter(&spa->spa_iokstat_lock);
178*c3a66015SMatthew Ahrens 		kstat_runq_enter(spa->spa_iokstat->ks_data);
179*c3a66015SMatthew Ahrens 		mutex_exit(&spa->spa_iokstat_lock);
180*c3a66015SMatthew Ahrens 	}
181*c3a66015SMatthew Ahrens }
182*c3a66015SMatthew Ahrens 
183*c3a66015SMatthew Ahrens static void
184*c3a66015SMatthew Ahrens vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
185*c3a66015SMatthew Ahrens {
186*c3a66015SMatthew Ahrens 	spa_t *spa = zio->io_spa;
187*c3a66015SMatthew Ahrens 	avl_remove(&vq->vq_pending_tree, zio);
188*c3a66015SMatthew Ahrens 	if (spa->spa_iokstat != NULL) {
189*c3a66015SMatthew Ahrens 		kstat_io_t *ksio = spa->spa_iokstat->ks_data;
190*c3a66015SMatthew Ahrens 
191*c3a66015SMatthew Ahrens 		mutex_enter(&spa->spa_iokstat_lock);
192*c3a66015SMatthew Ahrens 		kstat_runq_exit(spa->spa_iokstat->ks_data);
193*c3a66015SMatthew Ahrens 		if (zio->io_type == ZIO_TYPE_READ) {
194*c3a66015SMatthew Ahrens 			ksio->reads++;
195*c3a66015SMatthew Ahrens 			ksio->nread += zio->io_size;
196*c3a66015SMatthew Ahrens 		} else if (zio->io_type == ZIO_TYPE_WRITE) {
197*c3a66015SMatthew Ahrens 			ksio->writes++;
198*c3a66015SMatthew Ahrens 			ksio->nwritten += zio->io_size;
199*c3a66015SMatthew Ahrens 		}
200*c3a66015SMatthew Ahrens 		mutex_exit(&spa->spa_iokstat_lock);
201*c3a66015SMatthew Ahrens 	}
202ea8dc4b6Seschrock }
203ea8dc4b6Seschrock 
204fa9e4066Sahrens static void
205fa9e4066Sahrens vdev_queue_agg_io_done(zio_t *aio)
206fa9e4066Sahrens {
207a3f829aeSBill Moore 	zio_t *pio;
208fa9e4066Sahrens 
209a3f829aeSBill Moore 	while ((pio = zio_walk_parents(aio)) != NULL)
210fa9e4066Sahrens 		if (aio->io_type == ZIO_TYPE_READ)
211a3f829aeSBill Moore 			bcopy((char *)aio->io_data + (pio->io_offset -
212a3f829aeSBill Moore 			    aio->io_offset), pio->io_data, pio->io_size);
213fa9e4066Sahrens 
214fa9e4066Sahrens 	zio_buf_free(aio->io_data, aio->io_size);
215fa9e4066Sahrens }
216fa9e4066Sahrens 
/*
 * IO_SPAN(fio, lio): the range covered by two i/os, i.e. the end of the
 * last one (lio->io_offset + lio->io_size) minus the start of the first
 * one (fio->io_offset).
 *
 * IO_GAP(fio, lio): the hole between the end of fio and the start of lio.
 * It is obtained by negating the span taken in the opposite order,
 * -IO_SPAN(lio, fio), so fio and lio are adjacent if and only if
 * IO_GAP(fio, lio) == 0.
 */
#define	IO_SPAN(fio, lio)	\
	((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio)	(-IO_SPAN(lio, fio))
225fa9e4066Sahrens 
226fa9e4066Sahrens static zio_t *
227e05725b1Sbonwick vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
228fa9e4066Sahrens {
229f94275ceSAdam Leventhal 	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
230a3f829aeSBill Moore 	avl_tree_t *t;
2318ad4d6ddSJeff Bonwick 	int flags;
2326f708f7cSJeff Bonwick 	uint64_t maxspan = zfs_vdev_aggregation_limit;
2336f708f7cSJeff Bonwick 	uint64_t maxgap;
234f94275ceSAdam Leventhal 	int stretch;
235fa9e4066Sahrens 
236f94275ceSAdam Leventhal again:
237fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&vq->vq_lock));
238fa9e4066Sahrens 
239fa9e4066Sahrens 	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
240fa9e4066Sahrens 	    avl_numnodes(&vq->vq_deadline_tree) == 0)
241fa9e4066Sahrens 		return (NULL);
242fa9e4066Sahrens 
243fa9e4066Sahrens 	fio = lio = avl_first(&vq->vq_deadline_tree);
244fa9e4066Sahrens 
245a3f829aeSBill Moore 	t = fio->io_vdev_tree;
2468ad4d6ddSJeff Bonwick 	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
2476f708f7cSJeff Bonwick 	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
2488ad4d6ddSJeff Bonwick 
2498ad4d6ddSJeff Bonwick 	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
2508ad4d6ddSJeff Bonwick 		/*
251f94275ceSAdam Leventhal 		 * We can aggregate I/Os that are sufficiently adjacent and of
252f94275ceSAdam Leventhal 		 * the same flavor, as expressed by the AGG_INHERIT flags.
253f94275ceSAdam Leventhal 		 * The latter requirement is necessary so that certain
254f94275ceSAdam Leventhal 		 * attributes of the I/O, such as whether it's a normal I/O
255f94275ceSAdam Leventhal 		 * or a scrub/resilver, can be preserved in the aggregate.
256f94275ceSAdam Leventhal 		 * We can include optional I/Os, but don't allow them
257f94275ceSAdam Leventhal 		 * to begin a range as they add no benefit in that situation.
258f94275ceSAdam Leventhal 		 */
259f94275ceSAdam Leventhal 
260f94275ceSAdam Leventhal 		/*
261f94275ceSAdam Leventhal 		 * We keep track of the last non-optional I/O.
262f94275ceSAdam Leventhal 		 */
263f94275ceSAdam Leventhal 		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
264f94275ceSAdam Leventhal 
265f94275ceSAdam Leventhal 		/*
266f94275ceSAdam Leventhal 		 * Walk backwards through sufficiently contiguous I/Os
267f94275ceSAdam Leventhal 		 * recording the last non-option I/O.
2688ad4d6ddSJeff Bonwick 		 */
269a3f829aeSBill Moore 		while ((dio = AVL_PREV(t, fio)) != NULL &&
2708ad4d6ddSJeff Bonwick 		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
271f94275ceSAdam Leventhal 		    IO_SPAN(dio, lio) <= maxspan &&
272f94275ceSAdam Leventhal 		    IO_GAP(dio, fio) <= maxgap) {
2738ad4d6ddSJeff Bonwick 			fio = dio;
274f94275ceSAdam Leventhal 			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
275f94275ceSAdam Leventhal 				mio = fio;
276f94275ceSAdam Leventhal 		}
277f94275ceSAdam Leventhal 
278f94275ceSAdam Leventhal 		/*
279f94275ceSAdam Leventhal 		 * Skip any initial optional I/Os.
280f94275ceSAdam Leventhal 		 */
281f94275ceSAdam Leventhal 		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
282f94275ceSAdam Leventhal 			fio = AVL_NEXT(t, fio);
283f94275ceSAdam Leventhal 			ASSERT(fio != NULL);
284f94275ceSAdam Leventhal 		}
2856f708f7cSJeff Bonwick 
286f94275ceSAdam Leventhal 		/*
287f94275ceSAdam Leventhal 		 * Walk forward through sufficiently contiguous I/Os.
288f94275ceSAdam Leventhal 		 */
289a3f829aeSBill Moore 		while ((dio = AVL_NEXT(t, lio)) != NULL &&
2908ad4d6ddSJeff Bonwick 		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
291f94275ceSAdam Leventhal 		    IO_SPAN(fio, dio) <= maxspan &&
292f94275ceSAdam Leventhal 		    IO_GAP(lio, dio) <= maxgap) {
2938ad4d6ddSJeff Bonwick 			lio = dio;
294f94275ceSAdam Leventhal 			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
295f94275ceSAdam Leventhal 				mio = lio;
296f94275ceSAdam Leventhal 		}
297f94275ceSAdam Leventhal 
298f94275ceSAdam Leventhal 		/*
299f94275ceSAdam Leventhal 		 * Now that we've established the range of the I/O aggregation
300f94275ceSAdam Leventhal 		 * we must decide what to do with trailing optional I/Os.
301f94275ceSAdam Leventhal 		 * For reads, there's nothing to do. While we are unable to
302f94275ceSAdam Leventhal 		 * aggregate further, it's possible that a trailing optional
303f94275ceSAdam Leventhal 		 * I/O would allow the underlying device to aggregate with
304f94275ceSAdam Leventhal 		 * subsequent I/Os. We must therefore determine if the next
305f94275ceSAdam Leventhal 		 * non-optional I/O is close enough to make aggregation
306f94275ceSAdam Leventhal 		 * worthwhile.
307f94275ceSAdam Leventhal 		 */
308f94275ceSAdam Leventhal 		stretch = B_FALSE;
309f94275ceSAdam Leventhal 		if (t != &vq->vq_read_tree && mio != NULL) {
310f94275ceSAdam Leventhal 			nio = lio;
311f94275ceSAdam Leventhal 			while ((dio = AVL_NEXT(t, nio)) != NULL &&
312f94275ceSAdam Leventhal 			    IO_GAP(nio, dio) == 0 &&
313f94275ceSAdam Leventhal 			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
314f94275ceSAdam Leventhal 				nio = dio;
315f94275ceSAdam Leventhal 				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
316f94275ceSAdam Leventhal 					stretch = B_TRUE;
317f94275ceSAdam Leventhal 					break;
318f94275ceSAdam Leventhal 				}
319f94275ceSAdam Leventhal 			}
320f94275ceSAdam Leventhal 		}
321f94275ceSAdam Leventhal 
322f94275ceSAdam Leventhal 		if (stretch) {
323f94275ceSAdam Leventhal 			/* This may be a no-op. */
324f94275ceSAdam Leventhal 			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
325f94275ceSAdam Leventhal 			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
326f94275ceSAdam Leventhal 		} else {
327f94275ceSAdam Leventhal 			while (lio != mio && lio != fio) {
328f94275ceSAdam Leventhal 				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
329f94275ceSAdam Leventhal 				lio = AVL_PREV(t, lio);
330f94275ceSAdam Leventhal 				ASSERT(lio != NULL);
331f94275ceSAdam Leventhal 			}
332f94275ceSAdam Leventhal 		}
333fa9e4066Sahrens 	}
334fa9e4066Sahrens 
335fa9e4066Sahrens 	if (fio != lio) {
3366f708f7cSJeff Bonwick 		uint64_t size = IO_SPAN(fio, lio);
337614409b5Sahrens 		ASSERT(size <= zfs_vdev_aggregation_limit);
338fa9e4066Sahrens 
339e14bb325SJeff Bonwick 		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
34080eb36f2SGeorge Wilson 		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
3418ad4d6ddSJeff Bonwick 		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
342fa9e4066Sahrens 		    vdev_queue_agg_io_done, NULL);
343283b8460SGeorge.Wilson 		aio->io_timestamp = fio->io_timestamp;
344fa9e4066Sahrens 
3456f708f7cSJeff Bonwick 		nio = fio;
3466f708f7cSJeff Bonwick 		do {
3476f708f7cSJeff Bonwick 			dio = nio;
3486f708f7cSJeff Bonwick 			nio = AVL_NEXT(t, dio);
349fa9e4066Sahrens 			ASSERT(dio->io_type == aio->io_type);
350a3f829aeSBill Moore 			ASSERT(dio->io_vdev_tree == t);
351a3f829aeSBill Moore 
352f94275ceSAdam Leventhal 			if (dio->io_flags & ZIO_FLAG_NODATA) {
353f94275ceSAdam Leventhal 				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
354f94275ceSAdam Leventhal 				bzero((char *)aio->io_data + (dio->io_offset -
355f94275ceSAdam Leventhal 				    aio->io_offset), dio->io_size);
356f94275ceSAdam Leventhal 			} else if (dio->io_type == ZIO_TYPE_WRITE) {
357a3f829aeSBill Moore 				bcopy(dio->io_data, (char *)aio->io_data +
358a3f829aeSBill Moore 				    (dio->io_offset - aio->io_offset),
359a3f829aeSBill Moore 				    dio->io_size);
360f94275ceSAdam Leventhal 			}
361a3f829aeSBill Moore 
362a3f829aeSBill Moore 			zio_add_child(dio, aio);
363ea8dc4b6Seschrock 			vdev_queue_io_remove(vq, dio);
364fa9e4066Sahrens 			zio_vdev_io_bypass(dio);
365a3f829aeSBill Moore 			zio_execute(dio);
3666f708f7cSJeff Bonwick 		} while (dio != lio);
367fa9e4066Sahrens 
368*c3a66015SMatthew Ahrens 		vdev_queue_pending_add(vq, aio);
369fa9e4066Sahrens 
370fa9e4066Sahrens 		return (aio);
371fa9e4066Sahrens 	}
372fa9e4066Sahrens 
373a3f829aeSBill Moore 	ASSERT(fio->io_vdev_tree == t);
374ea8dc4b6Seschrock 	vdev_queue_io_remove(vq, fio);
375fa9e4066Sahrens 
376f94275ceSAdam Leventhal 	/*
377f94275ceSAdam Leventhal 	 * If the I/O is or was optional and therefore has no data, we need to
378f94275ceSAdam Leventhal 	 * simply discard it. We need to drop the vdev queue's lock to avoid a
379f94275ceSAdam Leventhal 	 * deadlock that we could encounter since this I/O will complete
380f94275ceSAdam Leventhal 	 * immediately.
381f94275ceSAdam Leventhal 	 */
382f94275ceSAdam Leventhal 	if (fio->io_flags & ZIO_FLAG_NODATA) {
383f94275ceSAdam Leventhal 		mutex_exit(&vq->vq_lock);
384f94275ceSAdam Leventhal 		zio_vdev_io_bypass(fio);
385f94275ceSAdam Leventhal 		zio_execute(fio);
386f94275ceSAdam Leventhal 		mutex_enter(&vq->vq_lock);
387f94275ceSAdam Leventhal 		goto again;
388f94275ceSAdam Leventhal 	}
389f94275ceSAdam Leventhal 
390*c3a66015SMatthew Ahrens 	vdev_queue_pending_add(vq, fio);
391fa9e4066Sahrens 
392fa9e4066Sahrens 	return (fio);
393fa9e4066Sahrens }
394fa9e4066Sahrens 
395fa9e4066Sahrens zio_t *
396fa9e4066Sahrens vdev_queue_io(zio_t *zio)
397fa9e4066Sahrens {
398fa9e4066Sahrens 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
399fa9e4066Sahrens 	zio_t *nio;
400fa9e4066Sahrens 
401fa9e4066Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
402fa9e4066Sahrens 
403fa9e4066Sahrens 	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
404fa9e4066Sahrens 		return (zio);
405fa9e4066Sahrens 
406fa9e4066Sahrens 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
407fa9e4066Sahrens 
408fa9e4066Sahrens 	if (zio->io_type == ZIO_TYPE_READ)
409fa9e4066Sahrens 		zio->io_vdev_tree = &vq->vq_read_tree;
410fa9e4066Sahrens 	else
411fa9e4066Sahrens 		zio->io_vdev_tree = &vq->vq_write_tree;
412fa9e4066Sahrens 
413fa9e4066Sahrens 	mutex_enter(&vq->vq_lock);
414fa9e4066Sahrens 
415283b8460SGeorge.Wilson 	zio->io_timestamp = ddi_get_lbolt64();
416283b8460SGeorge.Wilson 	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
417d3d50737SRafael Vanoni 	    zio->io_priority;
418fa9e4066Sahrens 
419ea8dc4b6Seschrock 	vdev_queue_io_add(vq, zio);
420fa9e4066Sahrens 
421e05725b1Sbonwick 	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
422fa9e4066Sahrens 
423fa9e4066Sahrens 	mutex_exit(&vq->vq_lock);
424fa9e4066Sahrens 
425e05725b1Sbonwick 	if (nio == NULL)
426e05725b1Sbonwick 		return (NULL);
427e05725b1Sbonwick 
428e05725b1Sbonwick 	if (nio->io_done == vdev_queue_agg_io_done) {
429e05725b1Sbonwick 		zio_nowait(nio);
430e05725b1Sbonwick 		return (NULL);
431e05725b1Sbonwick 	}
432fa9e4066Sahrens 
433e05725b1Sbonwick 	return (nio);
434fa9e4066Sahrens }
435fa9e4066Sahrens 
436fa9e4066Sahrens void
437fa9e4066Sahrens vdev_queue_io_done(zio_t *zio)
438fa9e4066Sahrens {
439fa9e4066Sahrens 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
440fa9e4066Sahrens 
441283b8460SGeorge.Wilson 	if (zio_injection_enabled)
442283b8460SGeorge.Wilson 		delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
443283b8460SGeorge.Wilson 
444fa9e4066Sahrens 	mutex_enter(&vq->vq_lock);
445fa9e4066Sahrens 
446*c3a66015SMatthew Ahrens 	vdev_queue_pending_remove(vq, zio);
447fa9e4066Sahrens 
448283b8460SGeorge.Wilson 	vq->vq_io_complete_ts = ddi_get_lbolt64();
449283b8460SGeorge.Wilson 	vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
450283b8460SGeorge.Wilson 
451e14bb325SJeff Bonwick 	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
452e14bb325SJeff Bonwick 		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
453fa9e4066Sahrens 		if (nio == NULL)
454fa9e4066Sahrens 			break;
455fa9e4066Sahrens 		mutex_exit(&vq->vq_lock);
456e05725b1Sbonwick 		if (nio->io_done == vdev_queue_agg_io_done) {
457e05725b1Sbonwick 			zio_nowait(nio);
458e05725b1Sbonwick 		} else {
459fa9e4066Sahrens 			zio_vdev_io_reissue(nio);
460e05725b1Sbonwick 			zio_execute(nio);
461e05725b1Sbonwick 		}
462fa9e4066Sahrens 		mutex_enter(&vq->vq_lock);
463fa9e4066Sahrens 	}
464fa9e4066Sahrens 
465fa9e4066Sahrens 	mutex_exit(&vq->vq_lock);
466fa9e4066Sahrens }
467