xref: /illumos-gate/usr/src/uts/common/fs/zfs/zio.c (revision 0a4e9518)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22d58459f4Sek  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27fa9e4066Sahrens 
28fa9e4066Sahrens #include <sys/zfs_context.h>
29ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h>
30fa9e4066Sahrens #include <sys/spa.h>
31fa9e4066Sahrens #include <sys/txg.h>
32fa9e4066Sahrens #include <sys/spa_impl.h>
33fa9e4066Sahrens #include <sys/vdev_impl.h>
34fa9e4066Sahrens #include <sys/zio_impl.h>
35fa9e4066Sahrens #include <sys/zio_compress.h>
36fa9e4066Sahrens #include <sys/zio_checksum.h>
37fa9e4066Sahrens 
38fa9e4066Sahrens /*
39fa9e4066Sahrens  * ==========================================================================
40fa9e4066Sahrens  * I/O priority table
41fa9e4066Sahrens  * ==========================================================================
42fa9e4066Sahrens  */
43fa9e4066Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
44fa9e4066Sahrens 	0,	/* ZIO_PRIORITY_NOW		*/
45fa9e4066Sahrens 	0,	/* ZIO_PRIORITY_SYNC_READ	*/
46fa9e4066Sahrens 	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
47fa9e4066Sahrens 	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
48fa9e4066Sahrens 	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
49fa9e4066Sahrens 	4,	/* ZIO_PRIORITY_FREE		*/
50fa9e4066Sahrens 	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
51fa9e4066Sahrens 	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
52fa9e4066Sahrens 	10,	/* ZIO_PRIORITY_RESILVER	*/
53fa9e4066Sahrens 	20,	/* ZIO_PRIORITY_SCRUB		*/
54fa9e4066Sahrens };
55fa9e4066Sahrens 
56fa9e4066Sahrens /*
57fa9e4066Sahrens  * ==========================================================================
58fa9e4066Sahrens  * I/O type descriptions
59fa9e4066Sahrens  * ==========================================================================
60fa9e4066Sahrens  */
61fa9e4066Sahrens char *zio_type_name[ZIO_TYPES] = {
62fa9e4066Sahrens 	"null", "read", "write", "free", "claim", "ioctl" };
63fa9e4066Sahrens 
64fa9e4066Sahrens /* At or above this size, force gang blocking - for testing */
65fa9e4066Sahrens uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
66fa9e4066Sahrens 
67d63d470bSgw /* Force an allocation failure when non-zero */
68d63d470bSgw uint16_t zio_zil_fail_shift = 0;
69*0a4e9518Sgw uint16_t zio_io_fail_shift = 0;
70*0a4e9518Sgw 
71*0a4e9518Sgw /* Enable/disable the write-retry logic */
72*0a4e9518Sgw int zio_write_retry = 1;
73*0a4e9518Sgw 
74*0a4e9518Sgw /* Taskq to handle reissuing of I/Os */
75*0a4e9518Sgw taskq_t *zio_taskq;
76*0a4e9518Sgw int zio_resume_threads = 4;
77d63d470bSgw 
/*
 * Sync-pass thresholds: each field names the sync pass after which the
 * corresponding behavior changes.
 */
typedef struct zio_sync_pass {
	int	zp_defer_free;		/* frees are deferred after this pass */
	int	zp_dontcompress;	/* compression stops after this pass */
	int	zp_rewrite;		/* new bps are rewritten after this pass */
} zio_sync_pass_t;

/* Default thresholds, listed in field order. */
zio_sync_pass_t zio_sync_pass = {
	/* zp_defer_free */	1,
	/* zp_dontcompress */	4,
	/* zp_rewrite */	1,
};
89fa9e4066Sahrens 
90*0a4e9518Sgw static boolean_t zio_io_should_fail(uint16_t);
91*0a4e9518Sgw 
92fa9e4066Sahrens /*
93fa9e4066Sahrens  * ==========================================================================
94fa9e4066Sahrens  * I/O kmem caches
95fa9e4066Sahrens  * ==========================================================================
96fa9e4066Sahrens  */
97ccae0b50Seschrock kmem_cache_t *zio_cache;
98fa9e4066Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
99ad23a2dbSjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
100ad23a2dbSjohansen 
101ad23a2dbSjohansen #ifdef _KERNEL
102ad23a2dbSjohansen extern vmem_t *zio_alloc_arena;
103ad23a2dbSjohansen #endif
104fa9e4066Sahrens 
/*
 * Determine if we are allowed to issue the I/O based on the pool state.
 * While the pool is in POOL_STATE_IO_FAILURE, block on spa_zio_cv until
 * the state changes.  The first spa_state test is made without holding
 * spa_zio_lock; the state is re-tested under the lock before each
 * cv_wait().
 *
 * The body is wrapped in do { } while (0) so that "ZIO_ENTER(spa);" is
 * exactly one statement and is safe inside an unbraced if/else.  The
 * argument is parenthesized so any pointer expression may be passed; it
 * is evaluated more than once, so it must have no side effects.
 */
#define	ZIO_ENTER(spa) do {						\
	if ((spa)->spa_state == POOL_STATE_IO_FAILURE) {		\
		mutex_enter(&(spa)->spa_zio_lock);			\
		while ((spa)->spa_state == POOL_STATE_IO_FAILURE)	\
			cv_wait(&(spa)->spa_zio_cv,			\
			    &(spa)->spa_zio_lock);			\
		mutex_exit(&(spa)->spa_zio_lock);			\
	}								\
} while (0)
118*0a4e9518Sgw 
/*
 * An allocating zio is one whose original pipeline was the full write
 * pipeline, or whose current pipeline contains the DVA allocate stage --
 * i.e. it either has that stage set now or will have it later in its
 * lifetime.
 */
#define	IO_IS_ALLOCATING(zio)						\
	((zio)->io_orig_pipeline == ZIO_WRITE_PIPELINE ||		\
	((zio)->io_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)))
126*0a4e9518Sgw 
/*
 * A rewrite zio can only be recognized by the gang pipeline stage being
 * present in its current pipeline.  Evaluates to the masked bit (non-zero
 * when set), not to a normalized 0/1.
 */
#define	IO_IS_REWRITE(zio)	\
	((zio)->io_pipeline & (1U << ZIO_STAGE_GANG_PIPELINE))
132*0a4e9518Sgw 
133fa9e4066Sahrens void
134fa9e4066Sahrens zio_init(void)
135fa9e4066Sahrens {
136fa9e4066Sahrens 	size_t c;
137ad23a2dbSjohansen 	vmem_t *data_alloc_arena = NULL;
138ad23a2dbSjohansen 
139ad23a2dbSjohansen #ifdef _KERNEL
140ad23a2dbSjohansen 	data_alloc_arena = zio_alloc_arena;
141ad23a2dbSjohansen #endif
142fa9e4066Sahrens 
143ccae0b50Seschrock 	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
144ccae0b50Seschrock 	    NULL, NULL, NULL, NULL, NULL, 0);
145ccae0b50Seschrock 
146fa9e4066Sahrens 	/*
147fa9e4066Sahrens 	 * For small buffers, we want a cache for each multiple of
148fa9e4066Sahrens 	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
149fa9e4066Sahrens 	 * for each quarter-power of 2.  For large buffers, we want
150fa9e4066Sahrens 	 * a cache for each multiple of PAGESIZE.
151fa9e4066Sahrens 	 */
152fa9e4066Sahrens 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
153fa9e4066Sahrens 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
154fa9e4066Sahrens 		size_t p2 = size;
155fa9e4066Sahrens 		size_t align = 0;
156fa9e4066Sahrens 
157fa9e4066Sahrens 		while (p2 & (p2 - 1))
158fa9e4066Sahrens 			p2 &= p2 - 1;
159fa9e4066Sahrens 
160fa9e4066Sahrens 		if (size <= 4 * SPA_MINBLOCKSIZE) {
161fa9e4066Sahrens 			align = SPA_MINBLOCKSIZE;
162fa9e4066Sahrens 		} else if (P2PHASE(size, PAGESIZE) == 0) {
163fa9e4066Sahrens 			align = PAGESIZE;
164fa9e4066Sahrens 		} else if (P2PHASE(size, p2 >> 2) == 0) {
165fa9e4066Sahrens 			align = p2 >> 2;
166fa9e4066Sahrens 		}
167fa9e4066Sahrens 
168fa9e4066Sahrens 		if (align != 0) {
169ad23a2dbSjohansen 			char name[36];
1705ad82045Snd 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
171fa9e4066Sahrens 			zio_buf_cache[c] = kmem_cache_create(name, size,
172a0965f35Sbonwick 			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
173ad23a2dbSjohansen 
174ad23a2dbSjohansen 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
175ad23a2dbSjohansen 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
176ad23a2dbSjohansen 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
177ad23a2dbSjohansen 			    KMC_NODEBUG);
178ad23a2dbSjohansen 
179fa9e4066Sahrens 			dprintf("creating cache for size %5lx align %5lx\n",
180fa9e4066Sahrens 			    size, align);
181fa9e4066Sahrens 		}
182fa9e4066Sahrens 	}
183fa9e4066Sahrens 
184fa9e4066Sahrens 	while (--c != 0) {
185fa9e4066Sahrens 		ASSERT(zio_buf_cache[c] != NULL);
186fa9e4066Sahrens 		if (zio_buf_cache[c - 1] == NULL)
187fa9e4066Sahrens 			zio_buf_cache[c - 1] = zio_buf_cache[c];
188ad23a2dbSjohansen 
189ad23a2dbSjohansen 		ASSERT(zio_data_buf_cache[c] != NULL);
190ad23a2dbSjohansen 		if (zio_data_buf_cache[c - 1] == NULL)
191ad23a2dbSjohansen 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
192fa9e4066Sahrens 	}
193ea8dc4b6Seschrock 
194*0a4e9518Sgw 	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
195*0a4e9518Sgw 	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
196*0a4e9518Sgw 
197ea8dc4b6Seschrock 	zio_inject_init();
198fa9e4066Sahrens }
199fa9e4066Sahrens 
200fa9e4066Sahrens void
201fa9e4066Sahrens zio_fini(void)
202fa9e4066Sahrens {
203fa9e4066Sahrens 	size_t c;
204fa9e4066Sahrens 	kmem_cache_t *last_cache = NULL;
205ad23a2dbSjohansen 	kmem_cache_t *last_data_cache = NULL;
206fa9e4066Sahrens 
207fa9e4066Sahrens 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
208fa9e4066Sahrens 		if (zio_buf_cache[c] != last_cache) {
209fa9e4066Sahrens 			last_cache = zio_buf_cache[c];
210fa9e4066Sahrens 			kmem_cache_destroy(zio_buf_cache[c]);
211fa9e4066Sahrens 		}
212fa9e4066Sahrens 		zio_buf_cache[c] = NULL;
213ad23a2dbSjohansen 
214ad23a2dbSjohansen 		if (zio_data_buf_cache[c] != last_data_cache) {
215ad23a2dbSjohansen 			last_data_cache = zio_data_buf_cache[c];
216ad23a2dbSjohansen 			kmem_cache_destroy(zio_data_buf_cache[c]);
217ad23a2dbSjohansen 		}
218ad23a2dbSjohansen 		zio_data_buf_cache[c] = NULL;
219fa9e4066Sahrens 	}
220ea8dc4b6Seschrock 
221*0a4e9518Sgw 	taskq_destroy(zio_taskq);
222*0a4e9518Sgw 
223ccae0b50Seschrock 	kmem_cache_destroy(zio_cache);
224ccae0b50Seschrock 
225ea8dc4b6Seschrock 	zio_inject_fini();
226fa9e4066Sahrens }
227fa9e4066Sahrens 
228fa9e4066Sahrens /*
229fa9e4066Sahrens  * ==========================================================================
230fa9e4066Sahrens  * Allocate and free I/O buffers
231fa9e4066Sahrens  * ==========================================================================
232fa9e4066Sahrens  */
233ad23a2dbSjohansen 
234ad23a2dbSjohansen /*
235ad23a2dbSjohansen  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
236ad23a2dbSjohansen  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
237ad23a2dbSjohansen  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
238ad23a2dbSjohansen  * excess / transient data in-core during a crashdump.
239ad23a2dbSjohansen  */
240fa9e4066Sahrens void *
241fa9e4066Sahrens zio_buf_alloc(size_t size)
242fa9e4066Sahrens {
243fa9e4066Sahrens 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
244fa9e4066Sahrens 
245fa9e4066Sahrens 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
246fa9e4066Sahrens 
247fa9e4066Sahrens 	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
248fa9e4066Sahrens }
249fa9e4066Sahrens 
250ad23a2dbSjohansen /*
251ad23a2dbSjohansen  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
252ad23a2dbSjohansen  * crashdump if the kernel panics.  This exists so that we will limit the amount
253ad23a2dbSjohansen  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
254ad23a2dbSjohansen  * of kernel heap dumped to disk when the kernel panics)
255ad23a2dbSjohansen  */
256ad23a2dbSjohansen void *
257ad23a2dbSjohansen zio_data_buf_alloc(size_t size)
258ad23a2dbSjohansen {
259ad23a2dbSjohansen 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
260ad23a2dbSjohansen 
261ad23a2dbSjohansen 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
262ad23a2dbSjohansen 
263ad23a2dbSjohansen 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
264ad23a2dbSjohansen }
265ad23a2dbSjohansen 
266fa9e4066Sahrens void
267fa9e4066Sahrens zio_buf_free(void *buf, size_t size)
268fa9e4066Sahrens {
269fa9e4066Sahrens 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
270fa9e4066Sahrens 
271fa9e4066Sahrens 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
272fa9e4066Sahrens 
273fa9e4066Sahrens 	kmem_cache_free(zio_buf_cache[c], buf);
274fa9e4066Sahrens }
275fa9e4066Sahrens 
276ad23a2dbSjohansen void
277ad23a2dbSjohansen zio_data_buf_free(void *buf, size_t size)
278ad23a2dbSjohansen {
279ad23a2dbSjohansen 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
280ad23a2dbSjohansen 
281ad23a2dbSjohansen 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
282ad23a2dbSjohansen 
283ad23a2dbSjohansen 	kmem_cache_free(zio_data_buf_cache[c], buf);
284ad23a2dbSjohansen }
285b3995adbSahrens 
286fa9e4066Sahrens /*
287fa9e4066Sahrens  * ==========================================================================
288fa9e4066Sahrens  * Push and pop I/O transform buffers
289fa9e4066Sahrens  * ==========================================================================
290fa9e4066Sahrens  */
291fa9e4066Sahrens static void
292fa9e4066Sahrens zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
293fa9e4066Sahrens {
294fa9e4066Sahrens 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
295fa9e4066Sahrens 
296fa9e4066Sahrens 	zt->zt_data = data;
297fa9e4066Sahrens 	zt->zt_size = size;
298fa9e4066Sahrens 	zt->zt_bufsize = bufsize;
299fa9e4066Sahrens 
300fa9e4066Sahrens 	zt->zt_next = zio->io_transform_stack;
301fa9e4066Sahrens 	zio->io_transform_stack = zt;
302fa9e4066Sahrens 
303fa9e4066Sahrens 	zio->io_data = data;
304fa9e4066Sahrens 	zio->io_size = size;
305fa9e4066Sahrens }
306fa9e4066Sahrens 
307fa9e4066Sahrens static void
308fa9e4066Sahrens zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
309fa9e4066Sahrens {
310fa9e4066Sahrens 	zio_transform_t *zt = zio->io_transform_stack;
311fa9e4066Sahrens 
312fa9e4066Sahrens 	*data = zt->zt_data;
313fa9e4066Sahrens 	*size = zt->zt_size;
314fa9e4066Sahrens 	*bufsize = zt->zt_bufsize;
315fa9e4066Sahrens 
316fa9e4066Sahrens 	zio->io_transform_stack = zt->zt_next;
317fa9e4066Sahrens 	kmem_free(zt, sizeof (zio_transform_t));
318fa9e4066Sahrens 
319fa9e4066Sahrens 	if ((zt = zio->io_transform_stack) != NULL) {
320fa9e4066Sahrens 		zio->io_data = zt->zt_data;
321fa9e4066Sahrens 		zio->io_size = zt->zt_size;
322fa9e4066Sahrens 	}
323fa9e4066Sahrens }
324fa9e4066Sahrens 
325fa9e4066Sahrens static void
326fa9e4066Sahrens zio_clear_transform_stack(zio_t *zio)
327fa9e4066Sahrens {
328fa9e4066Sahrens 	void *data;
329fa9e4066Sahrens 	uint64_t size, bufsize;
330fa9e4066Sahrens 
331fa9e4066Sahrens 	ASSERT(zio->io_transform_stack != NULL);
332fa9e4066Sahrens 
333fa9e4066Sahrens 	zio_pop_transform(zio, &data, &size, &bufsize);
334fa9e4066Sahrens 	while (zio->io_transform_stack != NULL) {
335fa9e4066Sahrens 		zio_buf_free(data, bufsize);
336fa9e4066Sahrens 		zio_pop_transform(zio, &data, &size, &bufsize);
337fa9e4066Sahrens 	}
338fa9e4066Sahrens }
339fa9e4066Sahrens 
340fa9e4066Sahrens /*
341fa9e4066Sahrens  * ==========================================================================
342fa9e4066Sahrens  * Create the various types of I/O (read, write, free)
343fa9e4066Sahrens  * ==========================================================================
344fa9e4066Sahrens  */
345fa9e4066Sahrens static zio_t *
346fa9e4066Sahrens zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
347fa9e4066Sahrens     void *data, uint64_t size, zio_done_func_t *done, void *private,
348fa9e4066Sahrens     zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
349fa9e4066Sahrens {
350fa9e4066Sahrens 	zio_t *zio;
351fa9e4066Sahrens 
352fa9e4066Sahrens 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
353fa9e4066Sahrens 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
354fa9e4066Sahrens 
355ccae0b50Seschrock 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
356ccae0b50Seschrock 	bzero(zio, sizeof (zio_t));
357fa9e4066Sahrens 	zio->io_parent = pio;
358fa9e4066Sahrens 	zio->io_spa = spa;
359fa9e4066Sahrens 	zio->io_txg = txg;
360fdb2e906Sek 	zio->io_flags = flags;
361fa9e4066Sahrens 	if (bp != NULL) {
362fa9e4066Sahrens 		zio->io_bp = bp;
363fa9e4066Sahrens 		zio->io_bp_copy = *bp;
364fa9e4066Sahrens 		zio->io_bp_orig = *bp;
365fdb2e906Sek 		if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata ||
366fdb2e906Sek 		    BP_GET_LEVEL(bp) != 0)
367fdb2e906Sek 			zio->io_flags |= ZIO_FLAG_METADATA;
368fa9e4066Sahrens 	}
369fa9e4066Sahrens 	zio->io_done = done;
370fa9e4066Sahrens 	zio->io_private = private;
371fa9e4066Sahrens 	zio->io_type = type;
372fa9e4066Sahrens 	zio->io_priority = priority;
373fa9e4066Sahrens 	zio->io_stage = stage;
374fa9e4066Sahrens 	zio->io_pipeline = pipeline;
375fa9e4066Sahrens 	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
376fa9e4066Sahrens 	zio->io_timestamp = lbolt64;
377fdb2e906Sek 	if (pio != NULL)
378fdb2e906Sek 		zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA);
3795ad82045Snd 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
380c25056deSgw 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
381fa9e4066Sahrens 	zio_push_transform(zio, data, size, size);
382fa9e4066Sahrens 
383b3995adbSahrens 	/*
384b3995adbSahrens 	 * Note on config lock:
385b3995adbSahrens 	 *
386b3995adbSahrens 	 * If CONFIG_HELD is set, then the caller already has the config
387b3995adbSahrens 	 * lock, so we don't need it for this io.
388b3995adbSahrens 	 *
389b3995adbSahrens 	 * We set CONFIG_GRABBED to indicate that we have grabbed the
390b3995adbSahrens 	 * config lock on behalf of this io, so it should be released
391b3995adbSahrens 	 * in zio_done.
392b3995adbSahrens 	 *
393b3995adbSahrens 	 * Unless CONFIG_HELD is set, we will grab the config lock for
394b3995adbSahrens 	 * any top-level (parent-less) io, *except* NULL top-level ios.
395b3995adbSahrens 	 * The NULL top-level ios rarely have any children, so we delay
396b3995adbSahrens 	 * grabbing the lock until the first child is added (but it is
397b3995adbSahrens 	 * still grabbed on behalf of the top-level i/o, so additional
398b3995adbSahrens 	 * children don't need to also grab it).  This greatly reduces
399b3995adbSahrens 	 * contention on the config lock.
400b3995adbSahrens 	 */
401fa9e4066Sahrens 	if (pio == NULL) {
402b3995adbSahrens 		if (type != ZIO_TYPE_NULL &&
403b3995adbSahrens 		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
404ea8dc4b6Seschrock 			spa_config_enter(zio->io_spa, RW_READER, zio);
405b3995adbSahrens 			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
406b3995adbSahrens 		}
407fa9e4066Sahrens 		zio->io_root = zio;
408fa9e4066Sahrens 	} else {
409fa9e4066Sahrens 		zio->io_root = pio->io_root;
410ea8dc4b6Seschrock 		if (!(flags & ZIO_FLAG_NOBOOKMARK))
411ea8dc4b6Seschrock 			zio->io_logical = pio->io_logical;
412fa9e4066Sahrens 		mutex_enter(&pio->io_lock);
413b3995adbSahrens 		if (pio->io_parent == NULL &&
414b3995adbSahrens 		    pio->io_type == ZIO_TYPE_NULL &&
415b3995adbSahrens 		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
416b3995adbSahrens 		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
417b3995adbSahrens 			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
418b3995adbSahrens 			spa_config_enter(zio->io_spa, RW_READER, pio);
419b3995adbSahrens 		}
420fa9e4066Sahrens 		if (stage < ZIO_STAGE_READY)
421fa9e4066Sahrens 			pio->io_children_notready++;
422fa9e4066Sahrens 		pio->io_children_notdone++;
423fa9e4066Sahrens 		zio->io_sibling_next = pio->io_child;
424fa9e4066Sahrens 		zio->io_sibling_prev = NULL;
425fa9e4066Sahrens 		if (pio->io_child != NULL)
426fa9e4066Sahrens 			pio->io_child->io_sibling_prev = zio;
427fa9e4066Sahrens 		pio->io_child = zio;
42844cd46caSbillm 		zio->io_ndvas = pio->io_ndvas;
429fa9e4066Sahrens 		mutex_exit(&pio->io_lock);
430fa9e4066Sahrens 	}
431fa9e4066Sahrens 
432*0a4e9518Sgw 	/*
433*0a4e9518Sgw 	 * Save off the original state incase we need to retry later.
434*0a4e9518Sgw 	 */
435*0a4e9518Sgw 	zio->io_orig_stage = zio->io_stage;
436*0a4e9518Sgw 	zio->io_orig_pipeline = zio->io_pipeline;
437*0a4e9518Sgw 	zio->io_orig_flags = zio->io_flags;
438*0a4e9518Sgw 
439fa9e4066Sahrens 	return (zio);
440fa9e4066Sahrens }
441fa9e4066Sahrens 
442*0a4e9518Sgw static void
443*0a4e9518Sgw zio_reset(zio_t *zio)
444*0a4e9518Sgw {
445*0a4e9518Sgw 	zio_clear_transform_stack(zio);
446*0a4e9518Sgw 
447*0a4e9518Sgw 	zio->io_flags = zio->io_orig_flags;
448*0a4e9518Sgw 	zio->io_stage = zio->io_orig_stage;
449*0a4e9518Sgw 	zio->io_pipeline = zio->io_orig_pipeline;
450*0a4e9518Sgw 	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
451*0a4e9518Sgw }
452*0a4e9518Sgw 
453fa9e4066Sahrens zio_t *
454fa9e4066Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
455fa9e4066Sahrens 	int flags)
456fa9e4066Sahrens {
457fa9e4066Sahrens 	zio_t *zio;
458fa9e4066Sahrens 
459fa9e4066Sahrens 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
460fa9e4066Sahrens 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
461fa9e4066Sahrens 	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);
462fa9e4066Sahrens 
463fa9e4066Sahrens 	return (zio);
464fa9e4066Sahrens }
465fa9e4066Sahrens 
466fa9e4066Sahrens zio_t *
467fa9e4066Sahrens zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
468fa9e4066Sahrens {
469fa9e4066Sahrens 	return (zio_null(NULL, spa, done, private, flags));
470fa9e4066Sahrens }
471fa9e4066Sahrens 
472fa9e4066Sahrens zio_t *
473fa9e4066Sahrens zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
474fa9e4066Sahrens     uint64_t size, zio_done_func_t *done, void *private,
475ea8dc4b6Seschrock     int priority, int flags, zbookmark_t *zb)
476fa9e4066Sahrens {
477fa9e4066Sahrens 	zio_t *zio;
478fa9e4066Sahrens 
479fa9e4066Sahrens 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
480fa9e4066Sahrens 
481*0a4e9518Sgw 	/*
482*0a4e9518Sgw 	 * If the user has specified that we allow I/Os to continue
483*0a4e9518Sgw 	 * then attempt to satisfy the read.
484*0a4e9518Sgw 	 */
485*0a4e9518Sgw 	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
486*0a4e9518Sgw 		ZIO_ENTER(spa);
487*0a4e9518Sgw 
488fa9e4066Sahrens 	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
489faafa6e3Sahrens 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
490faafa6e3Sahrens 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
491ea8dc4b6Seschrock 	zio->io_bookmark = *zb;
492ea8dc4b6Seschrock 
493ea8dc4b6Seschrock 	zio->io_logical = zio;
494fa9e4066Sahrens 
495fa9e4066Sahrens 	/*
496fa9e4066Sahrens 	 * Work off our copy of the bp so the caller can free it.
497fa9e4066Sahrens 	 */
498fa9e4066Sahrens 	zio->io_bp = &zio->io_bp_copy;
499fa9e4066Sahrens 
500fa9e4066Sahrens 	return (zio);
501fa9e4066Sahrens }
502fa9e4066Sahrens 
503fa9e4066Sahrens zio_t *
50444cd46caSbillm zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
505fa9e4066Sahrens     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
506c717a561Smaybee     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
507c717a561Smaybee     int flags, zbookmark_t *zb)
508fa9e4066Sahrens {
509fa9e4066Sahrens 	zio_t *zio;
510fa9e4066Sahrens 
511fa9e4066Sahrens 	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
512fa9e4066Sahrens 	    checksum < ZIO_CHECKSUM_FUNCTIONS);
513fa9e4066Sahrens 
514fa9e4066Sahrens 	ASSERT(compress >= ZIO_COMPRESS_OFF &&
515fa9e4066Sahrens 	    compress < ZIO_COMPRESS_FUNCTIONS);
516fa9e4066Sahrens 
517*0a4e9518Sgw 	ZIO_ENTER(spa);
518*0a4e9518Sgw 
519fa9e4066Sahrens 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
520faafa6e3Sahrens 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
521fa9e4066Sahrens 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
522fa9e4066Sahrens 
523c717a561Smaybee 	zio->io_ready = ready;
524c717a561Smaybee 
525ea8dc4b6Seschrock 	zio->io_bookmark = *zb;
526ea8dc4b6Seschrock 
527ea8dc4b6Seschrock 	zio->io_logical = zio;
528ea8dc4b6Seschrock 
529fa9e4066Sahrens 	zio->io_checksum = checksum;
530fa9e4066Sahrens 	zio->io_compress = compress;
53144cd46caSbillm 	zio->io_ndvas = ncopies;
532fa9e4066Sahrens 
533fa9e4066Sahrens 	if (compress != ZIO_COMPRESS_OFF)
534fa9e4066Sahrens 		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
535fa9e4066Sahrens 
536fa9e4066Sahrens 	if (bp->blk_birth != txg) {
537fa9e4066Sahrens 		/* XXX the bp usually (always?) gets re-zeroed later */
538fa9e4066Sahrens 		BP_ZERO(bp);
539fa9e4066Sahrens 		BP_SET_LSIZE(bp, size);
540fa9e4066Sahrens 		BP_SET_PSIZE(bp, size);
54144cd46caSbillm 	} else {
54244cd46caSbillm 		/* Make sure someone doesn't change their mind on overwrites */
54344cd46caSbillm 		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
54444cd46caSbillm 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
545fa9e4066Sahrens 	}
546fa9e4066Sahrens 
547fa9e4066Sahrens 	return (zio);
548fa9e4066Sahrens }
549fa9e4066Sahrens 
550fa9e4066Sahrens zio_t *
551fa9e4066Sahrens zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
552fa9e4066Sahrens     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
553ea8dc4b6Seschrock     zio_done_func_t *done, void *private, int priority, int flags,
554ea8dc4b6Seschrock     zbookmark_t *zb)
555fa9e4066Sahrens {
556fa9e4066Sahrens 	zio_t *zio;
557fa9e4066Sahrens 
558fa9e4066Sahrens 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
559faafa6e3Sahrens 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
560fa9e4066Sahrens 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
561fa9e4066Sahrens 
562ea8dc4b6Seschrock 	zio->io_bookmark = *zb;
563fa9e4066Sahrens 	zio->io_checksum = checksum;
564fa9e4066Sahrens 	zio->io_compress = ZIO_COMPRESS_OFF;
565fa9e4066Sahrens 
56644cd46caSbillm 	if (pio != NULL)
56744cd46caSbillm 		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
56844cd46caSbillm 
569fa9e4066Sahrens 	return (zio);
570fa9e4066Sahrens }
571fa9e4066Sahrens 
572*0a4e9518Sgw static void
573*0a4e9518Sgw zio_write_allocate_ready(zio_t *zio)
574*0a4e9518Sgw {
575*0a4e9518Sgw 	/* Free up the previous block */
576*0a4e9518Sgw 	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
577*0a4e9518Sgw 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
578*0a4e9518Sgw 		    &zio->io_bp_orig, NULL, NULL));
579*0a4e9518Sgw 	}
580*0a4e9518Sgw }
581*0a4e9518Sgw 
582fa9e4066Sahrens static zio_t *
583fa9e4066Sahrens zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
584fa9e4066Sahrens     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
585fa9e4066Sahrens     zio_done_func_t *done, void *private, int priority, int flags)
586fa9e4066Sahrens {
587fa9e4066Sahrens 	zio_t *zio;
588fa9e4066Sahrens 
589fa9e4066Sahrens 	BP_ZERO(bp);
590fa9e4066Sahrens 	BP_SET_LSIZE(bp, size);
591fa9e4066Sahrens 	BP_SET_PSIZE(bp, size);
592fa9e4066Sahrens 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
593fa9e4066Sahrens 
594fa9e4066Sahrens 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
595fa9e4066Sahrens 	    ZIO_TYPE_WRITE, priority, flags,
596fa9e4066Sahrens 	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
597fa9e4066Sahrens 
598fa9e4066Sahrens 	zio->io_checksum = checksum;
599fa9e4066Sahrens 	zio->io_compress = ZIO_COMPRESS_OFF;
600*0a4e9518Sgw 	zio->io_ready = zio_write_allocate_ready;
601fa9e4066Sahrens 
602fa9e4066Sahrens 	return (zio);
603fa9e4066Sahrens }
604fa9e4066Sahrens 
605fa9e4066Sahrens zio_t *
606fa9e4066Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
607fa9e4066Sahrens     zio_done_func_t *done, void *private)
608fa9e4066Sahrens {
609fa9e4066Sahrens 	zio_t *zio;
610fa9e4066Sahrens 
611fa9e4066Sahrens 	ASSERT(!BP_IS_HOLE(bp));
612fa9e4066Sahrens 
613fa9e4066Sahrens 	if (txg == spa->spa_syncing_txg &&
614fa9e4066Sahrens 	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
615fa9e4066Sahrens 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
616fa9e4066Sahrens 		return (zio_null(pio, spa, NULL, NULL, 0));
617fa9e4066Sahrens 	}
618fa9e4066Sahrens 
619fa9e4066Sahrens 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
620faafa6e3Sahrens 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
621fa9e4066Sahrens 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
622fa9e4066Sahrens 
623fa9e4066Sahrens 	zio->io_bp = &zio->io_bp_copy;
624fa9e4066Sahrens 
625fa9e4066Sahrens 	return (zio);
626fa9e4066Sahrens }
627fa9e4066Sahrens 
628fa9e4066Sahrens zio_t *
629fa9e4066Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
630fa9e4066Sahrens     zio_done_func_t *done, void *private)
631fa9e4066Sahrens {
632fa9e4066Sahrens 	zio_t *zio;
633fa9e4066Sahrens 
634fa9e4066Sahrens 	/*
635fa9e4066Sahrens 	 * A claim is an allocation of a specific block.  Claims are needed
636fa9e4066Sahrens 	 * to support immediate writes in the intent log.  The issue is that
637fa9e4066Sahrens 	 * immediate writes contain committed data, but in a txg that was
638fa9e4066Sahrens 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
639fa9e4066Sahrens 	 * the intent log claims all blocks that contain immediate write data
640fa9e4066Sahrens 	 * so that the SPA knows they're in use.
641fa9e4066Sahrens 	 *
642fa9e4066Sahrens 	 * All claims *must* be resolved in the first txg -- before the SPA
643fa9e4066Sahrens 	 * starts allocating blocks -- so that nothing is allocated twice.
644fa9e4066Sahrens 	 */
645fa9e4066Sahrens 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
646fa9e4066Sahrens 	ASSERT3U(spa_first_txg(spa), <=, txg);
647fa9e4066Sahrens 
648fa9e4066Sahrens 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
649fa9e4066Sahrens 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
650fa9e4066Sahrens 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
651fa9e4066Sahrens 
652fa9e4066Sahrens 	zio->io_bp = &zio->io_bp_copy;
653fa9e4066Sahrens 
654fa9e4066Sahrens 	return (zio);
655fa9e4066Sahrens }
656fa9e4066Sahrens 
657fa9e4066Sahrens zio_t *
658fa9e4066Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
659fa9e4066Sahrens     zio_done_func_t *done, void *private, int priority, int flags)
660fa9e4066Sahrens {
661fa9e4066Sahrens 	zio_t *zio;
662fa9e4066Sahrens 	int c;
663fa9e4066Sahrens 
664fa9e4066Sahrens 	if (vd->vdev_children == 0) {
665fa9e4066Sahrens 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
666fa9e4066Sahrens 		    ZIO_TYPE_IOCTL, priority, flags,
667fa9e4066Sahrens 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
668fa9e4066Sahrens 
669fa9e4066Sahrens 		zio->io_vd = vd;
670fa9e4066Sahrens 		zio->io_cmd = cmd;
671fa9e4066Sahrens 	} else {
672fa9e4066Sahrens 		zio = zio_null(pio, spa, NULL, NULL, flags);
673fa9e4066Sahrens 
674fa9e4066Sahrens 		for (c = 0; c < vd->vdev_children; c++)
675fa9e4066Sahrens 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
676fa9e4066Sahrens 			    done, private, priority, flags));
677fa9e4066Sahrens 	}
678fa9e4066Sahrens 
679fa9e4066Sahrens 	return (zio);
680fa9e4066Sahrens }
681fa9e4066Sahrens 
682fa9e4066Sahrens static void
683fa9e4066Sahrens zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
684fa9e4066Sahrens     int checksum)
685fa9e4066Sahrens {
686fa9e4066Sahrens 	ASSERT(vd->vdev_children == 0);
687fa9e4066Sahrens 
688fa9e4066Sahrens 	ASSERT(size <= SPA_MAXBLOCKSIZE);
689fa9e4066Sahrens 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
690fa9e4066Sahrens 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
691fa9e4066Sahrens 
692fa9e4066Sahrens 	ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
693fa9e4066Sahrens 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
694fa9e4066Sahrens 	ASSERT3U(offset + size, <=, vd->vdev_psize);
695fa9e4066Sahrens 
696fa9e4066Sahrens 	BP_ZERO(bp);
697fa9e4066Sahrens 
698fa9e4066Sahrens 	BP_SET_LSIZE(bp, size);
699fa9e4066Sahrens 	BP_SET_PSIZE(bp, size);
700fa9e4066Sahrens 
701fa9e4066Sahrens 	BP_SET_CHECKSUM(bp, checksum);
702fa9e4066Sahrens 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
703fa9e4066Sahrens 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
704fa9e4066Sahrens 
705fa9e4066Sahrens 	if (checksum != ZIO_CHECKSUM_OFF)
706fa9e4066Sahrens 		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
707fa9e4066Sahrens }
708fa9e4066Sahrens 
709fa9e4066Sahrens zio_t *
710fa9e4066Sahrens zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
711fa9e4066Sahrens     void *data, int checksum, zio_done_func_t *done, void *private,
712fa9e4066Sahrens     int priority, int flags)
713fa9e4066Sahrens {
714fa9e4066Sahrens 	zio_t *zio;
715fa9e4066Sahrens 	blkptr_t blk;
716fa9e4066Sahrens 
717*0a4e9518Sgw 	ZIO_ENTER(vd->vdev_spa);
718*0a4e9518Sgw 
719fa9e4066Sahrens 	zio_phys_bp_init(vd, &blk, offset, size, checksum);
720fa9e4066Sahrens 
721fa9e4066Sahrens 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
722fa9e4066Sahrens 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
723fa9e4066Sahrens 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
724fa9e4066Sahrens 
725fa9e4066Sahrens 	zio->io_vd = vd;
726fa9e4066Sahrens 	zio->io_offset = offset;
727fa9e4066Sahrens 
728fa9e4066Sahrens 	/*
729fa9e4066Sahrens 	 * Work off our copy of the bp so the caller can free it.
730fa9e4066Sahrens 	 */
731fa9e4066Sahrens 	zio->io_bp = &zio->io_bp_copy;
732fa9e4066Sahrens 
733fa9e4066Sahrens 	return (zio);
734fa9e4066Sahrens }
735fa9e4066Sahrens 
736fa9e4066Sahrens zio_t *
737fa9e4066Sahrens zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
738fa9e4066Sahrens     void *data, int checksum, zio_done_func_t *done, void *private,
739fa9e4066Sahrens     int priority, int flags)
740fa9e4066Sahrens {
741fa9e4066Sahrens 	zio_block_tail_t *zbt;
742fa9e4066Sahrens 	void *wbuf;
743fa9e4066Sahrens 	zio_t *zio;
744fa9e4066Sahrens 	blkptr_t blk;
745fa9e4066Sahrens 
746*0a4e9518Sgw 	ZIO_ENTER(vd->vdev_spa);
747*0a4e9518Sgw 
748fa9e4066Sahrens 	zio_phys_bp_init(vd, &blk, offset, size, checksum);
749fa9e4066Sahrens 
750fa9e4066Sahrens 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
751fa9e4066Sahrens 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
752fa9e4066Sahrens 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
753fa9e4066Sahrens 
754fa9e4066Sahrens 	zio->io_vd = vd;
755fa9e4066Sahrens 	zio->io_offset = offset;
756fa9e4066Sahrens 
757fa9e4066Sahrens 	zio->io_bp = &zio->io_bp_copy;
758fa9e4066Sahrens 	zio->io_checksum = checksum;
759fa9e4066Sahrens 
760fa9e4066Sahrens 	if (zio_checksum_table[checksum].ci_zbt) {
761fa9e4066Sahrens 		/*
762fa9e4066Sahrens 		 * zbt checksums are necessarily destructive -- they modify
763fa9e4066Sahrens 		 * one word of the write buffer to hold the verifier/checksum.
764fa9e4066Sahrens 		 * Therefore, we must make a local copy in case the data is
765fa9e4066Sahrens 		 * being written to multiple places.
766fa9e4066Sahrens 		 */
767fa9e4066Sahrens 		wbuf = zio_buf_alloc(size);
768fa9e4066Sahrens 		bcopy(data, wbuf, size);
769fa9e4066Sahrens 		zio_push_transform(zio, wbuf, size, size);
770fa9e4066Sahrens 
771fa9e4066Sahrens 		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
772fa9e4066Sahrens 		zbt->zbt_cksum = blk.blk_cksum;
773fa9e4066Sahrens 	}
774fa9e4066Sahrens 
775fa9e4066Sahrens 	return (zio);
776fa9e4066Sahrens }
777fa9e4066Sahrens 
778fa9e4066Sahrens /*
779fa9e4066Sahrens  * Create a child I/O to do some work for us.  It has no associated bp.
780fa9e4066Sahrens  */
781fa9e4066Sahrens zio_t *
782fa9e4066Sahrens zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
783fa9e4066Sahrens 	void *data, uint64_t size, int type, int priority, int flags,
784fa9e4066Sahrens 	zio_done_func_t *done, void *private)
785fa9e4066Sahrens {
786fa9e4066Sahrens 	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
787fa9e4066Sahrens 	zio_t *cio;
788fa9e4066Sahrens 
789fa9e4066Sahrens 	if (type == ZIO_TYPE_READ && bp != NULL) {
790fa9e4066Sahrens 		/*
791fa9e4066Sahrens 		 * If we have the bp, then the child should perform the
792fa9e4066Sahrens 		 * checksum and the parent need not.  This pushes error
793fa9e4066Sahrens 		 * detection as close to the leaves as possible and
794fa9e4066Sahrens 		 * eliminates redundant checksums in the interior nodes.
795fa9e4066Sahrens 		 */
796fa9e4066Sahrens 		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
797fa9e4066Sahrens 		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
798fa9e4066Sahrens 	}
799fa9e4066Sahrens 
800fa9e4066Sahrens 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
801fa9e4066Sahrens 	    done, private, type, priority,
802fa9e4066Sahrens 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
80344cd46caSbillm 	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
804fa9e4066Sahrens 
805fa9e4066Sahrens 	cio->io_vd = vd;
806fa9e4066Sahrens 	cio->io_offset = offset;
807fa9e4066Sahrens 
808fa9e4066Sahrens 	return (cio);
809fa9e4066Sahrens }
810fa9e4066Sahrens 
811fa9e4066Sahrens /*
812fa9e4066Sahrens  * ==========================================================================
813fa9e4066Sahrens  * Initiate I/O, either sync or async
814fa9e4066Sahrens  * ==========================================================================
815fa9e4066Sahrens  */
816fa9e4066Sahrens int
817fa9e4066Sahrens zio_wait(zio_t *zio)
818fa9e4066Sahrens {
819fa9e4066Sahrens 	int error;
820fa9e4066Sahrens 
821fa9e4066Sahrens 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
822fa9e4066Sahrens 
823fa9e4066Sahrens 	zio->io_waiter = curthread;
824fa9e4066Sahrens 
825fa9e4066Sahrens 	zio_next_stage_async(zio);
826fa9e4066Sahrens 
827fa9e4066Sahrens 	mutex_enter(&zio->io_lock);
828fa9e4066Sahrens 	while (zio->io_stalled != ZIO_STAGE_DONE)
829fa9e4066Sahrens 		cv_wait(&zio->io_cv, &zio->io_lock);
830fa9e4066Sahrens 	mutex_exit(&zio->io_lock);
831fa9e4066Sahrens 
832fa9e4066Sahrens 	error = zio->io_error;
8335ad82045Snd 	mutex_destroy(&zio->io_lock);
834c25056deSgw 	cv_destroy(&zio->io_cv);
835ccae0b50Seschrock 	kmem_cache_free(zio_cache, zio);
836fa9e4066Sahrens 
837fa9e4066Sahrens 	return (error);
838fa9e4066Sahrens }
839fa9e4066Sahrens 
840fa9e4066Sahrens void
841fa9e4066Sahrens zio_nowait(zio_t *zio)
842fa9e4066Sahrens {
843fa9e4066Sahrens 	zio_next_stage_async(zio);
844fa9e4066Sahrens }
845fa9e4066Sahrens 
846fa9e4066Sahrens /*
847fa9e4066Sahrens  * ==========================================================================
848fa9e4066Sahrens  * I/O pipeline interlocks: parent/child dependency scoreboarding
849fa9e4066Sahrens  * ==========================================================================
850fa9e4066Sahrens  */
851fa9e4066Sahrens static void
852fa9e4066Sahrens zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
853fa9e4066Sahrens {
854fa9e4066Sahrens 	mutex_enter(&zio->io_lock);
855fa9e4066Sahrens 	if (*countp == 0) {
856fa9e4066Sahrens 		ASSERT(zio->io_stalled == 0);
857fa9e4066Sahrens 		mutex_exit(&zio->io_lock);
858fa9e4066Sahrens 		zio_next_stage(zio);
859fa9e4066Sahrens 	} else {
860fa9e4066Sahrens 		zio->io_stalled = stage;
861fa9e4066Sahrens 		mutex_exit(&zio->io_lock);
862fa9e4066Sahrens 	}
863fa9e4066Sahrens }
864fa9e4066Sahrens 
865fa9e4066Sahrens static void
866fa9e4066Sahrens zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
867fa9e4066Sahrens {
868fa9e4066Sahrens 	zio_t *pio = zio->io_parent;
869fa9e4066Sahrens 
870fa9e4066Sahrens 	mutex_enter(&pio->io_lock);
871fa9e4066Sahrens 	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
872fa9e4066Sahrens 		pio->io_error = zio->io_error;
873*0a4e9518Sgw 	ASSERT3U(*countp, >, 0);
874fa9e4066Sahrens 	if (--*countp == 0 && pio->io_stalled == stage) {
875fa9e4066Sahrens 		pio->io_stalled = 0;
876fa9e4066Sahrens 		mutex_exit(&pio->io_lock);
877fa9e4066Sahrens 		zio_next_stage_async(pio);
878fa9e4066Sahrens 	} else {
879fa9e4066Sahrens 		mutex_exit(&pio->io_lock);
880fa9e4066Sahrens 	}
881fa9e4066Sahrens }
882fa9e4066Sahrens 
883fa9e4066Sahrens static void
884fa9e4066Sahrens zio_wait_children_ready(zio_t *zio)
885fa9e4066Sahrens {
886fa9e4066Sahrens 	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
887fa9e4066Sahrens 	    &zio->io_children_notready);
888fa9e4066Sahrens }
889fa9e4066Sahrens 
890fa9e4066Sahrens void
891fa9e4066Sahrens zio_wait_children_done(zio_t *zio)
892fa9e4066Sahrens {
893fa9e4066Sahrens 	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
894fa9e4066Sahrens 	    &zio->io_children_notdone);
895fa9e4066Sahrens }
896fa9e4066Sahrens 
897*0a4e9518Sgw static void
898*0a4e9518Sgw zio_read_init(zio_t *zio)
899*0a4e9518Sgw {
900*0a4e9518Sgw 	if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) {
901*0a4e9518Sgw 		uint64_t csize = BP_GET_PSIZE(zio->io_bp);
902*0a4e9518Sgw 		void *cbuf = zio_buf_alloc(csize);
903*0a4e9518Sgw 
904*0a4e9518Sgw 		zio_push_transform(zio, cbuf, csize, csize);
905*0a4e9518Sgw 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
906*0a4e9518Sgw 	}
907*0a4e9518Sgw 
908*0a4e9518Sgw 	if (BP_IS_GANG(zio->io_bp)) {
909*0a4e9518Sgw 		uint64_t gsize = SPA_GANGBLOCKSIZE;
910*0a4e9518Sgw 		void *gbuf = zio_buf_alloc(gsize);
911*0a4e9518Sgw 
912*0a4e9518Sgw 		zio_push_transform(zio, gbuf, gsize, gsize);
913*0a4e9518Sgw 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
914*0a4e9518Sgw 	}
915*0a4e9518Sgw 	zio_next_stage(zio);
916*0a4e9518Sgw }
917*0a4e9518Sgw 
918fa9e4066Sahrens static void
919fa9e4066Sahrens zio_ready(zio_t *zio)
920fa9e4066Sahrens {
921fa9e4066Sahrens 	zio_t *pio = zio->io_parent;
922fa9e4066Sahrens 
923c717a561Smaybee 	if (zio->io_ready)
924c717a561Smaybee 		zio->io_ready(zio);
925c717a561Smaybee 
926fa9e4066Sahrens 	if (pio != NULL)
927fa9e4066Sahrens 		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
928fa9e4066Sahrens 		    &pio->io_children_notready);
929fa9e4066Sahrens 
930fa9e4066Sahrens 	if (zio->io_bp)
931fa9e4066Sahrens 		zio->io_bp_copy = *zio->io_bp;
932fa9e4066Sahrens 
933fa9e4066Sahrens 	zio_next_stage(zio);
934fa9e4066Sahrens }
935fa9e4066Sahrens 
936fa9e4066Sahrens static void
937*0a4e9518Sgw zio_vdev_retry_io(zio_t *zio)
938fa9e4066Sahrens {
939fa9e4066Sahrens 	zio_t *pio = zio->io_parent;
940*0a4e9518Sgw 
941*0a4e9518Sgw 	/*
942*0a4e9518Sgw 	 * Preserve the failed bp so that the io_ready() callback can
943*0a4e9518Sgw 	 * update the accounting accordingly. The callback will also be
944*0a4e9518Sgw 	 * responsible for freeing the previously allocated block, if one
945*0a4e9518Sgw 	 * exists.
946*0a4e9518Sgw 	 */
947*0a4e9518Sgw 	zio->io_bp_orig = *zio->io_bp;
948*0a4e9518Sgw 
949*0a4e9518Sgw 	/*
950*0a4e9518Sgw 	 * We must zero out the old DVA and blk_birth before reallocating
951*0a4e9518Sgw 	 * the bp. We don't want to do this if this is a rewrite however.
952*0a4e9518Sgw 	 */
953*0a4e9518Sgw 	if (!IO_IS_REWRITE(zio)) {
954*0a4e9518Sgw 		BP_ZERO_DVAS(zio->io_bp);
955*0a4e9518Sgw 	}
956*0a4e9518Sgw 
957*0a4e9518Sgw 	zio_reset(zio);
958*0a4e9518Sgw 
959*0a4e9518Sgw 	if (pio) {
960*0a4e9518Sgw 		/*
961*0a4e9518Sgw 		 * Let the parent know that we will
962*0a4e9518Sgw 		 * re-alloc the write (=> new bp info).
963*0a4e9518Sgw 		 */
964*0a4e9518Sgw 		mutex_enter(&pio->io_lock);
965*0a4e9518Sgw 		pio->io_children_notready++;
966*0a4e9518Sgw 
967*0a4e9518Sgw 		/*
968*0a4e9518Sgw 		 * If the parent I/O is still in the open stage, then
969*0a4e9518Sgw 		 * don't bother telling it to retry since it hasn't
970*0a4e9518Sgw 		 * progressed far enough for it to care.
971*0a4e9518Sgw 		 */
972*0a4e9518Sgw 		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
973*0a4e9518Sgw 			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
974*0a4e9518Sgw 
975*0a4e9518Sgw 		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE);
976*0a4e9518Sgw 		mutex_exit(&pio->io_lock);
977*0a4e9518Sgw 	}
978*0a4e9518Sgw 
979*0a4e9518Sgw 	/*
980*0a4e9518Sgw 	 * We are getting ready to process the retry request so clear
981*0a4e9518Sgw 	 * the flag and the zio's current error status.
982*0a4e9518Sgw 	 */
983*0a4e9518Sgw 	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
984*0a4e9518Sgw 	zio->io_error = 0;
985*0a4e9518Sgw 	zio_next_stage_async(zio);
986*0a4e9518Sgw }
987*0a4e9518Sgw 
988*0a4e9518Sgw int
989*0a4e9518Sgw zio_vdev_resume_io(spa_t *spa)
990*0a4e9518Sgw {
991*0a4e9518Sgw 	zio_t *zio;
992*0a4e9518Sgw 
993*0a4e9518Sgw 	mutex_enter(&spa->spa_zio_lock);
994*0a4e9518Sgw 
995*0a4e9518Sgw 	/*
996*0a4e9518Sgw 	 * Probe all of vdevs that have experienced an I/O error.
997*0a4e9518Sgw 	 * If we are still unable to verify the integrity of the vdev
998*0a4e9518Sgw 	 * then we prevent the resume from proceeeding.
999*0a4e9518Sgw 	 */
1000*0a4e9518Sgw 	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
1001*0a4e9518Sgw 	    zio = list_next(&spa->spa_zio_list, zio)) {
1002*0a4e9518Sgw 		int error = 0;
1003*0a4e9518Sgw 
1004*0a4e9518Sgw 		/* We only care about I/Os that must succeed */
1005*0a4e9518Sgw 		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
1006*0a4e9518Sgw 			continue;
1007*0a4e9518Sgw 		error = vdev_probe(zio->io_vd);
1008*0a4e9518Sgw 		if (error) {
1009*0a4e9518Sgw 			mutex_exit(&spa->spa_zio_lock);
1010*0a4e9518Sgw 			return (error);
1011*0a4e9518Sgw 		}
1012*0a4e9518Sgw 	}
1013*0a4e9518Sgw 
1014*0a4e9518Sgw 	/*
1015*0a4e9518Sgw 	 * Clear the vdev stats so that I/O can flow.
1016*0a4e9518Sgw 	 */
1017*0a4e9518Sgw 	vdev_clear(spa, NULL, B_FALSE);
1018*0a4e9518Sgw 
1019*0a4e9518Sgw 	spa->spa_state = POOL_STATE_ACTIVE;
1020*0a4e9518Sgw 	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
1021*0a4e9518Sgw 		list_remove(&spa->spa_zio_list, zio);
1022*0a4e9518Sgw 		zio->io_error = 0;
1023*0a4e9518Sgw 
1024*0a4e9518Sgw 		/*
1025*0a4e9518Sgw 		 * If we are resuming an allocating I/O then we force it
1026*0a4e9518Sgw 		 * to retry and let it resume operation where it left off.
1027*0a4e9518Sgw 		 * Otherwise, go back to the ready stage and pick up from
1028*0a4e9518Sgw 		 * there.
1029*0a4e9518Sgw 		 */
1030*0a4e9518Sgw 		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
1031*0a4e9518Sgw 			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
1032*0a4e9518Sgw 			zio->io_stage--;
1033*0a4e9518Sgw 		} else {
1034*0a4e9518Sgw 			zio->io_stage = ZIO_STAGE_READY;
1035*0a4e9518Sgw 		}
1036*0a4e9518Sgw 
1037*0a4e9518Sgw 		(void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async,
1038*0a4e9518Sgw 		    zio, TQ_SLEEP);
1039*0a4e9518Sgw 	}
1040*0a4e9518Sgw 	mutex_exit(&spa->spa_zio_lock);
1041*0a4e9518Sgw 
1042*0a4e9518Sgw 	/*
1043*0a4e9518Sgw 	 * Wait for the taskqs to finish and recheck the pool state since
1044*0a4e9518Sgw 	 * it's possible that a resumed I/O has failed again.
1045*0a4e9518Sgw 	 */
1046*0a4e9518Sgw 	taskq_wait(zio_taskq);
1047*0a4e9518Sgw 	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
1048*0a4e9518Sgw 		return (EIO);
1049*0a4e9518Sgw 
1050*0a4e9518Sgw 	mutex_enter(&spa->spa_zio_lock);
1051*0a4e9518Sgw 	cv_broadcast(&spa->spa_zio_cv);
1052*0a4e9518Sgw 	mutex_exit(&spa->spa_zio_lock);
1053*0a4e9518Sgw 
1054*0a4e9518Sgw 	return (0);
1055*0a4e9518Sgw }
1056*0a4e9518Sgw 
1057*0a4e9518Sgw static void
1058*0a4e9518Sgw zio_vdev_suspend_io(zio_t *zio)
1059*0a4e9518Sgw {
1060*0a4e9518Sgw 	spa_t *spa = zio->io_spa;
1061*0a4e9518Sgw 
1062*0a4e9518Sgw 	/*
1063*0a4e9518Sgw 	 * We've experienced an unrecoverable failure so
1064*0a4e9518Sgw 	 * set the pool state accordingly and queue all
1065*0a4e9518Sgw 	 * failed IOs.
1066*0a4e9518Sgw 	 */
1067*0a4e9518Sgw 	spa->spa_state = POOL_STATE_IO_FAILURE;
1068*0a4e9518Sgw 
1069*0a4e9518Sgw 	mutex_enter(&spa->spa_zio_lock);
1070*0a4e9518Sgw 	list_insert_tail(&spa->spa_zio_list, zio);
1071*0a4e9518Sgw 
1072*0a4e9518Sgw #ifndef _KERNEL
1073*0a4e9518Sgw 	/* Used to notify ztest that the pool has suspended */
1074*0a4e9518Sgw 	cv_broadcast(&spa->spa_zio_cv);
1075*0a4e9518Sgw #endif
1076*0a4e9518Sgw 	mutex_exit(&spa->spa_zio_lock);
1077*0a4e9518Sgw }
1078*0a4e9518Sgw 
1079*0a4e9518Sgw static void
1080*0a4e9518Sgw zio_assess(zio_t *zio)
1081*0a4e9518Sgw {
1082fa9e4066Sahrens 	spa_t *spa = zio->io_spa;
1083fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
1084fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
1085fa9e4066Sahrens 
1086fa9e4066Sahrens 	ASSERT(zio->io_children_notready == 0);
1087fa9e4066Sahrens 	ASSERT(zio->io_children_notdone == 0);
1088fa9e4066Sahrens 
1089fa9e4066Sahrens 	if (bp != NULL) {
1090fa9e4066Sahrens 		ASSERT(bp->blk_pad[0] == 0);
1091fa9e4066Sahrens 		ASSERT(bp->blk_pad[1] == 0);
1092fa9e4066Sahrens 		ASSERT(bp->blk_pad[2] == 0);
1093fa9e4066Sahrens 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
1094fa9e4066Sahrens 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
109544cd46caSbillm 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
1096fa9e4066Sahrens 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
109744cd46caSbillm 			if (zio->io_ndvas != 0)
109844cd46caSbillm 				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
109944cd46caSbillm 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
110044cd46caSbillm 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
110144cd46caSbillm 		}
1102fa9e4066Sahrens 	}
1103fa9e4066Sahrens 
1104*0a4e9518Sgw 	/*
1105*0a4e9518Sgw 	 * Some child I/O has indicated that a retry is necessary, so
1106*0a4e9518Sgw 	 * we set an error on the I/O and let the logic below do the
1107*0a4e9518Sgw 	 * rest.
1108*0a4e9518Sgw 	 */
1109*0a4e9518Sgw 	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
1110*0a4e9518Sgw 		zio->io_error = ERESTART;
1111*0a4e9518Sgw 
1112fa9e4066Sahrens 	if (vd != NULL)
1113fa9e4066Sahrens 		vdev_stat_update(zio);
1114fa9e4066Sahrens 
1115fa9e4066Sahrens 	if (zio->io_error) {
1116ea8dc4b6Seschrock 		/*
1117ea8dc4b6Seschrock 		 * If this I/O is attached to a particular vdev,
1118ea8dc4b6Seschrock 		 * generate an error message describing the I/O failure
1119ea8dc4b6Seschrock 		 * at the block level.  We ignore these errors if the
1120ea8dc4b6Seschrock 		 * device is currently unavailable.
1121ea8dc4b6Seschrock 		 */
1122ecc2d604Sbonwick 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
1123*0a4e9518Sgw 			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
1124ea8dc4b6Seschrock 
1125ea8dc4b6Seschrock 		if ((zio->io_error == EIO ||
1126ea8dc4b6Seschrock 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
1127ea8dc4b6Seschrock 		    zio->io_logical == zio) {
1128ea8dc4b6Seschrock 			/*
1129ea8dc4b6Seschrock 			 * For root I/O requests, tell the SPA to log the error
1130ea8dc4b6Seschrock 			 * appropriately.  Also, generate a logical data
1131ea8dc4b6Seschrock 			 * ereport.
1132ea8dc4b6Seschrock 			 */
1133*0a4e9518Sgw 			spa_log_error(spa, zio);
1134ea8dc4b6Seschrock 
1135*0a4e9518Sgw 			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
1136*0a4e9518Sgw 			    0, 0);
1137ea8dc4b6Seschrock 		}
1138fa9e4066Sahrens 
1139ea8dc4b6Seschrock 		/*
1140*0a4e9518Sgw 		 * If we are an allocating I/O then we retry on another
1141*0a4e9518Sgw 		 * vdev unless the pool is out of space.  We handle this
1142*0a4e9518Sgw 		 * condition based on the spa's failmode property.
1143*0a4e9518Sgw 		 */
1144*0a4e9518Sgw 		if (zio_write_retry && zio->io_error != ENOSPC &&
1145*0a4e9518Sgw 		    IO_IS_ALLOCATING(zio) &&
1146*0a4e9518Sgw 		    zio->io_flags & ZIO_FLAG_WRITE_RETRY) {
1147*0a4e9518Sgw 			zio_vdev_retry_io(zio);
1148*0a4e9518Sgw 			return;
1149*0a4e9518Sgw 		}
1150*0a4e9518Sgw 		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
1151*0a4e9518Sgw 
1152*0a4e9518Sgw 		/*
1153*0a4e9518Sgw 		 * For I/O requests that cannot fail, we carry out
1154*0a4e9518Sgw 		 * the requested behavior based on the failmode pool
1155*0a4e9518Sgw 		 * property.
1156*0a4e9518Sgw 		 *
1157*0a4e9518Sgw 		 * XXX - Need to differentiate between an ENOSPC as
1158*0a4e9518Sgw 		 * a result of vdev failures vs. a full pool.
1159ea8dc4b6Seschrock 		 */
1160ea8dc4b6Seschrock 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
1161d58459f4Sek 			char *blkbuf;
1162d58459f4Sek 
1163*0a4e9518Sgw #ifdef ZFS_DEBUG
1164d58459f4Sek 			blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
1165d58459f4Sek 			if (blkbuf) {
1166d58459f4Sek 				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
1167d58459f4Sek 				    bp ? bp : &zio->io_bp_copy);
1168d58459f4Sek 			}
1169*0a4e9518Sgw 			cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p "
1170*0a4e9518Sgw 			    "%s): error %d", zio->io_error == ECKSUM ?
1171ea8dc4b6Seschrock 			    "bad checksum" : "I/O failure",
1172ea8dc4b6Seschrock 			    zio_type_name[zio->io_type],
1173ea8dc4b6Seschrock 			    vdev_description(vd),
1174ea8dc4b6Seschrock 			    (u_longlong_t)zio->io_offset,
1175*0a4e9518Sgw 			    (void *)zio, blkbuf ? blkbuf : "", zio->io_error);
1176*0a4e9518Sgw #endif
1177*0a4e9518Sgw 
1178*0a4e9518Sgw 			if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
1179*0a4e9518Sgw 				fm_panic("Pool '%s' has encountered an "
1180*0a4e9518Sgw 				    "uncorrectable I/O failure and the "
1181*0a4e9518Sgw 				    "failure mode property for this pool "
1182*0a4e9518Sgw 				    "is set to panic.", spa_name(spa));
1183*0a4e9518Sgw 			} else {
1184*0a4e9518Sgw 				cmn_err(CE_WARN, "Pool '%s' has encountered "
1185*0a4e9518Sgw 				    "an uncorrectable I/O error. Manual "
1186*0a4e9518Sgw 				    "intervention is required.",
1187*0a4e9518Sgw 				    spa_name(spa));
1188*0a4e9518Sgw 				zio_vdev_suspend_io(zio);
1189*0a4e9518Sgw 			}
1190*0a4e9518Sgw 			return;
1191ea8dc4b6Seschrock 		}
1192fa9e4066Sahrens 	}
1193*0a4e9518Sgw 	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
1194*0a4e9518Sgw 	ASSERT(zio->io_children_notready == 0);
1195*0a4e9518Sgw 	zio_next_stage(zio);
1196*0a4e9518Sgw }
1197*0a4e9518Sgw 
1198*0a4e9518Sgw static void
1199*0a4e9518Sgw zio_done(zio_t *zio)
1200*0a4e9518Sgw {
1201*0a4e9518Sgw 	zio_t *pio = zio->io_parent;
1202*0a4e9518Sgw 	spa_t *spa = zio->io_spa;
1203*0a4e9518Sgw 
1204*0a4e9518Sgw 	ASSERT(zio->io_children_notready == 0);
1205*0a4e9518Sgw 	ASSERT(zio->io_children_notdone == 0);
1206*0a4e9518Sgw 
1207fa9e4066Sahrens 	zio_clear_transform_stack(zio);
1208fa9e4066Sahrens 
1209fa9e4066Sahrens 	if (zio->io_done)
1210fa9e4066Sahrens 		zio->io_done(zio);
1211fa9e4066Sahrens 
1212fa9e4066Sahrens 	ASSERT(zio->io_delegate_list == NULL);
1213fa9e4066Sahrens 	ASSERT(zio->io_delegate_next == NULL);
1214fa9e4066Sahrens 
1215fa9e4066Sahrens 	if (pio != NULL) {
1216fa9e4066Sahrens 		zio_t *next, *prev;
1217fa9e4066Sahrens 
1218fa9e4066Sahrens 		mutex_enter(&pio->io_lock);
1219fa9e4066Sahrens 		next = zio->io_sibling_next;
1220fa9e4066Sahrens 		prev = zio->io_sibling_prev;
1221fa9e4066Sahrens 		if (next != NULL)
1222fa9e4066Sahrens 			next->io_sibling_prev = prev;
1223fa9e4066Sahrens 		if (prev != NULL)
1224fa9e4066Sahrens 			prev->io_sibling_next = next;
1225fa9e4066Sahrens 		if (pio->io_child == zio)
1226fa9e4066Sahrens 			pio->io_child = next;
1227fa9e4066Sahrens 		mutex_exit(&pio->io_lock);
1228fa9e4066Sahrens 
1229fa9e4066Sahrens 		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
1230fa9e4066Sahrens 		    &pio->io_children_notdone);
1231fa9e4066Sahrens 	}
1232fa9e4066Sahrens 
1233b3995adbSahrens 	/*
1234ccae0b50Seschrock 	 * Note: this I/O is now done, and will shortly be freed, so there is no
1235ccae0b50Seschrock 	 * need to clear this (or any other) flag.
1236b3995adbSahrens 	 */
1237b3995adbSahrens 	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
1238ea8dc4b6Seschrock 		spa_config_exit(spa, zio);
1239fa9e4066Sahrens 
1240fa9e4066Sahrens 	if (zio->io_waiter != NULL) {
1241fa9e4066Sahrens 		mutex_enter(&zio->io_lock);
1242fa9e4066Sahrens 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1243fa9e4066Sahrens 		zio->io_stalled = zio->io_stage;
1244fa9e4066Sahrens 		cv_broadcast(&zio->io_cv);
1245fa9e4066Sahrens 		mutex_exit(&zio->io_lock);
1246fa9e4066Sahrens 	} else {
1247c25056deSgw 		mutex_destroy(&zio->io_lock);
1248c25056deSgw 		cv_destroy(&zio->io_cv);
1249ccae0b50Seschrock 		kmem_cache_free(zio_cache, zio);
1250fa9e4066Sahrens 	}
1251fa9e4066Sahrens }
1252fa9e4066Sahrens 
1253fa9e4066Sahrens /*
1254fa9e4066Sahrens  * ==========================================================================
1255fa9e4066Sahrens  * Compression support
1256fa9e4066Sahrens  * ==========================================================================
1257fa9e4066Sahrens  */
1258fa9e4066Sahrens static void
1259fa9e4066Sahrens zio_write_compress(zio_t *zio)
1260fa9e4066Sahrens {
1261fa9e4066Sahrens 	int compress = zio->io_compress;
1262fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
1263fa9e4066Sahrens 	void *cbuf;
1264fa9e4066Sahrens 	uint64_t lsize = zio->io_size;
1265fa9e4066Sahrens 	uint64_t csize = lsize;
1266fa9e4066Sahrens 	uint64_t cbufsize = 0;
1267fa9e4066Sahrens 	int pass;
1268fa9e4066Sahrens 
1269fa9e4066Sahrens 	if (bp->blk_birth == zio->io_txg) {
1270fa9e4066Sahrens 		/*
1271fa9e4066Sahrens 		 * We're rewriting an existing block, which means we're
1272fa9e4066Sahrens 		 * working on behalf of spa_sync().  For spa_sync() to
1273fa9e4066Sahrens 		 * converge, it must eventually be the case that we don't
1274fa9e4066Sahrens 		 * have to allocate new blocks.  But compression changes
1275fa9e4066Sahrens 		 * the blocksize, which forces a reallocate, and makes
1276fa9e4066Sahrens 		 * convergence take longer.  Therefore, after the first
1277fa9e4066Sahrens 		 * few passes, stop compressing to ensure convergence.
1278fa9e4066Sahrens 		 */
1279fa9e4066Sahrens 		pass = spa_sync_pass(zio->io_spa);
1280fa9e4066Sahrens 		if (pass > zio_sync_pass.zp_dontcompress)
1281fa9e4066Sahrens 			compress = ZIO_COMPRESS_OFF;
1282fa9e4066Sahrens 	} else {
1283fa9e4066Sahrens 		ASSERT(BP_IS_HOLE(bp));
1284fa9e4066Sahrens 		pass = 1;
1285fa9e4066Sahrens 	}
1286fa9e4066Sahrens 
1287fa9e4066Sahrens 	if (compress != ZIO_COMPRESS_OFF)
1288fa9e4066Sahrens 		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
1289fa9e4066Sahrens 		    &cbuf, &csize, &cbufsize))
1290fa9e4066Sahrens 			compress = ZIO_COMPRESS_OFF;
1291fa9e4066Sahrens 
1292fa9e4066Sahrens 	if (compress != ZIO_COMPRESS_OFF && csize != 0)
1293fa9e4066Sahrens 		zio_push_transform(zio, cbuf, csize, cbufsize);
1294fa9e4066Sahrens 
1295fa9e4066Sahrens 	/*
1296fa9e4066Sahrens 	 * The final pass of spa_sync() must be all rewrites, but the first
1297fa9e4066Sahrens 	 * few passes offer a trade-off: allocating blocks defers convergence,
1298fa9e4066Sahrens 	 * but newly allocated blocks are sequential, so they can be written
1299fa9e4066Sahrens 	 * to disk faster.  Therefore, we allow the first few passes of
1300fa9e4066Sahrens 	 * spa_sync() to reallocate new blocks, but force rewrites after that.
1301fa9e4066Sahrens 	 * There should only be a handful of blocks after pass 1 in any case.
1302fa9e4066Sahrens 	 */
1303fa9e4066Sahrens 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
1304fa9e4066Sahrens 	    pass > zio_sync_pass.zp_rewrite) {
1305fa9e4066Sahrens 		ASSERT(csize != 0);
1306a2eea2e1Sahrens 		BP_SET_LSIZE(bp, lsize);
1307a2eea2e1Sahrens 		BP_SET_COMPRESS(bp, compress);
1308fa9e4066Sahrens 		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
1309fa9e4066Sahrens 	} else {
131087bd5c1eSahrens 		if (bp->blk_birth == zio->io_txg)
131187bd5c1eSahrens 			BP_ZERO(bp);
1312fa9e4066Sahrens 		if (csize == 0) {
1313fa9e4066Sahrens 			BP_ZERO(bp);
1314fa9e4066Sahrens 			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
1315fa9e4066Sahrens 		} else {
131644cd46caSbillm 			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1317fa9e4066Sahrens 			BP_SET_LSIZE(bp, lsize);
1318fa9e4066Sahrens 			BP_SET_PSIZE(bp, csize);
1319fa9e4066Sahrens 			BP_SET_COMPRESS(bp, compress);
1320fa9e4066Sahrens 			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
1321fa9e4066Sahrens 		}
1322fa9e4066Sahrens 	}
1323fa9e4066Sahrens 
1324fa9e4066Sahrens 	zio_next_stage(zio);
1325fa9e4066Sahrens }
1326fa9e4066Sahrens 
1327fa9e4066Sahrens static void
1328fa9e4066Sahrens zio_read_decompress(zio_t *zio)
1329fa9e4066Sahrens {
1330fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
1331fa9e4066Sahrens 	void *data;
1332fa9e4066Sahrens 	uint64_t size;
1333fa9e4066Sahrens 	uint64_t bufsize;
1334fa9e4066Sahrens 	int compress = BP_GET_COMPRESS(bp);
1335fa9e4066Sahrens 
1336fa9e4066Sahrens 	ASSERT(compress != ZIO_COMPRESS_OFF);
1337fa9e4066Sahrens 
1338fa9e4066Sahrens 	zio_pop_transform(zio, &data, &size, &bufsize);
1339fa9e4066Sahrens 
1340fa9e4066Sahrens 	if (zio_decompress_data(compress, data, size,
1341fa9e4066Sahrens 	    zio->io_data, zio->io_size))
1342fa9e4066Sahrens 		zio->io_error = EIO;
1343fa9e4066Sahrens 
1344fa9e4066Sahrens 	zio_buf_free(data, bufsize);
1345fa9e4066Sahrens 
1346fa9e4066Sahrens 	zio_next_stage(zio);
1347fa9e4066Sahrens }
1348fa9e4066Sahrens 
1349fa9e4066Sahrens /*
1350fa9e4066Sahrens  * ==========================================================================
1351fa9e4066Sahrens  * Gang block support
1352fa9e4066Sahrens  * ==========================================================================
1353fa9e4066Sahrens  */
1354fa9e4066Sahrens static void
1355fa9e4066Sahrens zio_gang_pipeline(zio_t *zio)
1356fa9e4066Sahrens {
1357fa9e4066Sahrens 	/*
1358fa9e4066Sahrens 	 * By default, the pipeline assumes that we're dealing with a gang
1359fa9e4066Sahrens 	 * block.  If we're not, strip out any gang-specific stages.
1360fa9e4066Sahrens 	 */
136144cd46caSbillm 	if (!BP_IS_GANG(zio->io_bp))
1362fa9e4066Sahrens 		zio->io_pipeline &= ~ZIO_GANG_STAGES;
1363fa9e4066Sahrens 
1364fa9e4066Sahrens 	zio_next_stage(zio);
1365fa9e4066Sahrens }
1366fa9e4066Sahrens 
1367fa9e4066Sahrens static void
1368fa9e4066Sahrens zio_gang_byteswap(zio_t *zio)
1369fa9e4066Sahrens {
1370fa9e4066Sahrens 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1371fa9e4066Sahrens 
1372fa9e4066Sahrens 	if (BP_SHOULD_BYTESWAP(zio->io_bp))
1373fa9e4066Sahrens 		byteswap_uint64_array(zio->io_data, zio->io_size);
1374fa9e4066Sahrens }
1375fa9e4066Sahrens 
1376fa9e4066Sahrens static void
1377fa9e4066Sahrens zio_get_gang_header(zio_t *zio)
1378fa9e4066Sahrens {
1379fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
1380fa9e4066Sahrens 	uint64_t gsize = SPA_GANGBLOCKSIZE;
1381fa9e4066Sahrens 	void *gbuf = zio_buf_alloc(gsize);
1382fa9e4066Sahrens 
138344cd46caSbillm 	ASSERT(BP_IS_GANG(bp));
1384fa9e4066Sahrens 
1385fa9e4066Sahrens 	zio_push_transform(zio, gbuf, gsize, gsize);
1386fa9e4066Sahrens 
1387fa9e4066Sahrens 	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
1388fa9e4066Sahrens 	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
1389fa9e4066Sahrens 	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
1390*0a4e9518Sgw 	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
1391fa9e4066Sahrens 
1392fa9e4066Sahrens 	zio_wait_children_done(zio);
1393fa9e4066Sahrens }
1394fa9e4066Sahrens 
1395fa9e4066Sahrens static void
1396fa9e4066Sahrens zio_read_gang_members(zio_t *zio)
1397fa9e4066Sahrens {
1398fa9e4066Sahrens 	zio_gbh_phys_t *gbh;
1399fa9e4066Sahrens 	uint64_t gsize, gbufsize, loff, lsize;
1400fa9e4066Sahrens 	int i;
1401fa9e4066Sahrens 
140244cd46caSbillm 	ASSERT(BP_IS_GANG(zio->io_bp));
1403fa9e4066Sahrens 
1404fa9e4066Sahrens 	zio_gang_byteswap(zio);
1405fa9e4066Sahrens 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1406fa9e4066Sahrens 
1407fa9e4066Sahrens 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
1408fa9e4066Sahrens 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1409fa9e4066Sahrens 		lsize = BP_GET_PSIZE(gbp);
1410fa9e4066Sahrens 
1411fa9e4066Sahrens 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
1412fa9e4066Sahrens 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
1413fa9e4066Sahrens 		ASSERT3U(loff + lsize, <=, zio->io_size);
1414fa9e4066Sahrens 		ASSERT(i < SPA_GBH_NBLKPTRS);
1415fa9e4066Sahrens 		ASSERT(!BP_IS_HOLE(gbp));
1416fa9e4066Sahrens 
1417fa9e4066Sahrens 		zio_nowait(zio_read(zio, zio->io_spa, gbp,
1418fa9e4066Sahrens 		    (char *)zio->io_data + loff, lsize, NULL, NULL,
1419ea8dc4b6Seschrock 		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
1420ea8dc4b6Seschrock 		    &zio->io_bookmark));
1421fa9e4066Sahrens 	}
1422fa9e4066Sahrens 
1423fa9e4066Sahrens 	zio_buf_free(gbh, gbufsize);
1424fa9e4066Sahrens 	zio_wait_children_done(zio);
1425fa9e4066Sahrens }
1426fa9e4066Sahrens 
1427fa9e4066Sahrens static void
1428fa9e4066Sahrens zio_rewrite_gang_members(zio_t *zio)
1429fa9e4066Sahrens {
1430fa9e4066Sahrens 	zio_gbh_phys_t *gbh;
1431fa9e4066Sahrens 	uint64_t gsize, gbufsize, loff, lsize;
1432fa9e4066Sahrens 	int i;
1433fa9e4066Sahrens 
143444cd46caSbillm 	ASSERT(BP_IS_GANG(zio->io_bp));
1435fa9e4066Sahrens 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
1436fa9e4066Sahrens 
1437fa9e4066Sahrens 	zio_gang_byteswap(zio);
1438fa9e4066Sahrens 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1439fa9e4066Sahrens 
1440fa9e4066Sahrens 	ASSERT(gsize == gbufsize);
1441fa9e4066Sahrens 
1442fa9e4066Sahrens 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
1443fa9e4066Sahrens 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1444fa9e4066Sahrens 		lsize = BP_GET_PSIZE(gbp);
1445fa9e4066Sahrens 
1446fa9e4066Sahrens 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
1447fa9e4066Sahrens 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
1448fa9e4066Sahrens 		ASSERT3U(loff + lsize, <=, zio->io_size);
1449fa9e4066Sahrens 		ASSERT(i < SPA_GBH_NBLKPTRS);
1450fa9e4066Sahrens 		ASSERT(!BP_IS_HOLE(gbp));
1451fa9e4066Sahrens 
1452fa9e4066Sahrens 		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
1453fa9e4066Sahrens 		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
1454ea8dc4b6Seschrock 		    NULL, NULL, zio->io_priority, zio->io_flags,
1455ea8dc4b6Seschrock 		    &zio->io_bookmark));
1456fa9e4066Sahrens 	}
1457fa9e4066Sahrens 
1458fa9e4066Sahrens 	zio_push_transform(zio, gbh, gsize, gbufsize);
1459fa9e4066Sahrens 	zio_wait_children_ready(zio);
1460fa9e4066Sahrens }
1461fa9e4066Sahrens 
1462fa9e4066Sahrens static void
1463fa9e4066Sahrens zio_free_gang_members(zio_t *zio)
1464fa9e4066Sahrens {
1465fa9e4066Sahrens 	zio_gbh_phys_t *gbh;
1466fa9e4066Sahrens 	uint64_t gsize, gbufsize;
1467fa9e4066Sahrens 	int i;
1468fa9e4066Sahrens 
146944cd46caSbillm 	ASSERT(BP_IS_GANG(zio->io_bp));
1470fa9e4066Sahrens 
1471fa9e4066Sahrens 	zio_gang_byteswap(zio);
1472fa9e4066Sahrens 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1473fa9e4066Sahrens 
1474fa9e4066Sahrens 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1475fa9e4066Sahrens 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1476fa9e4066Sahrens 
1477fa9e4066Sahrens 		if (BP_IS_HOLE(gbp))
1478fa9e4066Sahrens 			continue;
1479fa9e4066Sahrens 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
1480fa9e4066Sahrens 		    gbp, NULL, NULL));
1481fa9e4066Sahrens 	}
1482fa9e4066Sahrens 
1483fa9e4066Sahrens 	zio_buf_free(gbh, gbufsize);
1484fa9e4066Sahrens 	zio_next_stage(zio);
1485fa9e4066Sahrens }
1486fa9e4066Sahrens 
1487fa9e4066Sahrens static void
1488fa9e4066Sahrens zio_claim_gang_members(zio_t *zio)
1489fa9e4066Sahrens {
1490fa9e4066Sahrens 	zio_gbh_phys_t *gbh;
1491fa9e4066Sahrens 	uint64_t gsize, gbufsize;
1492fa9e4066Sahrens 	int i;
1493fa9e4066Sahrens 
149444cd46caSbillm 	ASSERT(BP_IS_GANG(zio->io_bp));
1495fa9e4066Sahrens 
1496fa9e4066Sahrens 	zio_gang_byteswap(zio);
1497fa9e4066Sahrens 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
1498fa9e4066Sahrens 
1499fa9e4066Sahrens 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1500fa9e4066Sahrens 		blkptr_t *gbp = &gbh->zg_blkptr[i];
1501fa9e4066Sahrens 		if (BP_IS_HOLE(gbp))
1502fa9e4066Sahrens 			continue;
1503fa9e4066Sahrens 		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
1504fa9e4066Sahrens 		    gbp, NULL, NULL));
1505fa9e4066Sahrens 	}
1506fa9e4066Sahrens 
1507fa9e4066Sahrens 	zio_buf_free(gbh, gbufsize);
1508fa9e4066Sahrens 	zio_next_stage(zio);
1509fa9e4066Sahrens }
1510fa9e4066Sahrens 
1511fa9e4066Sahrens static void
1512fa9e4066Sahrens zio_write_allocate_gang_member_done(zio_t *zio)
1513fa9e4066Sahrens {
1514fa9e4066Sahrens 	zio_t *pio = zio->io_parent;
151544cd46caSbillm 	dva_t *cdva = zio->io_bp->blk_dva;
151644cd46caSbillm 	dva_t *pdva = pio->io_bp->blk_dva;
1517fa9e4066Sahrens 	uint64_t asize;
151844cd46caSbillm 	int d;
1519fa9e4066Sahrens 
152044cd46caSbillm 	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
152144cd46caSbillm 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
152244cd46caSbillm 	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
152344cd46caSbillm 	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
1524fa9e4066Sahrens 
1525fa9e4066Sahrens 	mutex_enter(&pio->io_lock);
152644cd46caSbillm 	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
152744cd46caSbillm 		ASSERT(DVA_GET_GANG(&pdva[d]));
152844cd46caSbillm 		asize = DVA_GET_ASIZE(&pdva[d]);
152944cd46caSbillm 		asize += DVA_GET_ASIZE(&cdva[d]);
153044cd46caSbillm 		DVA_SET_ASIZE(&pdva[d], asize);
153144cd46caSbillm 	}
1532fa9e4066Sahrens 	mutex_exit(&pio->io_lock);
1533fa9e4066Sahrens }
1534fa9e4066Sahrens 
1535*0a4e9518Sgw static int
15368654d025Sperrin zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
1537fa9e4066Sahrens {
1538fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
153944cd46caSbillm 	dva_t *dva = bp->blk_dva;
154044cd46caSbillm 	spa_t *spa = zio->io_spa;
1541fa9e4066Sahrens 	zio_gbh_phys_t *gbh;
154244cd46caSbillm 	uint64_t txg = zio->io_txg;
1543fa9e4066Sahrens 	uint64_t resid = zio->io_size;
1544fa9e4066Sahrens 	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
1545fa9e4066Sahrens 	uint64_t gsize, loff, lsize;
1546fa9e4066Sahrens 	uint32_t gbps_left;
154744cd46caSbillm 	int ndvas = zio->io_ndvas;
154844cd46caSbillm 	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
1549fa9e4066Sahrens 	int error;
155044cd46caSbillm 	int i, d;
1551fa9e4066Sahrens 
1552fa9e4066Sahrens 	gsize = SPA_GANGBLOCKSIZE;
1553fa9e4066Sahrens 	gbps_left = SPA_GBH_NBLKPTRS;
1554fa9e4066Sahrens 
15558654d025Sperrin 	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
15568654d025Sperrin 	    B_FALSE);
1557*0a4e9518Sgw 	if (error)
1558*0a4e9518Sgw 		return (error);
1559fa9e4066Sahrens 
156044cd46caSbillm 	for (d = 0; d < gbh_ndvas; d++)
156144cd46caSbillm 		DVA_SET_GANG(&dva[d], 1);
1562fa9e4066Sahrens 
156344cd46caSbillm 	bp->blk_birth = txg;
1564fa9e4066Sahrens 
1565fa9e4066Sahrens 	gbh = zio_buf_alloc(gsize);
1566fa9e4066Sahrens 	bzero(gbh, gsize);
1567fa9e4066Sahrens 
156844cd46caSbillm 	/* We need to test multi-level gang blocks */
156944cd46caSbillm 	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
157044cd46caSbillm 		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
157144cd46caSbillm 
1572fa9e4066Sahrens 	for (loff = 0, i = 0; loff != zio->io_size;
1573fa9e4066Sahrens 	    loff += lsize, resid -= lsize, gbps_left--, i++) {
1574fa9e4066Sahrens 		blkptr_t *gbp = &gbh->zg_blkptr[i];
157544cd46caSbillm 		dva = gbp->blk_dva;
1576fa9e4066Sahrens 
1577fa9e4066Sahrens 		ASSERT(gbps_left != 0);
1578fa9e4066Sahrens 		maxalloc = MIN(maxalloc, resid);
1579fa9e4066Sahrens 
1580fa9e4066Sahrens 		while (resid <= maxalloc * gbps_left) {
15818654d025Sperrin 			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
158267bd71c6Sperrin 			    txg, bp, B_FALSE);
1583fa9e4066Sahrens 			if (error == 0)
1584fa9e4066Sahrens 				break;
1585fa9e4066Sahrens 			ASSERT3U(error, ==, ENOSPC);
1586*0a4e9518Sgw 			/* XXX - free up previous allocations? */
1587fa9e4066Sahrens 			if (maxalloc == SPA_MINBLOCKSIZE)
1588*0a4e9518Sgw 				return (error);
1589fa9e4066Sahrens 			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
1590fa9e4066Sahrens 		}
1591fa9e4066Sahrens 
1592fa9e4066Sahrens 		if (resid <= maxalloc * gbps_left) {
1593fa9e4066Sahrens 			lsize = maxalloc;
1594fa9e4066Sahrens 			BP_SET_LSIZE(gbp, lsize);
1595fa9e4066Sahrens 			BP_SET_PSIZE(gbp, lsize);
1596fa9e4066Sahrens 			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
159744cd46caSbillm 			gbp->blk_birth = txg;
159844cd46caSbillm 			zio_nowait(zio_rewrite(zio, spa,
159944cd46caSbillm 			    zio->io_checksum, txg, gbp,
1600fa9e4066Sahrens 			    (char *)zio->io_data + loff, lsize,
1601fa9e4066Sahrens 			    zio_write_allocate_gang_member_done, NULL,
1602ea8dc4b6Seschrock 			    zio->io_priority, zio->io_flags,
1603ea8dc4b6Seschrock 			    &zio->io_bookmark));
1604fa9e4066Sahrens 		} else {
1605fa9e4066Sahrens 			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
1606fa9e4066Sahrens 			ASSERT(lsize != SPA_MINBLOCKSIZE);
160744cd46caSbillm 			zio_nowait(zio_write_allocate(zio, spa,
160844cd46caSbillm 			    zio->io_checksum, txg, gbp,
1609fa9e4066Sahrens 			    (char *)zio->io_data + loff, lsize,
1610fa9e4066Sahrens 			    zio_write_allocate_gang_member_done, NULL,
1611fa9e4066Sahrens 			    zio->io_priority, zio->io_flags));
1612fa9e4066Sahrens 		}
1613fa9e4066Sahrens 	}
1614fa9e4066Sahrens 
1615fa9e4066Sahrens 	ASSERT(resid == 0 && loff == zio->io_size);
1616fa9e4066Sahrens 
1617fa9e4066Sahrens 	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
1618fa9e4066Sahrens 
1619fa9e4066Sahrens 	zio_push_transform(zio, gbh, gsize, gsize);
162044cd46caSbillm 	/*
162144cd46caSbillm 	 * As much as we'd like this to be zio_wait_children_ready(),
162244cd46caSbillm 	 * updating our ASIZE doesn't happen until the io_done callback,
162344cd46caSbillm 	 * so we have to wait for that to finish in order for our BP
162444cd46caSbillm 	 * to be stable.
162544cd46caSbillm 	 */
1626fa9e4066Sahrens 	zio_wait_children_done(zio);
1627*0a4e9518Sgw 	return (0);
1628fa9e4066Sahrens }
1629fa9e4066Sahrens 
1630fa9e4066Sahrens /*
1631fa9e4066Sahrens  * ==========================================================================
1632fa9e4066Sahrens  * Allocate and free blocks
1633fa9e4066Sahrens  * ==========================================================================
1634fa9e4066Sahrens  */
1635fa9e4066Sahrens static void
1636fa9e4066Sahrens zio_dva_allocate(zio_t *zio)
1637fa9e4066Sahrens {
16388654d025Sperrin 	spa_t *spa = zio->io_spa;
16398654d025Sperrin 	metaslab_class_t *mc = spa->spa_normal_class;
1640fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
1641fa9e4066Sahrens 	int error;
1642fa9e4066Sahrens 
1643fa9e4066Sahrens 	ASSERT(BP_IS_HOLE(bp));
164444cd46caSbillm 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
164544cd46caSbillm 	ASSERT3U(zio->io_ndvas, >, 0);
16468654d025Sperrin 	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
1647fa9e4066Sahrens 
1648fa9e4066Sahrens 	/* For testing, make some blocks above a certain size be gang blocks */
1649fa9e4066Sahrens 	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
1650*0a4e9518Sgw 		error = zio_write_allocate_gang_members(zio, mc);
1651*0a4e9518Sgw 		if (error)
1652*0a4e9518Sgw 			zio->io_error = error;
1653fa9e4066Sahrens 		return;
1654fa9e4066Sahrens 	}
1655fa9e4066Sahrens 
1656*0a4e9518Sgw 	/*
1657*0a4e9518Sgw 	 * For testing purposes, we force I/Os to retry. We don't allow
1658*0a4e9518Sgw 	 * retries beyond the first pass since those I/Os are non-allocating
1659*0a4e9518Sgw 	 * writes. We do this after the gang block testing block so that
1660*0a4e9518Sgw 	 * they don't inherit the retry flag.
1661*0a4e9518Sgw 	 */
1662*0a4e9518Sgw 	if (zio_io_fail_shift &&
1663*0a4e9518Sgw 	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
1664*0a4e9518Sgw 	    zio_io_should_fail(zio_io_fail_shift))
1665*0a4e9518Sgw 		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
1666*0a4e9518Sgw 
1667fa9e4066Sahrens 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1668fa9e4066Sahrens 
16698654d025Sperrin 	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
167067bd71c6Sperrin 	    zio->io_txg, NULL, B_FALSE);
1671fa9e4066Sahrens 
1672fa9e4066Sahrens 	if (error == 0) {
1673fa9e4066Sahrens 		bp->blk_birth = zio->io_txg;
1674*0a4e9518Sgw 	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
1675*0a4e9518Sgw 		error = zio_write_allocate_gang_members(zio, mc);
1676*0a4e9518Sgw 		if (error == 0)
1677*0a4e9518Sgw 			return;
1678*0a4e9518Sgw 		zio->io_error = error;
1679fa9e4066Sahrens 	} else {
1680fa9e4066Sahrens 		zio->io_error = error;
1681fa9e4066Sahrens 	}
1682fa9e4066Sahrens 	zio_next_stage(zio);
1683fa9e4066Sahrens }
1684fa9e4066Sahrens 
1685fa9e4066Sahrens static void
1686fa9e4066Sahrens zio_dva_free(zio_t *zio)
1687fa9e4066Sahrens {
1688fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
1689fa9e4066Sahrens 
1690d80c45e0Sbonwick 	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
1691fa9e4066Sahrens 
1692fa9e4066Sahrens 	BP_ZERO(bp);
1693fa9e4066Sahrens 
1694fa9e4066Sahrens 	zio_next_stage(zio);
1695fa9e4066Sahrens }
1696fa9e4066Sahrens 
1697fa9e4066Sahrens static void
1698fa9e4066Sahrens zio_dva_claim(zio_t *zio)
1699fa9e4066Sahrens {
1700d80c45e0Sbonwick 	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
1701fa9e4066Sahrens 
1702fa9e4066Sahrens 	zio_next_stage(zio);
1703fa9e4066Sahrens }
1704fa9e4066Sahrens 
1705fa9e4066Sahrens /*
1706fa9e4066Sahrens  * ==========================================================================
1707fa9e4066Sahrens  * Read and write to physical devices
1708fa9e4066Sahrens  * ==========================================================================
1709fa9e4066Sahrens  */
1710fa9e4066Sahrens 
1711fa9e4066Sahrens static void
171244cd46caSbillm zio_vdev_io_start(zio_t *zio)
1713fa9e4066Sahrens {
1714fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
171544cd46caSbillm 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
171644cd46caSbillm 	blkptr_t *bp = zio->io_bp;
171744cd46caSbillm 	uint64_t align;
1718*0a4e9518Sgw 	spa_t *spa = zio->io_spa;
1719*0a4e9518Sgw 
1720*0a4e9518Sgw 	/*
1721*0a4e9518Sgw 	 * If the pool is already in a failure state then just suspend
1722*0a4e9518Sgw 	 * this IO until the problem is resolved. We will reissue them
1723*0a4e9518Sgw 	 * at that time.
1724*0a4e9518Sgw 	 */
1725*0a4e9518Sgw 	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
1726*0a4e9518Sgw 	    zio->io_type == ZIO_TYPE_WRITE) {
1727*0a4e9518Sgw 		zio_vdev_suspend_io(zio);
1728*0a4e9518Sgw 		return;
1729*0a4e9518Sgw 	}
173044cd46caSbillm 
173144cd46caSbillm 	if (vd == NULL) {
173244cd46caSbillm 		/* The mirror_ops handle multiple DVAs in a single BP */
173344cd46caSbillm 		vdev_mirror_ops.vdev_op_io_start(zio);
173444cd46caSbillm 		return;
173544cd46caSbillm 	}
173644cd46caSbillm 
173744cd46caSbillm 	align = 1ULL << tvd->vdev_ashift;
1738fa9e4066Sahrens 
1739ecc2d604Sbonwick 	if (zio->io_retries == 0 && vd == tvd)
1740fa9e4066Sahrens 		zio->io_flags |= ZIO_FLAG_FAILFAST;
1741fa9e4066Sahrens 
174244cd46caSbillm 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
174344cd46caSbillm 	    vd->vdev_children == 0) {
1744fa9e4066Sahrens 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
1745fa9e4066Sahrens 		zio->io_offset += VDEV_LABEL_START_SIZE;
1746fa9e4066Sahrens 	}
1747fa9e4066Sahrens 
1748ecc2d604Sbonwick 	if (P2PHASE(zio->io_size, align) != 0) {
1749ecc2d604Sbonwick 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
1750ecc2d604Sbonwick 		char *abuf = zio_buf_alloc(asize);
1751ecc2d604Sbonwick 		ASSERT(vd == tvd);
1752ecc2d604Sbonwick 		if (zio->io_type == ZIO_TYPE_WRITE) {
1753ecc2d604Sbonwick 			bcopy(zio->io_data, abuf, zio->io_size);
1754ecc2d604Sbonwick 			bzero(abuf + zio->io_size, asize - zio->io_size);
1755ecc2d604Sbonwick 		}
1756ecc2d604Sbonwick 		zio_push_transform(zio, abuf, asize, asize);
1757ecc2d604Sbonwick 		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
1758ecc2d604Sbonwick 		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
1759ecc2d604Sbonwick 	}
1760ecc2d604Sbonwick 
1761ecc2d604Sbonwick 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
1762ecc2d604Sbonwick 	ASSERT(P2PHASE(zio->io_size, align) == 0);
1763ecc2d604Sbonwick 	ASSERT(bp == NULL ||
1764ecc2d604Sbonwick 	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
1765fa9e4066Sahrens 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
1766fa9e4066Sahrens 
1767fa9e4066Sahrens 	vdev_io_start(zio);
1768fa9e4066Sahrens 
1769fa9e4066Sahrens 	/* zio_next_stage_async() gets called from io completion interrupt */
1770fa9e4066Sahrens }
1771fa9e4066Sahrens 
1772fa9e4066Sahrens static void
1773fa9e4066Sahrens zio_vdev_io_done(zio_t *zio)
1774fa9e4066Sahrens {
177544cd46caSbillm 	if (zio->io_vd == NULL)
177644cd46caSbillm 		/* The mirror_ops handle multiple DVAs in a single BP */
177744cd46caSbillm 		vdev_mirror_ops.vdev_op_io_done(zio);
177844cd46caSbillm 	else
177944cd46caSbillm 		vdev_io_done(zio);
1780fa9e4066Sahrens }
1781fa9e4066Sahrens 
1782fa9e4066Sahrens /* XXPOLICY */
1783ea8dc4b6Seschrock boolean_t
1784fa9e4066Sahrens zio_should_retry(zio_t *zio)
1785fa9e4066Sahrens {
1786fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
1787fa9e4066Sahrens 
1788fa9e4066Sahrens 	if (zio->io_error == 0)
1789fa9e4066Sahrens 		return (B_FALSE);
1790fa9e4066Sahrens 	if (zio->io_delegate_list != NULL)
1791fa9e4066Sahrens 		return (B_FALSE);
179244cd46caSbillm 	if (vd && vd != vd->vdev_top)
1793fa9e4066Sahrens 		return (B_FALSE);
1794fa9e4066Sahrens 	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
1795fa9e4066Sahrens 		return (B_FALSE);
1796ea8dc4b6Seschrock 	if (zio->io_retries > 0)
1797fa9e4066Sahrens 		return (B_FALSE);
1798fa9e4066Sahrens 
1799fa9e4066Sahrens 	return (B_TRUE);
1800fa9e4066Sahrens }
1801fa9e4066Sahrens 
1802fa9e4066Sahrens static void
1803fa9e4066Sahrens zio_vdev_io_assess(zio_t *zio)
1804fa9e4066Sahrens {
1805fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
180644cd46caSbillm 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
1807fa9e4066Sahrens 
1808fa9e4066Sahrens 	ASSERT(zio->io_vsd == NULL);
1809fa9e4066Sahrens 
1810ecc2d604Sbonwick 	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
1811ecc2d604Sbonwick 		void *abuf;
1812ecc2d604Sbonwick 		uint64_t asize;
1813ecc2d604Sbonwick 		ASSERT(vd == tvd);
1814ecc2d604Sbonwick 		zio_pop_transform(zio, &abuf, &asize, &asize);
1815ecc2d604Sbonwick 		if (zio->io_type == ZIO_TYPE_READ)
1816ecc2d604Sbonwick 			bcopy(abuf, zio->io_data, zio->io_size);
1817ecc2d604Sbonwick 		zio_buf_free(abuf, asize);
1818ecc2d604Sbonwick 		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
1819ecc2d604Sbonwick 	}
1820ecc2d604Sbonwick 
1821ea8dc4b6Seschrock 	if (zio_injection_enabled && !zio->io_error)
1822ea8dc4b6Seschrock 		zio->io_error = zio_handle_fault_injection(zio, EIO);
1823ea8dc4b6Seschrock 
1824fa9e4066Sahrens 	/*
1825fa9e4066Sahrens 	 * If the I/O failed, determine whether we should attempt to retry it.
1826fa9e4066Sahrens 	 */
1827fa9e4066Sahrens 	/* XXPOLICY */
1828fa9e4066Sahrens 	if (zio_should_retry(zio)) {
1829fa9e4066Sahrens 		ASSERT(tvd == vd);
1830fa9e4066Sahrens 
1831fa9e4066Sahrens 		zio->io_retries++;
1832fa9e4066Sahrens 		zio->io_error = 0;
1833b3995adbSahrens 		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT |
1834b3995adbSahrens 		    ZIO_FLAG_CONFIG_GRABBED;
1835fa9e4066Sahrens 		/* XXPOLICY */
1836fa9e4066Sahrens 		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
1837fa9e4066Sahrens 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
183844cd46caSbillm 		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
1839fa9e4066Sahrens 
1840fa9e4066Sahrens 		dprintf("retry #%d for %s to %s offset %llx\n",
1841fa9e4066Sahrens 		    zio->io_retries, zio_type_name[zio->io_type],
1842fa9e4066Sahrens 		    vdev_description(vd), zio->io_offset);
1843fa9e4066Sahrens 
1844ea8dc4b6Seschrock 		zio_next_stage_async(zio);
1845ea8dc4b6Seschrock 		return;
1846ea8dc4b6Seschrock 	}
1847fa9e4066Sahrens 
1848fa9e4066Sahrens 	zio_next_stage(zio);
1849fa9e4066Sahrens }
1850fa9e4066Sahrens 
1851fa9e4066Sahrens void
1852fa9e4066Sahrens zio_vdev_io_reissue(zio_t *zio)
1853fa9e4066Sahrens {
1854fa9e4066Sahrens 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1855fa9e4066Sahrens 	ASSERT(zio->io_error == 0);
1856fa9e4066Sahrens 
1857fa9e4066Sahrens 	zio->io_stage--;
1858fa9e4066Sahrens }
1859fa9e4066Sahrens 
1860fa9e4066Sahrens void
1861fa9e4066Sahrens zio_vdev_io_redone(zio_t *zio)
1862fa9e4066Sahrens {
1863fa9e4066Sahrens 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
1864fa9e4066Sahrens 
1865fa9e4066Sahrens 	zio->io_stage--;
1866fa9e4066Sahrens }
1867fa9e4066Sahrens 
1868fa9e4066Sahrens void
1869fa9e4066Sahrens zio_vdev_io_bypass(zio_t *zio)
1870fa9e4066Sahrens {
1871fa9e4066Sahrens 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1872fa9e4066Sahrens 	ASSERT(zio->io_error == 0);
1873fa9e4066Sahrens 
1874fa9e4066Sahrens 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
1875fa9e4066Sahrens 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
1876fa9e4066Sahrens }
1877fa9e4066Sahrens 
1878fa9e4066Sahrens /*
1879fa9e4066Sahrens  * ==========================================================================
1880fa9e4066Sahrens  * Generate and verify checksums
1881fa9e4066Sahrens  * ==========================================================================
1882fa9e4066Sahrens  */
1883fa9e4066Sahrens static void
1884fa9e4066Sahrens zio_checksum_generate(zio_t *zio)
1885fa9e4066Sahrens {
1886fa9e4066Sahrens 	int checksum = zio->io_checksum;
1887fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
1888fa9e4066Sahrens 
1889fa9e4066Sahrens 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1890fa9e4066Sahrens 
1891fa9e4066Sahrens 	BP_SET_CHECKSUM(bp, checksum);
1892fa9e4066Sahrens 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1893fa9e4066Sahrens 
1894fa9e4066Sahrens 	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
1895fa9e4066Sahrens 
1896fa9e4066Sahrens 	zio_next_stage(zio);
1897fa9e4066Sahrens }
1898fa9e4066Sahrens 
1899fa9e4066Sahrens static void
1900fa9e4066Sahrens zio_gang_checksum_generate(zio_t *zio)
1901fa9e4066Sahrens {
1902fa9e4066Sahrens 	zio_cksum_t zc;
1903fa9e4066Sahrens 	zio_gbh_phys_t *gbh = zio->io_data;
1904fa9e4066Sahrens 
190544cd46caSbillm 	ASSERT(BP_IS_GANG(zio->io_bp));
1906fa9e4066Sahrens 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
1907fa9e4066Sahrens 
1908fa9e4066Sahrens 	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
1909fa9e4066Sahrens 
1910fa9e4066Sahrens 	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
1911fa9e4066Sahrens 
1912fa9e4066Sahrens 	zio_next_stage(zio);
1913fa9e4066Sahrens }
1914fa9e4066Sahrens 
1915fa9e4066Sahrens static void
1916fa9e4066Sahrens zio_checksum_verify(zio_t *zio)
1917fa9e4066Sahrens {
1918fa9e4066Sahrens 	if (zio->io_bp != NULL) {
1919fa9e4066Sahrens 		zio->io_error = zio_checksum_error(zio);
1920ea8dc4b6Seschrock 		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
1921ea8dc4b6Seschrock 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1922ea8dc4b6Seschrock 			    zio->io_spa, zio->io_vd, zio, 0, 0);
1923fa9e4066Sahrens 	}
1924fa9e4066Sahrens 
1925fa9e4066Sahrens 	zio_next_stage(zio);
1926fa9e4066Sahrens }
1927fa9e4066Sahrens 
1928fa9e4066Sahrens /*
1929fa9e4066Sahrens  * Called by RAID-Z to ensure we don't compute the checksum twice.
1930fa9e4066Sahrens  */
1931fa9e4066Sahrens void
1932fa9e4066Sahrens zio_checksum_verified(zio_t *zio)
1933fa9e4066Sahrens {
1934fa9e4066Sahrens 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
1935fa9e4066Sahrens }
1936fa9e4066Sahrens 
1937fa9e4066Sahrens /*
1938fa9e4066Sahrens  * Set the external verifier for a gang block based on stuff in the bp
1939fa9e4066Sahrens  */
1940fa9e4066Sahrens void
1941fa9e4066Sahrens zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
1942fa9e4066Sahrens {
194344cd46caSbillm 	blkptr_t *bp = zio->io_bp;
194444cd46caSbillm 
194544cd46caSbillm 	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
194644cd46caSbillm 	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
194744cd46caSbillm 	zcp->zc_word[2] = bp->blk_birth;
1948fa9e4066Sahrens 	zcp->zc_word[3] = 0;
1949fa9e4066Sahrens }
1950fa9e4066Sahrens 
1951fa9e4066Sahrens /*
1952fa9e4066Sahrens  * ==========================================================================
1953fa9e4066Sahrens  * Define the pipeline
1954fa9e4066Sahrens  * ==========================================================================
1955fa9e4066Sahrens  */
/*
 * Function type of a pipeline stage handler: it takes the zio being
 * processed and returns nothing.
 */
1956fa9e4066Sahrens typedef void zio_pipe_stage_t(zio_t *zio);
1957fa9e4066Sahrens 
1958fa9e4066Sahrens static void
1959fa9e4066Sahrens zio_badop(zio_t *zio)
1960fa9e4066Sahrens {
1961fa9e4066Sahrens 	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
1962fa9e4066Sahrens }
1963fa9e4066Sahrens 
/*
 * Stage dispatch table.  zio_next_stage() and zio_next_stage_async() run
 * zio_pipeline[zio->io_stage] for the stage a zio has just advanced to, so
 * the order of the entries is significant: entry N is the handler for stage
 * number N.  (NOTE(review): the stage numbers are presumably the ZIO_STAGE_*
 * constants from zio_impl.h -- confirm before adding or moving an entry.)
 * The first and last slots hold zio_badop(), which panics.
 */
1964fa9e4066Sahrens zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
1965fa9e4066Sahrens 	zio_badop,
1966fa9e4066Sahrens 	zio_wait_children_ready,
1967fa9e4066Sahrens 	zio_write_compress,
1968fa9e4066Sahrens 	zio_checksum_generate,
1969fa9e4066Sahrens 	zio_gang_pipeline,
1970fa9e4066Sahrens 	zio_get_gang_header,
1971fa9e4066Sahrens 	zio_rewrite_gang_members,
1972fa9e4066Sahrens 	zio_free_gang_members,
1973fa9e4066Sahrens 	zio_claim_gang_members,
1974fa9e4066Sahrens 	zio_dva_allocate,
1975fa9e4066Sahrens 	zio_dva_free,
1976fa9e4066Sahrens 	zio_dva_claim,
1977fa9e4066Sahrens 	zio_gang_checksum_generate,
1978fa9e4066Sahrens 	zio_ready,
1979*0a4e9518Sgw 	zio_read_init,
1980fa9e4066Sahrens 	zio_vdev_io_start,
1981fa9e4066Sahrens 	zio_vdev_io_done,
1982fa9e4066Sahrens 	zio_vdev_io_assess,
1983fa9e4066Sahrens 	zio_wait_children_done,
1984fa9e4066Sahrens 	zio_checksum_verify,
1985fa9e4066Sahrens 	zio_read_gang_members,
1986fa9e4066Sahrens 	zio_read_decompress,
1987*0a4e9518Sgw 	zio_assess,
1988fa9e4066Sahrens 	zio_done,
1989fa9e4066Sahrens 	zio_badop
1990fa9e4066Sahrens };
1991fa9e4066Sahrens 
1992fa9e4066Sahrens /*
1993fa9e4066Sahrens  * Move an I/O to the next stage of the pipeline and execute that stage.
1994fa9e4066Sahrens  * There's no locking on io_stage because there's no legitimate way for
1995fa9e4066Sahrens  * multiple threads to be attempting to process the same I/O.
1996fa9e4066Sahrens  */
1997fa9e4066Sahrens void
1998fa9e4066Sahrens zio_next_stage(zio_t *zio)
1999fa9e4066Sahrens {
2000fa9e4066Sahrens 	uint32_t pipeline = zio->io_pipeline;
2001fa9e4066Sahrens 
2002fa9e4066Sahrens 	ASSERT(!MUTEX_HELD(&zio->io_lock));
2003fa9e4066Sahrens 
2004fa9e4066Sahrens 	if (zio->io_error) {
2005fa9e4066Sahrens 		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
2006fa9e4066Sahrens 		    zio, vdev_description(zio->io_vd),
2007fa9e4066Sahrens 		    zio->io_offset, zio->io_stage, zio->io_error);
2008fa9e4066Sahrens 		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
2009fa9e4066Sahrens 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
2010fa9e4066Sahrens 	}
2011fa9e4066Sahrens 
2012fa9e4066Sahrens 	while (((1U << ++zio->io_stage) & pipeline) == 0)
2013fa9e4066Sahrens 		continue;
2014fa9e4066Sahrens 
2015fa9e4066Sahrens 	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
2016fa9e4066Sahrens 	ASSERT(zio->io_stalled == 0);
2017fa9e4066Sahrens 
20189bc11082Sek 	/*
20199bc11082Sek 	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
20209bc11082Sek 	 */
20219bc11082Sek 	if (((1U << zio->io_stage) & zio->io_async_stages) &&
20229bc11082Sek 	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
20239bc11082Sek 	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
20249bc11082Sek 		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
20259bc11082Sek 		(void) taskq_dispatch(tq,
20269bc11082Sek 		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
20279bc11082Sek 	} else {
20289bc11082Sek 		zio_pipeline[zio->io_stage](zio);
20299bc11082Sek 	}
2030fa9e4066Sahrens }
2031fa9e4066Sahrens 
2032fa9e4066Sahrens void
2033fa9e4066Sahrens zio_next_stage_async(zio_t *zio)
2034fa9e4066Sahrens {
2035fa9e4066Sahrens 	taskq_t *tq;
2036fa9e4066Sahrens 	uint32_t pipeline = zio->io_pipeline;
2037fa9e4066Sahrens 
2038fa9e4066Sahrens 	ASSERT(!MUTEX_HELD(&zio->io_lock));
2039fa9e4066Sahrens 
2040fa9e4066Sahrens 	if (zio->io_error) {
2041fa9e4066Sahrens 		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
2042fa9e4066Sahrens 		    zio, vdev_description(zio->io_vd),
2043fa9e4066Sahrens 		    zio->io_offset, zio->io_stage, zio->io_error);
2044fa9e4066Sahrens 		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
2045fa9e4066Sahrens 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
2046fa9e4066Sahrens 	}
2047fa9e4066Sahrens 
2048fa9e4066Sahrens 	while (((1U << ++zio->io_stage) & pipeline) == 0)
2049fa9e4066Sahrens 		continue;
2050fa9e4066Sahrens 
2051fa9e4066Sahrens 	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
2052fa9e4066Sahrens 	ASSERT(zio->io_stalled == 0);
2053fa9e4066Sahrens 
2054fa9e4066Sahrens 	/*
2055fa9e4066Sahrens 	 * For performance, we'll probably want two sets of task queues:
2056fa9e4066Sahrens 	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
2057fa9e4066Sahrens 	 * part is for read performance: since we have to make a pass over
2058fa9e4066Sahrens 	 * the data to checksum it anyway, we want to do this on the same CPU
2059fa9e4066Sahrens 	 * that issued the read, because (assuming CPU scheduling affinity)
2060fa9e4066Sahrens 	 * that thread is probably still there.  Getting this optimization
2061fa9e4066Sahrens 	 * right avoids performance-hostile cache-to-cache transfers.
2062fa9e4066Sahrens 	 *
2063fa9e4066Sahrens 	 * Note that having two sets of task queues is also necessary for
2064fa9e4066Sahrens 	 * correctness: if all of the issue threads get bogged down waiting
2065fa9e4066Sahrens 	 * for dependent reads (e.g. metaslab freelist) to complete, then
2066fa9e4066Sahrens 	 * there won't be any threads available to service I/O completion
2067fa9e4066Sahrens 	 * interrupts.
2068fa9e4066Sahrens 	 */
2069fa9e4066Sahrens 	if ((1U << zio->io_stage) & zio->io_async_stages) {
2070fa9e4066Sahrens 		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
2071fa9e4066Sahrens 			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
2072fa9e4066Sahrens 		else
2073fa9e4066Sahrens 			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
2074fa9e4066Sahrens 		(void) taskq_dispatch(tq,
2075fa9e4066Sahrens 		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
2076fa9e4066Sahrens 	} else {
2077fa9e4066Sahrens 		zio_pipeline[zio->io_stage](zio);
2078fa9e4066Sahrens 	}
2079fa9e4066Sahrens }
2080fa9e4066Sahrens 
2081*0a4e9518Sgw void
2082*0a4e9518Sgw zio_resubmit_stage_async(void *arg)
2083*0a4e9518Sgw {
2084*0a4e9518Sgw 	zio_t *zio = (zio_t *)(uintptr_t)arg;
2085*0a4e9518Sgw 
2086*0a4e9518Sgw 	zio_next_stage_async(zio);
2087*0a4e9518Sgw }
2088*0a4e9518Sgw 
2089d63d470bSgw static boolean_t
2090*0a4e9518Sgw zio_io_should_fail(uint16_t range)
2091d63d470bSgw {
2092d63d470bSgw 	static uint16_t	allocs = 0;
2093d63d470bSgw 
2094*0a4e9518Sgw 	return (P2PHASE(allocs++, 1U<<range) == 0);
2095d63d470bSgw }
2096d63d470bSgw 
2097fa9e4066Sahrens /*
2098fa9e4066Sahrens  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2099fa9e4066Sahrens  */
2100fa9e4066Sahrens int
210167bd71c6Sperrin zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
210267bd71c6Sperrin     uint64_t txg)
2103fa9e4066Sahrens {
2104fa9e4066Sahrens 	int error;
2105fa9e4066Sahrens 
2106ea8dc4b6Seschrock 	spa_config_enter(spa, RW_READER, FTAG);
2107fa9e4066Sahrens 
2108*0a4e9518Sgw 	if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
2109d63d470bSgw 		spa_config_exit(spa, FTAG);
2110d63d470bSgw 		return (ENOSPC);
2111d63d470bSgw 	}
2112d63d470bSgw 
211367bd71c6Sperrin 	/*
21148654d025Sperrin 	 * We were passed the previous log block's DVA in bp->blk_dva[0].
21158654d025Sperrin 	 * We use that as a hint for which vdev to allocate from next.
211667bd71c6Sperrin 	 */
21178654d025Sperrin 	error = metaslab_alloc(spa, spa->spa_log_class, size,
21188654d025Sperrin 	    new_bp, 1, txg, old_bp, B_TRUE);
21198654d025Sperrin 
21208654d025Sperrin 	if (error)
21218654d025Sperrin 		error = metaslab_alloc(spa, spa->spa_normal_class, size,
21228654d025Sperrin 		    new_bp, 1, txg, old_bp, B_TRUE);
2123fa9e4066Sahrens 
2124fa9e4066Sahrens 	if (error == 0) {
212567bd71c6Sperrin 		BP_SET_LSIZE(new_bp, size);
212667bd71c6Sperrin 		BP_SET_PSIZE(new_bp, size);
212767bd71c6Sperrin 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
212867bd71c6Sperrin 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
212967bd71c6Sperrin 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
213067bd71c6Sperrin 		BP_SET_LEVEL(new_bp, 0);
213167bd71c6Sperrin 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
213267bd71c6Sperrin 		new_bp->blk_birth = txg;
2133fa9e4066Sahrens 	}
2134fa9e4066Sahrens 
2135ea8dc4b6Seschrock 	spa_config_exit(spa, FTAG);
2136fa9e4066Sahrens 
2137fa9e4066Sahrens 	return (error);
2138fa9e4066Sahrens }
2139fa9e4066Sahrens 
2140fa9e4066Sahrens /*
2141fa9e4066Sahrens  * Free an intent log block.  We know it can't be a gang block, so there's
2142fa9e4066Sahrens  * nothing to do except metaslab_free() it.
2143fa9e4066Sahrens  */
2144fa9e4066Sahrens void
2145fa9e4066Sahrens zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
2146fa9e4066Sahrens {
214744cd46caSbillm 	ASSERT(!BP_IS_GANG(bp));
2148fa9e4066Sahrens 
2149ea8dc4b6Seschrock 	spa_config_enter(spa, RW_READER, FTAG);
2150fa9e4066Sahrens 
2151d80c45e0Sbonwick 	metaslab_free(spa, bp, txg, B_FALSE);
2152fa9e4066Sahrens 
2153ea8dc4b6Seschrock 	spa_config_exit(spa, FTAG);
2154fa9e4066Sahrens }
215546341222Sperrin 
215646341222Sperrin /*
215746341222Sperrin  * start an async flush of the write cache for this vdev
215846341222Sperrin  */
215946341222Sperrin void
216046341222Sperrin zio_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
216146341222Sperrin {
216246341222Sperrin 	vdev_t *vd;
216346341222Sperrin 
216446341222Sperrin 	/*
216546341222Sperrin 	 * Lock out configuration changes.
216646341222Sperrin 	 */
216746341222Sperrin 	spa_config_enter(spa, RW_READER, FTAG);
216846341222Sperrin 
216946341222Sperrin 	if (*zio == NULL)
217046341222Sperrin 		*zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
217146341222Sperrin 
217246341222Sperrin 	vd = vdev_lookup_top(spa, vdev);
217346341222Sperrin 	ASSERT(vd);
217446341222Sperrin 
217546341222Sperrin 	(void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
217646341222Sperrin 	    NULL, NULL, ZIO_PRIORITY_NOW,
217746341222Sperrin 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
217846341222Sperrin 
217946341222Sperrin 	spa_config_exit(spa, FTAG);
218046341222Sperrin }
2181