xref: /illumos-gate/usr/src/uts/common/fs/zfs/zio.c (revision b24ab6762772a3f6a89393947930c7fa61306783)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22a3f829aeSBill Moore  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #include <sys/zfs_context.h>
27ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h>
28fa9e4066Sahrens #include <sys/spa.h>
29fa9e4066Sahrens #include <sys/txg.h>
30fa9e4066Sahrens #include <sys/spa_impl.h>
31fa9e4066Sahrens #include <sys/vdev_impl.h>
32fa9e4066Sahrens #include <sys/zio_impl.h>
33fa9e4066Sahrens #include <sys/zio_compress.h>
34fa9e4066Sahrens #include <sys/zio_checksum.h>
35*b24ab676SJeff Bonwick #include <sys/dmu_objset.h>
36*b24ab676SJeff Bonwick #include <sys/arc.h>
37*b24ab676SJeff Bonwick #include <sys/ddt.h>
38fa9e4066Sahrens 
39fa9e4066Sahrens /*
40fa9e4066Sahrens  * ==========================================================================
41fa9e4066Sahrens  * I/O priority table
42fa9e4066Sahrens  * ==========================================================================
43fa9e4066Sahrens  */
/*
 * Relative scheduling priority for each I/O class, indexed by
 * ZIO_PRIORITY_*; lower values are more urgent (0 = immediate).
 * NOTE(review): the precise scheduling semantics are defined by the
 * consumer of this table (presumably the vdev queue) -- confirm there.
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	4,	/* ZIO_PRIORITY_FREE		*/
	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
};
56fa9e4066Sahrens 
57fa9e4066Sahrens /*
58fa9e4066Sahrens  * ==========================================================================
59fa9e4066Sahrens  * I/O type descriptions
60fa9e4066Sahrens  * ==========================================================================
61fa9e4066Sahrens  */
/* Human-readable name for each zio_type_t value, indexed by type. */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };
64fa9e4066Sahrens 
65fa9e4066Sahrens /*
66fa9e4066Sahrens  * ==========================================================================
67fa9e4066Sahrens  * I/O kmem caches
68fa9e4066Sahrens  * ==========================================================================
69fa9e4066Sahrens  */
kmem_cache_t *zio_cache;	/* cache of zio_t structures */
kmem_cache_t *zio_link_cache;	/* cache of zio_link_t parent/child links */
/*
 * Per-size buffer caches, indexed by (size - 1) >> SPA_MINBLOCKSHIFT;
 * slots with no exact-fit cache alias the next larger one (see zio_init()).
 * zio_buf_cache is for metadata (appears in crashdumps); zio_data_buf_cache
 * is for file data (allocated from a separate arena in the kernel).
 */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;	/* arena backing the data-buffer caches */
#endif
78fa9e4066Sahrens 
/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

/*
 * Buffers larger than this get KMC_NODEBUG when their caches are created
 * in zio_init(), to keep kmem auditing overhead down; 0 disables kmem
 * debugging for all zio buffer caches.
 */
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
900a4e9518Sgw 
91fa9e4066Sahrens void
92fa9e4066Sahrens zio_init(void)
93fa9e4066Sahrens {
94fa9e4066Sahrens 	size_t c;
95ad23a2dbSjohansen 	vmem_t *data_alloc_arena = NULL;
96ad23a2dbSjohansen 
97ad23a2dbSjohansen #ifdef _KERNEL
98ad23a2dbSjohansen 	data_alloc_arena = zio_alloc_arena;
99ad23a2dbSjohansen #endif
100a3f829aeSBill Moore 	zio_cache = kmem_cache_create("zio_cache",
101a3f829aeSBill Moore 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
102a3f829aeSBill Moore 	zio_link_cache = kmem_cache_create("zio_link_cache",
103a3f829aeSBill Moore 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
104ccae0b50Seschrock 
105fa9e4066Sahrens 	/*
106fa9e4066Sahrens 	 * For small buffers, we want a cache for each multiple of
107fa9e4066Sahrens 	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
108fa9e4066Sahrens 	 * for each quarter-power of 2.  For large buffers, we want
109fa9e4066Sahrens 	 * a cache for each multiple of PAGESIZE.
110fa9e4066Sahrens 	 */
111fa9e4066Sahrens 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
112fa9e4066Sahrens 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
113fa9e4066Sahrens 		size_t p2 = size;
114fa9e4066Sahrens 		size_t align = 0;
115fa9e4066Sahrens 
116fa9e4066Sahrens 		while (p2 & (p2 - 1))
117fa9e4066Sahrens 			p2 &= p2 - 1;
118fa9e4066Sahrens 
119fa9e4066Sahrens 		if (size <= 4 * SPA_MINBLOCKSIZE) {
120fa9e4066Sahrens 			align = SPA_MINBLOCKSIZE;
121fa9e4066Sahrens 		} else if (P2PHASE(size, PAGESIZE) == 0) {
122fa9e4066Sahrens 			align = PAGESIZE;
123fa9e4066Sahrens 		} else if (P2PHASE(size, p2 >> 2) == 0) {
124fa9e4066Sahrens 			align = p2 >> 2;
125fa9e4066Sahrens 		}
126fa9e4066Sahrens 
127fa9e4066Sahrens 		if (align != 0) {
128ad23a2dbSjohansen 			char name[36];
1295ad82045Snd 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
130fa9e4066Sahrens 			zio_buf_cache[c] = kmem_cache_create(name, size,
131*b24ab676SJeff Bonwick 			    align, NULL, NULL, NULL, NULL, NULL,
132*b24ab676SJeff Bonwick 			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
133ad23a2dbSjohansen 
134ad23a2dbSjohansen 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
135ad23a2dbSjohansen 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
136ad23a2dbSjohansen 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
137*b24ab676SJeff Bonwick 			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
138fa9e4066Sahrens 		}
139fa9e4066Sahrens 	}
140fa9e4066Sahrens 
141fa9e4066Sahrens 	while (--c != 0) {
142fa9e4066Sahrens 		ASSERT(zio_buf_cache[c] != NULL);
143fa9e4066Sahrens 		if (zio_buf_cache[c - 1] == NULL)
144fa9e4066Sahrens 			zio_buf_cache[c - 1] = zio_buf_cache[c];
145ad23a2dbSjohansen 
146ad23a2dbSjohansen 		ASSERT(zio_data_buf_cache[c] != NULL);
147ad23a2dbSjohansen 		if (zio_data_buf_cache[c - 1] == NULL)
148ad23a2dbSjohansen 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
149fa9e4066Sahrens 	}
150ea8dc4b6Seschrock 
151ea8dc4b6Seschrock 	zio_inject_init();
152fa9e4066Sahrens }
153fa9e4066Sahrens 
154fa9e4066Sahrens void
155fa9e4066Sahrens zio_fini(void)
156fa9e4066Sahrens {
157fa9e4066Sahrens 	size_t c;
158fa9e4066Sahrens 	kmem_cache_t *last_cache = NULL;
159ad23a2dbSjohansen 	kmem_cache_t *last_data_cache = NULL;
160fa9e4066Sahrens 
161fa9e4066Sahrens 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
162fa9e4066Sahrens 		if (zio_buf_cache[c] != last_cache) {
163fa9e4066Sahrens 			last_cache = zio_buf_cache[c];
164fa9e4066Sahrens 			kmem_cache_destroy(zio_buf_cache[c]);
165fa9e4066Sahrens 		}
166fa9e4066Sahrens 		zio_buf_cache[c] = NULL;
167ad23a2dbSjohansen 
168ad23a2dbSjohansen 		if (zio_data_buf_cache[c] != last_data_cache) {
169ad23a2dbSjohansen 			last_data_cache = zio_data_buf_cache[c];
170ad23a2dbSjohansen 			kmem_cache_destroy(zio_data_buf_cache[c]);
171ad23a2dbSjohansen 		}
172ad23a2dbSjohansen 		zio_data_buf_cache[c] = NULL;
173fa9e4066Sahrens 	}
174ea8dc4b6Seschrock 
175a3f829aeSBill Moore 	kmem_cache_destroy(zio_link_cache);
176ccae0b50Seschrock 	kmem_cache_destroy(zio_cache);
177ccae0b50Seschrock 
178ea8dc4b6Seschrock 	zio_inject_fini();
179fa9e4066Sahrens }
180fa9e4066Sahrens 
181fa9e4066Sahrens /*
182fa9e4066Sahrens  * ==========================================================================
183fa9e4066Sahrens  * Allocate and free I/O buffers
184fa9e4066Sahrens  * ==========================================================================
185fa9e4066Sahrens  */
186ad23a2dbSjohansen 
187ad23a2dbSjohansen /*
188ad23a2dbSjohansen  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
189ad23a2dbSjohansen  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
190ad23a2dbSjohansen  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
191ad23a2dbSjohansen  * excess / transient data in-core during a crashdump.
192ad23a2dbSjohansen  */
193fa9e4066Sahrens void *
194fa9e4066Sahrens zio_buf_alloc(size_t size)
195fa9e4066Sahrens {
196fa9e4066Sahrens 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
197fa9e4066Sahrens 
198fa9e4066Sahrens 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
199fa9e4066Sahrens 
2001ab7f2deSmaybee 	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
201fa9e4066Sahrens }
202fa9e4066Sahrens 
203ad23a2dbSjohansen /*
204ad23a2dbSjohansen  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
205ad23a2dbSjohansen  * crashdump if the kernel panics.  This exists so that we will limit the amount
206ad23a2dbSjohansen  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
207ad23a2dbSjohansen  * of kernel heap dumped to disk when the kernel panics)
208ad23a2dbSjohansen  */
209ad23a2dbSjohansen void *
210ad23a2dbSjohansen zio_data_buf_alloc(size_t size)
211ad23a2dbSjohansen {
212ad23a2dbSjohansen 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
213ad23a2dbSjohansen 
214ad23a2dbSjohansen 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
215ad23a2dbSjohansen 
2161ab7f2deSmaybee 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
217ad23a2dbSjohansen }
218ad23a2dbSjohansen 
219fa9e4066Sahrens void
220fa9e4066Sahrens zio_buf_free(void *buf, size_t size)
221fa9e4066Sahrens {
222fa9e4066Sahrens 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
223fa9e4066Sahrens 
224fa9e4066Sahrens 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
225fa9e4066Sahrens 
226fa9e4066Sahrens 	kmem_cache_free(zio_buf_cache[c], buf);
227fa9e4066Sahrens }
228fa9e4066Sahrens 
229ad23a2dbSjohansen void
230ad23a2dbSjohansen zio_data_buf_free(void *buf, size_t size)
231ad23a2dbSjohansen {
232ad23a2dbSjohansen 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
233ad23a2dbSjohansen 
234ad23a2dbSjohansen 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
235ad23a2dbSjohansen 
236ad23a2dbSjohansen 	kmem_cache_free(zio_data_buf_cache[c], buf);
237ad23a2dbSjohansen }
238b3995adbSahrens 
239fa9e4066Sahrens /*
240fa9e4066Sahrens  * ==========================================================================
241fa9e4066Sahrens  * Push and pop I/O transform buffers
242fa9e4066Sahrens  * ==========================================================================
243fa9e4066Sahrens  */
244fa9e4066Sahrens static void
245e14bb325SJeff Bonwick zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
246e14bb325SJeff Bonwick 	zio_transform_func_t *transform)
247fa9e4066Sahrens {
248fa9e4066Sahrens 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
249fa9e4066Sahrens 
250e14bb325SJeff Bonwick 	zt->zt_orig_data = zio->io_data;
251e14bb325SJeff Bonwick 	zt->zt_orig_size = zio->io_size;
252fa9e4066Sahrens 	zt->zt_bufsize = bufsize;
253e14bb325SJeff Bonwick 	zt->zt_transform = transform;
254fa9e4066Sahrens 
255fa9e4066Sahrens 	zt->zt_next = zio->io_transform_stack;
256fa9e4066Sahrens 	zio->io_transform_stack = zt;
257fa9e4066Sahrens 
258fa9e4066Sahrens 	zio->io_data = data;
259fa9e4066Sahrens 	zio->io_size = size;
260fa9e4066Sahrens }
261fa9e4066Sahrens 
262fa9e4066Sahrens static void
263e14bb325SJeff Bonwick zio_pop_transforms(zio_t *zio)
264fa9e4066Sahrens {
265e14bb325SJeff Bonwick 	zio_transform_t *zt;
266e14bb325SJeff Bonwick 
267e14bb325SJeff Bonwick 	while ((zt = zio->io_transform_stack) != NULL) {
268e14bb325SJeff Bonwick 		if (zt->zt_transform != NULL)
269e14bb325SJeff Bonwick 			zt->zt_transform(zio,
270e14bb325SJeff Bonwick 			    zt->zt_orig_data, zt->zt_orig_size);
271fa9e4066Sahrens 
272*b24ab676SJeff Bonwick 		if (zt->zt_bufsize != 0)
273*b24ab676SJeff Bonwick 			zio_buf_free(zio->io_data, zt->zt_bufsize);
274fa9e4066Sahrens 
275e14bb325SJeff Bonwick 		zio->io_data = zt->zt_orig_data;
276e14bb325SJeff Bonwick 		zio->io_size = zt->zt_orig_size;
277e14bb325SJeff Bonwick 		zio->io_transform_stack = zt->zt_next;
278fa9e4066Sahrens 
279e14bb325SJeff Bonwick 		kmem_free(zt, sizeof (zio_transform_t));
280fa9e4066Sahrens 	}
281fa9e4066Sahrens }
282fa9e4066Sahrens 
283e14bb325SJeff Bonwick /*
284e14bb325SJeff Bonwick  * ==========================================================================
285e14bb325SJeff Bonwick  * I/O transform callbacks for subblocks and decompression
286e14bb325SJeff Bonwick  * ==========================================================================
287e14bb325SJeff Bonwick  */
288e14bb325SJeff Bonwick static void
289e14bb325SJeff Bonwick zio_subblock(zio_t *zio, void *data, uint64_t size)
290e14bb325SJeff Bonwick {
291e14bb325SJeff Bonwick 	ASSERT(zio->io_size > size);
292e14bb325SJeff Bonwick 
293e14bb325SJeff Bonwick 	if (zio->io_type == ZIO_TYPE_READ)
294e14bb325SJeff Bonwick 		bcopy(zio->io_data, data, size);
295e14bb325SJeff Bonwick }
296e14bb325SJeff Bonwick 
297e14bb325SJeff Bonwick static void
298e14bb325SJeff Bonwick zio_decompress(zio_t *zio, void *data, uint64_t size)
299e14bb325SJeff Bonwick {
300e14bb325SJeff Bonwick 	if (zio->io_error == 0 &&
301e14bb325SJeff Bonwick 	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
302*b24ab676SJeff Bonwick 	    zio->io_data, data, zio->io_size, size) != 0)
303e14bb325SJeff Bonwick 		zio->io_error = EIO;
304e14bb325SJeff Bonwick }
305e14bb325SJeff Bonwick 
306e14bb325SJeff Bonwick /*
307e14bb325SJeff Bonwick  * ==========================================================================
308e14bb325SJeff Bonwick  * I/O parent/child relationships and pipeline interlocks
309e14bb325SJeff Bonwick  * ==========================================================================
310e14bb325SJeff Bonwick  */
311a3f829aeSBill Moore /*
312a3f829aeSBill Moore  * NOTE - Callers to zio_walk_parents() and zio_walk_children must
313a3f829aeSBill Moore  *        continue calling these functions until they return NULL.
314a3f829aeSBill Moore  *        Otherwise, the next caller will pick up the list walk in
315a3f829aeSBill Moore  *        some indeterminate state.  (Otherwise every caller would
316a3f829aeSBill Moore  *        have to pass in a cookie to keep the state represented by
317a3f829aeSBill Moore  *        io_walk_link, which gets annoying.)
318a3f829aeSBill Moore  */
319a3f829aeSBill Moore zio_t *
320a3f829aeSBill Moore zio_walk_parents(zio_t *cio)
321a3f829aeSBill Moore {
322a3f829aeSBill Moore 	zio_link_t *zl = cio->io_walk_link;
323a3f829aeSBill Moore 	list_t *pl = &cio->io_parent_list;
324e14bb325SJeff Bonwick 
325a3f829aeSBill Moore 	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
326a3f829aeSBill Moore 	cio->io_walk_link = zl;
327a3f829aeSBill Moore 
328a3f829aeSBill Moore 	if (zl == NULL)
329a3f829aeSBill Moore 		return (NULL);
330a3f829aeSBill Moore 
331a3f829aeSBill Moore 	ASSERT(zl->zl_child == cio);
332a3f829aeSBill Moore 	return (zl->zl_parent);
333a3f829aeSBill Moore }
334a3f829aeSBill Moore 
335a3f829aeSBill Moore zio_t *
336a3f829aeSBill Moore zio_walk_children(zio_t *pio)
337a3f829aeSBill Moore {
338a3f829aeSBill Moore 	zio_link_t *zl = pio->io_walk_link;
339a3f829aeSBill Moore 	list_t *cl = &pio->io_child_list;
340a3f829aeSBill Moore 
341a3f829aeSBill Moore 	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
342a3f829aeSBill Moore 	pio->io_walk_link = zl;
343a3f829aeSBill Moore 
344a3f829aeSBill Moore 	if (zl == NULL)
345a3f829aeSBill Moore 		return (NULL);
346a3f829aeSBill Moore 
347a3f829aeSBill Moore 	ASSERT(zl->zl_parent == pio);
348a3f829aeSBill Moore 	return (zl->zl_child);
349a3f829aeSBill Moore }
350a3f829aeSBill Moore 
/*
 * Return cio's sole parent.  The second walk step both verifies that no
 * other parent exists and, by reaching NULL, leaves the walk cursor reset
 * for the next caller (see the note above zio_walk_parents()).
 */
zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}
359a3f829aeSBill Moore 
/*
 * Link cio as a child of pio, updating the parent's outstanding-children
 * accounting so the parent will wait for this child.
 */
void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	/* Lock order: child before parent (zio_remove_child() matches). */
	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	/* A parent that has already completed must not gain children. */
	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	/*
	 * Count this child in each wait class it has not yet passed
	 * through; these counts are what zio_wait_for_children() checks.
	 */
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}
393e14bb325SJeff Bonwick 
/*
 * Unlink the zl relationship between pio and cio and free the link.
 * The caller supplies zl (typically obtained while walking the lists).
 */
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	/* Lock order: child before parent, matching zio_add_child(). */
	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}
414e14bb325SJeff Bonwick 
/*
 * If zio still has outstanding children of the given type in the given
 * wait class, stall its pipeline until they complete.  Returns B_TRUE
 * if the zio must wait (the caller should stop executing stages).
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		/*
		 * Stages are one-hot bits, so >>= 1 backs the zio up to
		 * the previous stage; when the last child completes,
		 * zio_notify_parent() calls zio_execute() to advance the
		 * pipeline again, re-running the current stage.
		 */
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}
432fa9e4066Sahrens 
/*
 * Called when child zio completes wait class 'wait': fold the child's
 * error and reexecute state into parent pio, decrement the outstanding
 * count, and if the parent was stalled on exactly this count, restart
 * the parent's pipeline.
 */
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		/* Drop the lock before re-entering the parent's pipeline. */
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
452fa9e4066Sahrens 
453e14bb325SJeff Bonwick static void
454e14bb325SJeff Bonwick zio_inherit_child_errors(zio_t *zio, enum zio_child c)
455e14bb325SJeff Bonwick {
456e14bb325SJeff Bonwick 	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
457e14bb325SJeff Bonwick 		zio->io_error = zio->io_child_error[c];
458e14bb325SJeff Bonwick }
459e14bb325SJeff Bonwick 
460fa9e4066Sahrens /*
461fa9e4066Sahrens  * ==========================================================================
462e14bb325SJeff Bonwick  * Create the various types of I/O (read, write, free, etc)
463fa9e4066Sahrens  * ==========================================================================
464fa9e4066Sahrens  */
/*
 * Common constructor for all zio types.  Allocates and zeroes a zio_t,
 * classifies it (vdev/gang/ddt/logical child), snapshots the block
 * pointer, records the "orig" copies of data/flags/stage/pipeline used
 * for reexecution, and links it under pio if one is given.
 *
 * pio      - parent zio, or NULL for a root
 * txg      - transaction group the I/O belongs to (0 if none)
 * bp       - block pointer being read/written/freed, or NULL
 * vd/offset- target vdev and offset for physical I/O, NULL/0 otherwise
 * stage    - pipeline stage to start in; pipeline - stages to execute
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	/* Classify; ordering matters: vdev > gang > ddt > logical. */
	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		/*
		 * Non-writes (and DDT child writes) operate on the private
		 * copy, so the caller's bp may be freed while we run.
		 */
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	/* "orig" copies allow the pipeline to be rewound for reexecution. */
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	/* Stages we start at or beyond are already satisfied. */
	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		/* Inherit logical and gang-leader context from the parent. */
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}
545fa9e4066Sahrens 
/*
 * Release all resources held by a zio; inverse of zio_create().
 * The zio must already be unlinked from any parents/children.
 */
static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}
5550a4e9518Sgw 
556fa9e4066Sahrens zio_t *
557a3f829aeSBill Moore zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
558*b24ab676SJeff Bonwick     void *private, enum zio_flag flags)
559fa9e4066Sahrens {
560fa9e4066Sahrens 	zio_t *zio;
561fa9e4066Sahrens 
562fa9e4066Sahrens 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
563a3f829aeSBill Moore 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
564e14bb325SJeff Bonwick 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
565fa9e4066Sahrens 
566fa9e4066Sahrens 	return (zio);
567fa9e4066Sahrens }
568fa9e4066Sahrens 
569fa9e4066Sahrens zio_t *
570*b24ab676SJeff Bonwick zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
571fa9e4066Sahrens {
572a3f829aeSBill Moore 	return (zio_null(NULL, spa, NULL, done, private, flags));
573fa9e4066Sahrens }
574fa9e4066Sahrens 
575fa9e4066Sahrens zio_t *
576e14bb325SJeff Bonwick zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
577e14bb325SJeff Bonwick     void *data, uint64_t size, zio_done_func_t *done, void *private,
578*b24ab676SJeff Bonwick     int priority, enum zio_flag flags, const zbookmark_t *zb)
579fa9e4066Sahrens {
580fa9e4066Sahrens 	zio_t *zio;
581fa9e4066Sahrens 
582*b24ab676SJeff Bonwick 	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
583088f3894Sahrens 	    data, size, done, private,
584e14bb325SJeff Bonwick 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
585*b24ab676SJeff Bonwick 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
586*b24ab676SJeff Bonwick 	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
587fa9e4066Sahrens 
588fa9e4066Sahrens 	return (zio);
589fa9e4066Sahrens }
590fa9e4066Sahrens 
/*
 * Create a write zio for data of the given size, filling in bp.  The
 * write policy (checksum, compression, type, level, copies, dedup) comes
 * from zp; ready fires when the block is allocated, done on completion.
 */
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	/* Sanity-check every field of the write policy up front. */
	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    zp->zp_type < DMU_OT_NUMTYPES &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa) &&
	    zp->zp_dedup <= 1 &&
	    zp->zp_dedup_verify <= 1);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}
620fa9e4066Sahrens 
621fa9e4066Sahrens zio_t *
622e14bb325SJeff Bonwick zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
623e14bb325SJeff Bonwick     uint64_t size, zio_done_func_t *done, void *private, int priority,
624*b24ab676SJeff Bonwick     enum zio_flag flags, zbookmark_t *zb)
625fa9e4066Sahrens {
626fa9e4066Sahrens 	zio_t *zio;
627fa9e4066Sahrens 
628fa9e4066Sahrens 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
629e14bb325SJeff Bonwick 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
630e14bb325SJeff Bonwick 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
631fa9e4066Sahrens 
632fa9e4066Sahrens 	return (zio);
633fa9e4066Sahrens }
634fa9e4066Sahrens 
/*
 * Supply an override block pointer (and copies count) for a not-yet-issued
 * logical write in the syncing txg.  NOTE(review): presumably lets a
 * caller that already has a suitable bp (e.g. dmu_sync/dedup paths) steer
 * the write to it -- confirm against io_bp_override's consumers.
 */
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}
646*b24ab676SJeff Bonwick 
/*
 * Free a block by queueing it on the spa's deferred-free list for txg;
 * no zio is issued here (contrast with zio_free_sync()).
 */
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp);
}
652*b24ab676SJeff Bonwick 
/*
 * Create a zio that frees bp in the currently-syncing txg.  Only legal
 * while 'txg' is the syncing txg and before the deferred-free cutoff
 * sync pass (see ASSERTs).
 */
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);

	/* A free carries no data buffer and no done callback. */
	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	return (zio);
}
669fa9e4066Sahrens 
/*
 * Create a zio that claims (re-allocates in place) the specific block
 * bp, or merely verifies claimability when txg == 0.  Used by intent
 * log replay; see the detailed rationale below.
 */
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}
698fa9e4066Sahrens 
699fa9e4066Sahrens zio_t *
700fa9e4066Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
701*b24ab676SJeff Bonwick     zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
702fa9e4066Sahrens {
703fa9e4066Sahrens 	zio_t *zio;
704fa9e4066Sahrens 	int c;
705fa9e4066Sahrens 
706fa9e4066Sahrens 	if (vd->vdev_children == 0) {
707fa9e4066Sahrens 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
708e14bb325SJeff Bonwick 		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
709fa9e4066Sahrens 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
710fa9e4066Sahrens 
711fa9e4066Sahrens 		zio->io_cmd = cmd;
712fa9e4066Sahrens 	} else {
713a3f829aeSBill Moore 		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
714fa9e4066Sahrens 
715fa9e4066Sahrens 		for (c = 0; c < vd->vdev_children; c++)
716fa9e4066Sahrens 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
717fa9e4066Sahrens 			    done, private, priority, flags));
718fa9e4066Sahrens 	}
719fa9e4066Sahrens 
720fa9e4066Sahrens 	return (zio);
721fa9e4066Sahrens }
722fa9e4066Sahrens 
/*
 * Create a physical read of a raw vdev region with no block pointer,
 * e.g. for vdev labels.  'checksum' selects the checksum the read-phys
 * pipeline verifies.  When 'labels' is set, the region must lie
 * entirely within the front or back label area.
 */
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);		/* leaf vdevs only */
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}
743fa9e4066Sahrens 
/*
 * Create a physical write of a raw vdev region with no block pointer,
 * mirroring zio_read_phys() above.  If the chosen checksum embeds its
 * verifier in the data buffer (ci_zbt), a private copy of the data is
 * made so the caller's buffer is never modified.
 */
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);		/* leaf vdevs only */
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}
776fa9e4066Sahrens 
/*
 * Create a child I/O to do some work for us: a vdev-level child of pio
 * targeting 'vd' at 'offset'.  The child starts its pipeline just below
 * ZIO_STAGE_VDEV_IO_START so the vdev I/O stages execute next.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
	void *data, uint64_t size, int type, int priority, enum zio_flag flags,
	zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	/* vd must be an immediate child of pio's vdev (or of the root). */
	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* On a leaf, skip past the front label region of the device. */
	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	/* >> 1: start one stage bit before VDEV_IO_START (see zio_execute). */
	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}
82032b87932Sek 
821e14bb325SJeff Bonwick zio_t *
822e14bb325SJeff Bonwick zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
823*b24ab676SJeff Bonwick 	int type, int priority, enum zio_flag flags,
824*b24ab676SJeff Bonwick 	zio_done_func_t *done, void *private)
825fa9e4066Sahrens {
826e14bb325SJeff Bonwick 	zio_t *zio;
827fa9e4066Sahrens 
828e14bb325SJeff Bonwick 	ASSERT(vd->vdev_ops->vdev_op_leaf);
829fa9e4066Sahrens 
830e14bb325SJeff Bonwick 	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
831e14bb325SJeff Bonwick 	    data, size, done, private, type, priority,
832e14bb325SJeff Bonwick 	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
833e14bb325SJeff Bonwick 	    vd, offset, NULL,
834*b24ab676SJeff Bonwick 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
835fa9e4066Sahrens 
836e14bb325SJeff Bonwick 	return (zio);
837e05725b1Sbonwick }
838e05725b1Sbonwick 
/*
 * Asynchronously issue a write-cache flush ioctl to vd (and, via
 * zio_ioctl(), to all of its leaves).  Failure is tolerated: the flush
 * may fail, is not retried, and errors are not propagated to 'zio'.
 */
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
846fa9e4066Sahrens 
847fa9e4066Sahrens /*
848fa9e4066Sahrens  * ==========================================================================
849e14bb325SJeff Bonwick  * Prepare to read and write logical blocks
850fa9e4066Sahrens  * ==========================================================================
851fa9e4066Sahrens  */
852e14bb325SJeff Bonwick 
/*
 * Pipeline stage: prepare a logical read based on its block pointer --
 * install a decompression transform if needed, mark non-metadata reads
 * uncacheable, and route dedup'd blocks through the DDT read pipeline.
 */
static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/*
	 * Compressed logical reads (unless ZIO_FLAG_RAW) get a transform
	 * buffer of physical size; zio_decompress runs on completion.
	 */
	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	/* Level-0 non-metadata blocks are not worth caching. */
	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	/* Dedup'd logical reads go through the DDT read pipeline. */
	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
875fa9e4066Sahrens 
/*
 * Pipeline stage: prepare an allocating write -- honor any bp override,
 * apply compression, decide between rewrite/allocate/DDT pipelines, and
 * fill in the block pointer's logical properties.
 */
static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	/*
	 * A bp override (zio_write_override()) short-circuits allocation:
	 * adopt the supplied bp and collapse to the interlock pipeline,
	 * unless dedup demands a checksum the override can't provide.
	 */
	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		/* Checksum mismatch: discard the override and start fresh. */
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass > SYNC_PASS_DONT_COMPRESS)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	/*
	 * Try to compress; keep the result only if it actually shrank
	 * the data (psize 0 means incompressible by convention).
	 */
	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass > SYNC_PASS_REWRITE) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		/* Nothing to write: a hole needs only the interlock stages. */
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		/* Record the block's logical properties in the bp. */
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
998*b24ab676SJeff Bonwick 
/*
 * Pipeline stage: prepare a logical free -- dedup'd blocks go through
 * the DDT free pipeline; otherwise evict the block from the ARC.
 */
static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
		else
			arc_free(zio->io_spa, bp);
	}

	/* Fault injection hook for frees in the currently-syncing txg. */
	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
1017fa9e4066Sahrens 
1018e14bb325SJeff Bonwick /*
1019e14bb325SJeff Bonwick  * ==========================================================================
1020e14bb325SJeff Bonwick  * Execute the I/O pipeline
1021e14bb325SJeff Bonwick  * ==========================================================================
1022e14bb325SJeff Bonwick  */
1023e14bb325SJeff Bonwick 
/*
 * Hand zio off to the spa's taskq for (io_type, q) so the pipeline
 * continues via zio_execute() in taskq context.
 */
static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
{
	zio_type_t t = zio->io_type;

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	(void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
}
10460a4e9518Sgw 
1047e14bb325SJeff Bonwick static boolean_t
1048e14bb325SJeff Bonwick zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
1049e14bb325SJeff Bonwick {
1050e14bb325SJeff Bonwick 	kthread_t *executor = zio->io_executor;
1051e14bb325SJeff Bonwick 	spa_t *spa = zio->io_spa;
10520a4e9518Sgw 
1053e14bb325SJeff Bonwick 	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
1054e14bb325SJeff Bonwick 		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
1055e14bb325SJeff Bonwick 			return (B_TRUE);
10560a4e9518Sgw 
1057e14bb325SJeff Bonwick 	return (B_FALSE);
1058e14bb325SJeff Bonwick }
1059e05725b1Sbonwick 
/*
 * Pipeline stage: hand the zio off to its issue taskq.  Processing
 * stops in this thread and resumes asynchronously in taskq context.
 */
static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);

	return (ZIO_PIPELINE_STOP);
}
10670a4e9518Sgw 
/*
 * Continue pipeline processing of zio on its interrupt taskq.
 */
void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
}
10730a4e9518Sgw 
1074e14bb325SJeff Bonwick /*
1075e14bb325SJeff Bonwick  * Execute the I/O pipeline until one of the following occurs:
1076e14bb325SJeff Bonwick  * (1) the I/O completes; (2) the pipeline stalls waiting for
1077e14bb325SJeff Bonwick  * dependent child I/Os; (3) the I/O issues, so we're waiting
1078e14bb325SJeff Bonwick  * for an I/O completion interrupt; (4) the I/O is delegated by
1079e14bb325SJeff Bonwick  * vdev-level caching or aggregation; (5) the I/O is deferred
1080e14bb325SJeff Bonwick  * due to vdev-level queueing; (6) the I/O is handed off to
1081e14bb325SJeff Bonwick  * another thread.  In all cases, the pipeline stops whenever
1082e14bb325SJeff Bonwick  * there's no CPU work; it never burns a thread in cv_wait().
1083e14bb325SJeff Bonwick  *
1084e14bb325SJeff Bonwick  * There's no locking on io_stage because there's no legitimate way
1085e14bb325SJeff Bonwick  * for multiple threads to be attempting to process the same I/O.
1086e14bb325SJeff Bonwick  */
1087*b24ab676SJeff Bonwick static zio_pipe_stage_t *zio_pipeline[];
10880a4e9518Sgw 
/*
 * Drive zio through its pipeline until it completes, stalls on
 * children, or is handed off to another thread (see the block comment
 * above).  Each stage is a power-of-two bit; io_pipeline is the mask
 * of stages this zio runs.
 */
void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		/* Advance to the next stage bit set in the pipeline mask. */
		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
			return;
		}

		zio->io_stage = stage;
		/* highbit(stage) - 1 indexes the stage's handler. */
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
11300a4e9518Sgw 
1131e14bb325SJeff Bonwick /*
1132e14bb325SJeff Bonwick  * ==========================================================================
1133e14bb325SJeff Bonwick  * Initiate I/O, either sync or async
1134e14bb325SJeff Bonwick  * ==========================================================================
1135e14bb325SJeff Bonwick  */
/*
 * Execute zio synchronously: run its pipeline, block until it
 * completes (io_executor cleared, signalled on io_cv), then destroy
 * it and return its error.  The zio must not have been started yet.
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	/* The zio is destroyed here; capture the error first. */
	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}
115832b87932Sek 
/*
 * Execute zio asynchronously; no caller waits on it.  Parentless
 * logical zios are adopted by the spa's "Godfather" root zio so they
 * are guaranteed to finish before the pool unloads.
 */
void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}
1178ea8dc4b6Seschrock 
1179e14bb325SJeff Bonwick /*
1180e14bb325SJeff Bonwick  * ==========================================================================
1181e14bb325SJeff Bonwick  * Reexecute or suspend/resume failed I/O
1182e14bb325SJeff Bonwick  * ==========================================================================
1183e14bb325SJeff Bonwick  */
1184fa9e4066Sahrens 
/*
 * Reset a previously-failed logical zio (and, recursively, all of its
 * children) back to its original flags/stage/pipeline and run it again.
 * "Godfather" zios are reset but not re-executed here; their waiter
 * drives them.
 */
static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	/* Restore the zio's pre-execution state and clear all errors. */
	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	/* An allocating zio must start from a cleared block pointer. */
	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}
12320a4e9518Sgw 
/*
 * Suspend pool I/O after an uncorrectable failure: post an FMA
 * ereport (or panic, per the pool's failmode property), mark the spa
 * suspended, and park the failed zio (if any) under a suspend root
 * zio for later reexecution by zio_resume().
 */
void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	/* Lazily create the suspend root ("Godfather") zio. */
	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}
1263fa9e4066Sahrens 
/*
 * Resume a suspended pool: clear the suspended state, wake waiters on
 * spa_suspend_cv, then reexecute everything parked under the suspend
 * root zio and return its resulting error (0 if nothing was parked).
 */
int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}
1285e14bb325SJeff Bonwick 
/*
 * Block the caller until the pool is no longer suspended;
 * zio_resume() broadcasts on spa_suspend_cv to wake us.
 */
void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}
1294fa9e4066Sahrens 
1295fa9e4066Sahrens /*
1296fa9e4066Sahrens  * ==========================================================================
1297e14bb325SJeff Bonwick  * Gang blocks.
1298e14bb325SJeff Bonwick  *
1299e14bb325SJeff Bonwick  * A gang block is a collection of small blocks that looks to the DMU
1300e14bb325SJeff Bonwick  * like one large block.  When zio_dva_allocate() cannot find a block
1301e14bb325SJeff Bonwick  * of the requested size, due to either severe fragmentation or the pool
1302e14bb325SJeff Bonwick  * being nearly full, it calls zio_write_gang_block() to construct the
1303e14bb325SJeff Bonwick  * block from smaller fragments.
1304e14bb325SJeff Bonwick  *
1305e14bb325SJeff Bonwick  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1306e14bb325SJeff Bonwick  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1307e14bb325SJeff Bonwick  * an indirect block: it's an array of block pointers.  It consumes
1308e14bb325SJeff Bonwick  * only one sector and hence is allocatable regardless of fragmentation.
1309e14bb325SJeff Bonwick  * The gang header's bps point to its gang members, which hold the data.
1310e14bb325SJeff Bonwick  *
1311e14bb325SJeff Bonwick  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1312e14bb325SJeff Bonwick  * as the verifier to ensure uniqueness of the SHA256 checksum.
1313e14bb325SJeff Bonwick  * Critically, the gang block bp's blk_cksum is the checksum of the data,
1314e14bb325SJeff Bonwick  * not the gang header.  This ensures that data block signatures (needed for
1315e14bb325SJeff Bonwick  * deduplication) are independent of how the block is physically stored.
1316e14bb325SJeff Bonwick  *
1317e14bb325SJeff Bonwick  * Gang blocks can be nested: a gang member may itself be a gang block.
1318e14bb325SJeff Bonwick  * Thus every gang block is a tree in which root and all interior nodes are
1319e14bb325SJeff Bonwick  * gang headers, and the leaves are normal blocks that contain user data.
1320e14bb325SJeff Bonwick  * The root of the gang tree is called the gang leader.
1321e14bb325SJeff Bonwick  *
1322e14bb325SJeff Bonwick  * To perform any operation (read, rewrite, free, claim) on a gang block,
1323e14bb325SJeff Bonwick  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1324e14bb325SJeff Bonwick  * in the io_gang_tree field of the original logical i/o by recursively
1325e14bb325SJeff Bonwick  * reading the gang leader and all gang headers below it.  This yields
1326e14bb325SJeff Bonwick  * an in-core tree containing the contents of every gang header and the
1327e14bb325SJeff Bonwick  * bps for every constituent of the gang block.
1328e14bb325SJeff Bonwick  *
1329e14bb325SJeff Bonwick  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1330e14bb325SJeff Bonwick  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1331e14bb325SJeff Bonwick  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1332e14bb325SJeff Bonwick  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1333e14bb325SJeff Bonwick  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1334e14bb325SJeff Bonwick  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1335e14bb325SJeff Bonwick  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1336e14bb325SJeff Bonwick  * of the gang header plus zio_checksum_compute() of the data to update the
1337e14bb325SJeff Bonwick  * gang header's blk_cksum as described above.
1338e14bb325SJeff Bonwick  *
1339e14bb325SJeff Bonwick  * The two-phase assemble/issue model solves the problem of partial failure --
1340e14bb325SJeff Bonwick  * what if you'd freed part of a gang block but then couldn't read the
1341e14bb325SJeff Bonwick  * gang header for another part?  Assembling the entire gang tree first
1342e14bb325SJeff Bonwick  * ensures that all the necessary gang header I/O has succeeded before
1343e14bb325SJeff Bonwick  * starting the actual work of free, claim, or write.  Once the gang tree
1344e14bb325SJeff Bonwick  * is assembled, free and claim are in-memory operations that cannot fail.
1345e14bb325SJeff Bonwick  *
1346e14bb325SJeff Bonwick  * In the event that a gang write fails, zio_dva_unallocate() walks the
1347e14bb325SJeff Bonwick  * gang tree to immediately free (i.e. insert back into the space map)
1348e14bb325SJeff Bonwick  * everything we've allocated.  This ensures that we don't get ENOSPC
1349e14bb325SJeff Bonwick  * errors during repeated suspend/resume cycles due to a flaky device.
1350e14bb325SJeff Bonwick  *
1351e14bb325SJeff Bonwick  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1352e14bb325SJeff Bonwick  * the gang tree, we won't modify the block, so we can safely defer the free
1353e14bb325SJeff Bonwick  * (knowing that the block is still intact).  If we *can* assemble the gang
1354e14bb325SJeff Bonwick  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1355e14bb325SJeff Bonwick  * each constituent bp and we can allocate a new block on the next sync pass.
1356e14bb325SJeff Bonwick  *
1357e14bb325SJeff Bonwick  * In all cases, the gang tree allows complete recovery from partial failure.
1358fa9e4066Sahrens  * ==========================================================================
1359fa9e4066Sahrens  */
1360e14bb325SJeff Bonwick 
1361e14bb325SJeff Bonwick static zio_t *
1362e14bb325SJeff Bonwick zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1363fa9e4066Sahrens {
1364e14bb325SJeff Bonwick 	if (gn != NULL)
1365e14bb325SJeff Bonwick 		return (pio);
1366fa9e4066Sahrens 
1367e14bb325SJeff Bonwick 	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1368e14bb325SJeff Bonwick 	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1369e14bb325SJeff Bonwick 	    &pio->io_bookmark));
1370e14bb325SJeff Bonwick }
1371e14bb325SJeff Bonwick 
/*
 * Gang-tree issue callback for rewrites.  For a gang header (gn != NULL)
 * we rewrite the header itself and hand-compute the data checksum; for a
 * data leaf we simply rewrite the data in place.
 */
zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		/* Data leaf: rewrite the user data at this bp. */
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}
1408fa9e4066Sahrens 
1409e14bb325SJeff Bonwick /* ARGSUSED */
1410e14bb325SJeff Bonwick zio_t *
1411e14bb325SJeff Bonwick zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1412e14bb325SJeff Bonwick {
1413*b24ab676SJeff Bonwick 	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1414*b24ab676SJeff Bonwick 	    ZIO_GANG_CHILD_FLAGS(pio)));
1415fa9e4066Sahrens }
1416fa9e4066Sahrens 
1417e14bb325SJeff Bonwick /* ARGSUSED */
1418e14bb325SJeff Bonwick zio_t *
1419e14bb325SJeff Bonwick zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1420fa9e4066Sahrens {
1421e14bb325SJeff Bonwick 	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1422e14bb325SJeff Bonwick 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1423e14bb325SJeff Bonwick }
1424fa9e4066Sahrens 
/*
 * Per-I/O-type callbacks invoked by zio_gang_tree_issue() on each bp of
 * an assembled gang tree, indexed by zio type.  NULL entries are types
 * that never operate on gang blocks.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);
1435fa9e4066Sahrens 
1436e14bb325SJeff Bonwick static zio_gang_node_t *
1437e14bb325SJeff Bonwick zio_gang_node_alloc(zio_gang_node_t **gnpp)
1438e14bb325SJeff Bonwick {
1439e14bb325SJeff Bonwick 	zio_gang_node_t *gn;
1440fa9e4066Sahrens 
1441e14bb325SJeff Bonwick 	ASSERT(*gnpp == NULL);
1442fa9e4066Sahrens 
1443e14bb325SJeff Bonwick 	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1444e14bb325SJeff Bonwick 	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1445e14bb325SJeff Bonwick 	*gnpp = gn;
1446e14bb325SJeff Bonwick 
1447e14bb325SJeff Bonwick 	return (gn);
1448fa9e4066Sahrens }
1449fa9e4066Sahrens 
1450fa9e4066Sahrens static void
1451e14bb325SJeff Bonwick zio_gang_node_free(zio_gang_node_t **gnpp)
1452fa9e4066Sahrens {
1453e14bb325SJeff Bonwick 	zio_gang_node_t *gn = *gnpp;
1454fa9e4066Sahrens 
1455e14bb325SJeff Bonwick 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1456e14bb325SJeff Bonwick 		ASSERT(gn->gn_child[g] == NULL);
1457e14bb325SJeff Bonwick 
1458e14bb325SJeff Bonwick 	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1459e14bb325SJeff Bonwick 	kmem_free(gn, sizeof (*gn));
1460e14bb325SJeff Bonwick 	*gnpp = NULL;
1461fa9e4066Sahrens }
1462fa9e4066Sahrens 
1463e14bb325SJeff Bonwick static void
1464e14bb325SJeff Bonwick zio_gang_tree_free(zio_gang_node_t **gnpp)
1465fa9e4066Sahrens {
1466e14bb325SJeff Bonwick 	zio_gang_node_t *gn = *gnpp;
1467fa9e4066Sahrens 
1468e14bb325SJeff Bonwick 	if (gn == NULL)
1469e14bb325SJeff Bonwick 		return;
1470fa9e4066Sahrens 
1471e14bb325SJeff Bonwick 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1472e14bb325SJeff Bonwick 		zio_gang_tree_free(&gn->gn_child[g]);
1473fa9e4066Sahrens 
1474e14bb325SJeff Bonwick 	zio_gang_node_free(gnpp);
1475fa9e4066Sahrens }
1476fa9e4066Sahrens 
/*
 * Begin assembly of (a subtree of) the gang tree for gang leader 'gio':
 * allocate a node at *gnpp and issue an async read of the gang header
 * at 'bp' into it.  zio_gang_tree_assemble_done() recurses into any
 * nested gang blocks that header points to.
 */
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
1489fa9e4066Sahrens 
/*
 * Completion callback for the gang header reads issued by
 * zio_gang_tree_assemble(): byteswap the header if needed and kick off
 * assembly of any nested gang blocks it points to.
 */
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	/*
	 * On a failed header read there is nothing to recurse into;
	 * zio_gang_issue() checks io_child_error[ZIO_CHILD_GANG] and
	 * tears down whatever was assembled.
	 */
	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);

	/* Recurse into any gang children of this header. */
	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}
1517fa9e4066Sahrens 
/*
 * Walk an assembled gang tree, invoking the per-type issue callback on
 * every bp.  'data' tracks the caller's buffer position: it advances by
 * each leaf's psize, so that after the walk the leaves exactly tile the
 * gang leader's buffer.
 */
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);

		/* Recurse on each non-hole child, advancing through 'data'. */
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	/* At the root, the walk must have consumed the entire buffer. */
	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}
1552fa9e4066Sahrens 
/*
 * Pipeline stage: start gang tree assembly for a gang block, making
 * this zio the gang leader.  The header reads proceed asynchronously;
 * zio_gang_issue() waits for them before walking the tree.
 */
static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}
1567fa9e4066Sahrens 
/*
 * Pipeline stage: once all gang children (the assembly reads) are done,
 * either issue the per-bp work over the assembled tree or, if assembly
 * failed, free the partial tree.  Either way, collapse this zio's
 * pipeline to the interlock stages so it just waits for its children.
 */
static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/* Reenter later if the assembly reads are still outstanding. */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
1588fa9e4066Sahrens 
/*
 * Ready callback for gang member writes: fold each member's allocated
 * asize into the corresponding DVA of the gang parent's bp, so the
 * parent's asize reflects the total space consumed by the gang.
 * Accumulation is serialized on the parent's io_lock since all members
 * share the same parent bp.
 */
static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	/* Nothing was allocated for this member; nothing to accumulate. */
	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}
1618fa9e4066Sahrens 
/*
 * Turn pio's write into a gang write: allocate a one-sector gang header
 * and split pio's data across up to SPA_GBH_NBLKPTRS member writes.
 * If allocation of even the header fails, the error is recorded on pio
 * and the pipeline continues into normal error processing.
 */
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;	/* bytes not yet assigned to a member */
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	/* The header gets one copy more than the data, up to the pool max. */
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Hang the new node off the leader's tree root if we *are* the
	 * leader, otherwise off the slot our parent reserved for us
	 * (passed via io_private of a gang member write).
	 */
	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		/*
		 * Spread the residual evenly over the remaining bp slots,
		 * rounding each member up to a multiple of SPA_MINBLOCKSIZE.
		 */
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		/* Members inherit the leader's checksum and copies;
		 * no compression, no dedup, anonymous type. */
		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = 0;
		zp.zp_dedup_verify = 0;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
1693fa9e4066Sahrens 
1694fa9e4066Sahrens /*
1695fa9e4066Sahrens  * ==========================================================================
1696*b24ab676SJeff Bonwick  * Dedup
1697fa9e4066Sahrens  * ==========================================================================
1698fa9e4066Sahrens  */
/*
 * Completion callback for the per-copy repair reads issued by
 * zio_ddt_read_start().  Under the parent's io_lock, the first copy
 * that reads successfully becomes dde_repair_data; any other buffer is
 * freed.  A successfully read copy also has its ddt phys cleared,
 * marking it as not needing repair.
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}
1717*b24ab676SJeff Bonwick 
/*
 * Pipeline stage: start a read of a dedup'd block.  The common case is
 * a plain child read of 'bp'.  If a previous pass through this stage
 * failed (io_child_error[ZIO_CHILD_DDT] set), instead begin a repair:
 * read every other on-disk copy recorded in the DDT entry, so that
 * zio_ddt_read_done() can substitute a good copy.
 */
static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		/* Stash the repair entry for zio_ddt_read_done(). */
		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		/* No phys matches our bp; nothing to repair from. */
		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		/* Read each alternate copy (skip the one that failed). */
		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt, &dde->dde_key, ddp, &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	/* Normal case: read the block itself as a DDT child. */
	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}
1759e14bb325SJeff Bonwick 
/*
 * Pipeline stage: complete a dedup read.  If the child read failed,
 * either rewind the pipeline to zio_ddt_read_start() to begin a repair
 * (first failure, no repair entry yet) or, if a repair was in flight,
 * copy a successfully read alternate into our buffer and clear the
 * child error.
 */
static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			/* No DDT yet; only possible while the pool loads. */
			ASSERT(zio->io_spa->spa_load_state != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			/*
			 * First failure: rewind the stage so zio_execute()
			 * reenters zio_ddt_read_start() in repair mode.
			 */
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			/* A good alternate copy was found; use it. */
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
1796*b24ab676SJeff Bonwick 
/*
 * Decide whether 'zio' truly matches the data behind DDT entry 'dde',
 * or whether this is a checksum collision.  Returns B_TRUE on mismatch.
 * We compare against an in-flight lead zio if one exists, else read an
 * existing on-disk copy via the ARC (dropping the ddt lock around the
 * read and reacquiring it afterwards).
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			uint32_t aflags = ARC_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			/* Drop the ddt lock for the duration of the read. */
			ddt_exit(ddt);

			error = arc_read_nolock(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = EEXIST;
				VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
			}

			ddt_enter(ddt);
			/* Any error (read failure or mismatch) => collision. */
			return (error != 0);
		}
	}

	return (B_FALSE);
}
1851*b24ab676SJeff Bonwick 
/*
 * Ready callback for the leading dedup child write: record the newly
 * written bp in the DDT entry's phys for this copies class, then fill
 * every waiting parent's bp from it.  Runs under the ddt lock.
 */
static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;	/* phys slot == copies class */
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	/* On error there is nothing valid to record. */
	if (zio->io_error)
		return;

	ddt_enter(ddt);

	ASSERT(dde->dde_lead_zio[p] == zio);

	ddt_phys_fill(ddp, zio->io_bp);

	/* Point every parent's bp at the block we just wrote. */
	while ((pio = zio_walk_parents(zio)) != NULL)
		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

	ddt_exit(ddt);
}
1875*b24ab676SJeff Bonwick 
/*
 * Done callback for the dedup child write: relinquish leadership of the
 * DDT entry and, on success, take one reference on the phys entry per
 * parent zio; on failure, clear the phys entry so it cannot be reused.
 */
static void
zio_ddt_child_write_done(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;	/* phys slot is indexed by # copies */
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;	/* no longer the lead write */

	if (zio->io_error == 0) {
		/* One reference for every parent that shares this block. */
		while (zio_walk_parents(zio) != NULL)
			ddt_phys_addref(ddp);
	} else {
		/* Write failed: invalidate the phys entry we were filling. */
		ddt_phys_clear(ddp);
	}

	ddt_exit(ddt);
}
1899*b24ab676SJeff Bonwick 
/*
 * Done callback for the ditto write issued by zio_ddt_write() when an
 * entry needs more physical copies.  On success, free the old ditto
 * phys (if any) and record the new block in the DDT_PHYS_DITTO slot.
 */
static void
zio_ddt_ditto_write_done(zio_t *zio)
{
	int p = DDT_PHYS_DITTO;
	zio_prop_t *zp = &zio->io_prop;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(zio->io_spa, bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	ddt_key_t *ddk = &dde->dde_key;

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;	/* no longer the lead ditto write */

	if (zio->io_error == 0) {
		/* The written block must still match the DDT key. */
		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
		/* Release the previous ditto block before replacing it. */
		if (ddp->ddp_phys_birth != 0)
			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
		ddt_phys_fill(ddp, bp);
	}

	ddt_exit(ddt);
}
1928*b24ab676SJeff Bonwick 
/*
 * Pipeline stage for a dedup'd write.  Under the DDT lock, look up (or
 * create) the table entry for this block's checksum and either:
 *   - restart the pipeline as an ordinary write if dedup-verify detects
 *     a collision;
 *   - reuse an existing on-disk copy (or piggyback on an in-flight lead
 *     write) by taking a reference;
 *   - or become the lead write ourselves and issue a child i/o.
 * May additionally issue a "ditto" child write when the entry's
 * reference count calls for more physical copies.
 */
static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;		/* phys slot is indexed by # copies */
	int ditto_copies;
	zio_t *cio = NULL;		/* child write, if we become lead */
	zio_t *dio = NULL;		/* ditto write, if more copies needed */
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

	ddt_enter(ddt);
	dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = &dde->dde_phys[p];

	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
		/*
		 * If we're using a weak checksum, upgrade to a strong checksum
		 * and try again.  If we're already using a strong checksum,
		 * we can't resolve it, so just convert to an ordinary write.
		 * (And automatically e-mail a paper to Nature?)
		 */
		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
			zp->zp_checksum = spa_dedup_checksum(spa);
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			BP_ZERO(bp);
		} else {
			zp->zp_dedup = 0;
		}
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
		ddt_exit(ddt);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
	ASSERT(ditto_copies < SPA_DVAS_PER_BP);

	/*
	 * If this entry needs more physical copies than it currently has
	 * and no one else is already writing them, issue a ditto write.
	 */
	if (ditto_copies > ddt_ditto_copies_present(dde) &&
	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
		zio_prop_t czp = *zp;

		czp.zp_copies = ditto_copies;

		/*
		 * If we arrived here with an override bp, we won't have run
		 * the transform stack, so we won't have the data we need to
		 * generate a child i/o.  So, toss the override bp and restart.
		 * This is safe, because using the override bp is just an
		 * optimization; and it's rare, so the cost doesn't matter.
		 */
		if (zio->io_bp_override) {
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			zio->io_bp_override = NULL;
			BP_ZERO(bp);
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, &czp, NULL,
		    zio_ddt_ditto_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
	}

	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
		/*
		 * The block already exists on disk, or someone else is
		 * already writing it: share it rather than write again.
		 */
		if (ddp->ddp_phys_birth != 0)
			ddt_bp_fill(ddp, bp, txg);
		if (dde->dde_lead_zio[p] != NULL)
			zio_add_child(zio, dde->dde_lead_zio[p]);
		else
			ddt_phys_addref(ddp);
	} else if (zio->io_bp_override) {
		/* The override bp already describes the written block. */
		ASSERT(bp->blk_birth == txg);
		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
		ddt_phys_fill(ddp, bp);
		ddt_phys_addref(ddp);
	} else {
		/* First writer: become the lead and issue the child write. */
		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
		    zio_ddt_child_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[p] = cio;
	}

	ddt_exit(ddt);

	/* Issue the child i/os only after dropping the DDT lock. */
	if (cio)
		zio_nowait(cio);
	if (dio)
		zio_nowait(dio);

	return (ZIO_PIPELINE_CONTINUE);
}
2038*b24ab676SJeff Bonwick 
2039*b24ab676SJeff Bonwick static int
2040*b24ab676SJeff Bonwick zio_ddt_free(zio_t *zio)
2041*b24ab676SJeff Bonwick {
2042*b24ab676SJeff Bonwick 	spa_t *spa = zio->io_spa;
2043*b24ab676SJeff Bonwick 	blkptr_t *bp = zio->io_bp;
2044*b24ab676SJeff Bonwick 	ddt_t *ddt = ddt_select(spa, bp);
2045*b24ab676SJeff Bonwick 	ddt_entry_t *dde;
2046*b24ab676SJeff Bonwick 	ddt_phys_t *ddp;
2047*b24ab676SJeff Bonwick 
2048*b24ab676SJeff Bonwick 	ASSERT(BP_GET_DEDUP(bp));
2049*b24ab676SJeff Bonwick 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2050*b24ab676SJeff Bonwick 
2051*b24ab676SJeff Bonwick 	ddt_enter(ddt);
2052*b24ab676SJeff Bonwick 	dde = ddt_lookup(ddt, bp, B_TRUE);
2053*b24ab676SJeff Bonwick 	ddp = ddt_phys_select(dde, bp);
2054*b24ab676SJeff Bonwick 	ddt_phys_decref(ddp);
2055*b24ab676SJeff Bonwick 	ddt_exit(ddt);
2056*b24ab676SJeff Bonwick 
2057*b24ab676SJeff Bonwick 	return (ZIO_PIPELINE_CONTINUE);
2058*b24ab676SJeff Bonwick }
2059*b24ab676SJeff Bonwick 
2060*b24ab676SJeff Bonwick /*
2061*b24ab676SJeff Bonwick  * ==========================================================================
2062*b24ab676SJeff Bonwick  * Allocate and free blocks
2063*b24ab676SJeff Bonwick  * ==========================================================================
2064*b24ab676SJeff Bonwick  */
2065e05725b1Sbonwick static int
2066fa9e4066Sahrens zio_dva_allocate(zio_t *zio)
2067fa9e4066Sahrens {
20688654d025Sperrin 	spa_t *spa = zio->io_spa;
2069*b24ab676SJeff Bonwick 	metaslab_class_t *mc = spa_normal_class(spa);
2070fa9e4066Sahrens 	blkptr_t *bp = zio->io_bp;
2071fa9e4066Sahrens 	int error;
2072fa9e4066Sahrens 
2073f5383399SBill Moore 	if (zio->io_gang_leader == NULL) {
2074f5383399SBill Moore 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2075f5383399SBill Moore 		zio->io_gang_leader = zio;
2076f5383399SBill Moore 	}
2077f5383399SBill Moore 
2078fa9e4066Sahrens 	ASSERT(BP_IS_HOLE(bp));
207944cd46caSbillm 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
2080*b24ab676SJeff Bonwick 	ASSERT3U(zio->io_prop.zp_copies, >, 0);
2081*b24ab676SJeff Bonwick 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2082fa9e4066Sahrens 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2083fa9e4066Sahrens 
2084e14bb325SJeff Bonwick 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
2085*b24ab676SJeff Bonwick 	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
2086fa9e4066Sahrens 
2087e14bb325SJeff Bonwick 	if (error) {
2088e14bb325SJeff Bonwick 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2089e14bb325SJeff Bonwick 			return (zio_write_gang_block(zio));
2090fa9e4066Sahrens 		zio->io_error = error;
2091fa9e4066Sahrens 	}
2092e05725b1Sbonwick 
2093e05725b1Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
2094fa9e4066Sahrens }
2095fa9e4066Sahrens 
2096e05725b1Sbonwick static int
2097fa9e4066Sahrens zio_dva_free(zio_t *zio)
2098fa9e4066Sahrens {
2099e14bb325SJeff Bonwick 	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2100fa9e4066Sahrens 
2101e05725b1Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
2102fa9e4066Sahrens }
2103fa9e4066Sahrens 
2104e05725b1Sbonwick static int
2105fa9e4066Sahrens zio_dva_claim(zio_t *zio)
2106fa9e4066Sahrens {
2107e14bb325SJeff Bonwick 	int error;
2108e14bb325SJeff Bonwick 
2109e14bb325SJeff Bonwick 	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2110e14bb325SJeff Bonwick 	if (error)
2111e14bb325SJeff Bonwick 		zio->io_error = error;
2112fa9e4066Sahrens 
2113e05725b1Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
2114fa9e4066Sahrens }
2115fa9e4066Sahrens 
2116e14bb325SJeff Bonwick /*
2117e14bb325SJeff Bonwick  * Undo an allocation.  This is used by zio_done() when an I/O fails
2118e14bb325SJeff Bonwick  * and we want to give back the block we just allocated.
2119e14bb325SJeff Bonwick  * This handles both normal blocks and gang blocks.
2120e14bb325SJeff Bonwick  */
2121e14bb325SJeff Bonwick static void
2122e14bb325SJeff Bonwick zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2123e14bb325SJeff Bonwick {
2124e14bb325SJeff Bonwick 	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2125*b24ab676SJeff Bonwick 	ASSERT(zio->io_bp_override == NULL);
2126e14bb325SJeff Bonwick 
2127e14bb325SJeff Bonwick 	if (!BP_IS_HOLE(bp))
2128*b24ab676SJeff Bonwick 		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2129e14bb325SJeff Bonwick 
2130e14bb325SJeff Bonwick 	if (gn != NULL) {
2131e14bb325SJeff Bonwick 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2132e14bb325SJeff Bonwick 			zio_dva_unallocate(zio, gn->gn_child[g],
2133e14bb325SJeff Bonwick 			    &gn->gn_gbh->zg_blkptr[g]);
2134e14bb325SJeff Bonwick 		}
2135e14bb325SJeff Bonwick 	}
2136e14bb325SJeff Bonwick }
2137e14bb325SJeff Bonwick 
/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 *
 * Allocation is attempted first from the separate log class (when
 * use_slog is set) and then, on failure, from the normal class.  On
 * success the new BP's size/checksum/type fields are initialized for a
 * ZIL block.
 */
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t size, boolean_t use_slog)
{
	int error = 1;	/* nonzero so we fall through when slog is skipped */

	ASSERT(txg > spa_syncing_txg(spa));

	if (use_slog)
		error = metaslab_alloc(spa, spa_log_class(spa), size,
		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);

	/* Fall back to the normal class if the slog allocation failed. */
	if (error)
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);

	if (error == 0) {
		/* ZIL blocks are uncompressed, non-dedup'd, level-0 blocks. */
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_DEDUP(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	}

	return (error);
}
2170e14bb325SJeff Bonwick 
2171e14bb325SJeff Bonwick /*
2172*b24ab676SJeff Bonwick  * Free an intent log block.
2173e14bb325SJeff Bonwick  */
2174e14bb325SJeff Bonwick void
2175*b24ab676SJeff Bonwick zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2176e14bb325SJeff Bonwick {
2177*b24ab676SJeff Bonwick 	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2178e14bb325SJeff Bonwick 	ASSERT(!BP_IS_GANG(bp));
2179e14bb325SJeff Bonwick 
2180*b24ab676SJeff Bonwick 	zio_free(spa, txg, bp);
2181e14bb325SJeff Bonwick }
2182e14bb325SJeff Bonwick 
2183fa9e4066Sahrens /*
2184fa9e4066Sahrens  * ==========================================================================
2185fa9e4066Sahrens  * Read and write to physical devices
2186fa9e4066Sahrens  * ==========================================================================
2187fa9e4066Sahrens  */
/*
 * First vdev pipeline stage: route the i/o toward the device.  For a
 * BP-level i/o (vd == NULL) hand off to the mirror ops, which handle
 * multiple DVAs; for a concrete vdev, pad sub-sector i/os up to the
 * device's alignment, drop unnecessary repair writes, and push leaf
 * reads/writes through the vdev cache and queue before issuing.
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		/* Hold SCL_ZIO for the duration of the device-level i/o. */
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		return (vdev_mirror_ops.vdev_op_io_start(zio));
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	/* Pad the i/o out to the device's minimum alignment if needed. */
	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = zio_buf_alloc(asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));

	/*
	 * If this is a repair I/O, and there's no self-healing involved --
	 * that is, we're just resilvering what we expect to resilver --
	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
	 * This prevents spurious resilvering with nested replication.
	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
	 * A is out of date, we'll read from C+D, then use the data to
	 * resilver A+B -- but we don't actually want to resilver B, just A.
	 * The top-level mirror has no way to know this, so instead we just
	 * discard unnecessary repairs as we work our way down the vdev tree.
	 * The same logic applies to any form of nested replication:
	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
	 */
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
	    zio->io_txg != 0 &&	/* not a delegated i/o */
	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		zio_vdev_io_bypass(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {

		/* A vdev-cache hit satisfies the read immediately. */
		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
			return (ZIO_PIPELINE_CONTINUE);

		/* NULL means the queue will (re)issue the i/o later. */
		if ((zio = vdev_queue_io(zio)) == NULL)
			return (ZIO_PIPELINE_STOP);

		if (!vdev_accessible(vd, zio)) {
			zio->io_error = ENXIO;
			zio_interrupt(zio);
			return (ZIO_PIPELINE_STOP);
		}
	}

	return (vd->vdev_ops->vdev_op_io_start(zio));
}
2265fa9e4066Sahrens 
/*
 * Second vdev pipeline stage: after the device i/o completes, release
 * queue slots, update the vdev cache for writes, apply any fault
 * injection, classify errors, and run the vdev's own io_done hook.
 * An unexpected error on an otherwise-accessible leaf triggers a
 * device probe.
 */
static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	/* Wait until all child vdev i/os have completed. */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		/* Optional fault injection, only on otherwise-clean i/os. */
		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (!vdev_accessible(vd, zio)) {
				zio->io_error = ENXIO;
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
2308fa9e4066Sahrens 
/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 * zcr->zcr_cbdata holds the copy saved by zio_vsd_default_cksum_report().
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}
232022fe2c88SJonathan Adams 
232122fe2c88SJonathan Adams /*ARGSUSED*/
232222fe2c88SJonathan Adams void
232322fe2c88SJonathan Adams zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
232422fe2c88SJonathan Adams {
232522fe2c88SJonathan Adams 	void *buf = zio_buf_alloc(zio->io_size);
232622fe2c88SJonathan Adams 
232722fe2c88SJonathan Adams 	bcopy(zio->io_data, buf, zio->io_size);
232822fe2c88SJonathan Adams 
232922fe2c88SJonathan Adams 	zcr->zcr_cbinfo = zio->io_size;
233022fe2c88SJonathan Adams 	zcr->zcr_cbdata = buf;
233122fe2c88SJonathan Adams 	zcr->zcr_finish = zio_vsd_default_cksum_finish;
233222fe2c88SJonathan Adams 	zcr->zcr_free = zio_buf_free;
233322fe2c88SJonathan Adams }
233422fe2c88SJonathan Adams 
/*
 * Final vdev pipeline stage: release resources taken in io_start,
 * apply fault injection, and decide what to do about any error --
 * retry the i/o from VDEV_IO_START, convert it to ENXIO for an
 * inaccessible leaf, or mark an interior vdev unwritable.
 */
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	/* Drop the SCL_ZIO hold taken in zio_vdev_io_start(). */
	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	/* Free any vdev-specific data attached to this zio. */
	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		/* Rewind to just before VDEV_IO_START so it re-executes. */
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = ENXIO;

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf)
		vd->vdev_cant_write = B_TRUE;

	/* On any remaining error, skip straight to the interlock stages. */
	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
2390fa9e4066Sahrens 
/*
 * Rewind the pipeline so the VDEV_IO_START stage executes again.
 * Pipeline stages appear to be one-hot bit values (see the retry logic
 * in zio_vdev_io_assess()), so shifting right by one backs up exactly
 * one stage.
 */
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}
2399fa9e4066Sahrens 
/*
 * Rewind the pipeline so the VDEV_IO_DONE stage executes again
 * (same stage-bit shift trick as zio_vdev_io_reissue()).
 */
void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}
2407fa9e4066Sahrens 
/*
 * Skip the actual device i/o: mark the zio as bypassed and jump the
 * pipeline to just before VDEV_IO_ASSESS, so the start/done stages
 * are not executed.
 */
void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
2417fa9e4066Sahrens 
2418fa9e4066Sahrens /*
2419fa9e4066Sahrens  * ==========================================================================
2420fa9e4066Sahrens  * Generate and verify checksums
2421fa9e4066Sahrens  * ==========================================================================
2422fa9e4066Sahrens  */
/*
 * Pipeline stage that computes and embeds the checksum for this zio's
 * data.  The checksum algorithm comes from the block pointer when one
 * exists, from io_prop for physical (label) writes, and is forced to
 * the gang-header checksum for gang-child writes of gang BPs.
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}
2453fa9e4066Sahrens 
/*
 * Pipeline stage that verifies the checksum of data just read.  On a
 * mismatch, record the error in the zio and begin a checksum ereport --
 * unless the read is speculative, in which case the failure is silent.
 */
static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			/* Start the ereport; it is finished elsewhere. */
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
2485fa9e4066Sahrens 
/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 * Clears the CHECKSUM_VERIFY stage bit so the pipeline skips it.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
2494fa9e4066Sahrens 
2495fa9e4066Sahrens /*
2496e14bb325SJeff Bonwick  * ==========================================================================
2497e14bb325SJeff Bonwick  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2498e14bb325SJeff Bonwick  * An error of 0 indictes success.  ENXIO indicates whole-device failure,
2499e14bb325SJeff Bonwick  * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
2500e14bb325SJeff Bonwick  * indicate errors that are specific to one I/O, and most likely permanent.
2501e14bb325SJeff Bonwick  * Any other error is presumed to be worse because we weren't expecting it.
2502e14bb325SJeff Bonwick  * ==========================================================================
2503fa9e4066Sahrens  */
2504e14bb325SJeff Bonwick int
2505e14bb325SJeff Bonwick zio_worst_error(int e1, int e2)
2506fa9e4066Sahrens {
2507e14bb325SJeff Bonwick 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2508e14bb325SJeff Bonwick 	int r1, r2;
2509e14bb325SJeff Bonwick 
2510e14bb325SJeff Bonwick 	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2511e14bb325SJeff Bonwick 		if (e1 == zio_error_rank[r1])
2512e14bb325SJeff Bonwick 			break;
2513e14bb325SJeff Bonwick 
2514e14bb325SJeff Bonwick 	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2515e14bb325SJeff Bonwick 		if (e2 == zio_error_rank[r2])
2516e14bb325SJeff Bonwick 			break;
251744cd46caSbillm 
2518e14bb325SJeff Bonwick 	return (r1 > r2 ? e1 : e2);
2519fa9e4066Sahrens }
2520fa9e4066Sahrens 
2521fa9e4066Sahrens /*
2522fa9e4066Sahrens  * ==========================================================================
2523e14bb325SJeff Bonwick  * I/O completion
2524fa9e4066Sahrens  * ==========================================================================
2525fa9e4066Sahrens  */
/*
 * Pipeline stage: the zio is "ready".  Wait until all gang and DDT
 * children are ready, invoke the io_ready callback (if any), and then
 * notify every parent that was already waiting on us.
 */
2526e14bb325SJeff Bonwick static int
2527e14bb325SJeff Bonwick zio_ready(zio_t *zio)
2528fa9e4066Sahrens {
2529e14bb325SJeff Bonwick 	blkptr_t *bp = zio->io_bp;
2530a3f829aeSBill Moore 	zio_t *pio, *pio_next;
2531fa9e4066Sahrens 
2532*b24ab676SJeff Bonwick 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2533*b24ab676SJeff Bonwick 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2534f5383399SBill Moore 		return (ZIO_PIPELINE_STOP);
2535fa9e4066Sahrens 
2536f5383399SBill Moore 	if (zio->io_ready) {
2537e14bb325SJeff Bonwick 		ASSERT(IO_IS_ALLOCATING(zio));
2538e14bb325SJeff Bonwick 		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2539e14bb325SJeff Bonwick 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2540fa9e4066Sahrens 
2541e14bb325SJeff Bonwick 		zio->io_ready(zio);
2542e14bb325SJeff Bonwick 	}
2543fa9e4066Sahrens 
	/* Snapshot the bp; zio_done() verifies it against this copy. */
2544e14bb325SJeff Bonwick 	if (bp != NULL && bp != &zio->io_bp_copy)
2545e14bb325SJeff Bonwick 		zio->io_bp_copy = *bp;
2546fa9e4066Sahrens 
	/*
	 * NOTE(review): on error we fall back to the interlock-only
	 * pipeline -- presumably just the bookkeeping stages run from
	 * here on; confirm against the ZIO_INTERLOCK_PIPELINE definition.
	 */
2547e14bb325SJeff Bonwick 	if (zio->io_error)
2548e14bb325SJeff Bonwick 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2549fa9e4066Sahrens 
	/* Mark ourselves ready and grab the parent list under io_lock. */
2550a3f829aeSBill Moore 	mutex_enter(&zio->io_lock);
2551a3f829aeSBill Moore 	zio->io_state[ZIO_WAIT_READY] = 1;
2552a3f829aeSBill Moore 	pio = zio_walk_parents(zio);
2553a3f829aeSBill Moore 	mutex_exit(&zio->io_lock);
2554a3f829aeSBill Moore 
2555a3f829aeSBill Moore 	/*
2556a3f829aeSBill Moore 	 * As we notify zio's parents, new parents could be added.
2557a3f829aeSBill Moore 	 * New parents go to the head of zio's io_parent_list, however,
2558a3f829aeSBill Moore 	 * so we will (correctly) not notify them.  The remainder of zio's
2559a3f829aeSBill Moore 	 * io_parent_list, from 'pio_next' onward, cannot change because
2560a3f829aeSBill Moore 	 * all parents must wait for us to be done before they can be done.
2561a3f829aeSBill Moore 	 */
2562a3f829aeSBill Moore 	for (; pio != NULL; pio = pio_next) {
2563a3f829aeSBill Moore 		pio_next = zio_walk_parents(zio);
2564e14bb325SJeff Bonwick 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2565a3f829aeSBill Moore 	}
2566fa9e4066Sahrens 
	/*
	 * NODATA handling: for non-gang blocks we skip the vdev I/O
	 * stages entirely.  NOTE(review): gang blocks instead drop the
	 * NODATA flag -- presumably the gang header itself must still be
	 * written; confirm.
	 */
2567*b24ab676SJeff Bonwick 	if (zio->io_flags & ZIO_FLAG_NODATA) {
2568*b24ab676SJeff Bonwick 		if (BP_IS_GANG(bp)) {
2569*b24ab676SJeff Bonwick 			zio->io_flags &= ~ZIO_FLAG_NODATA;
2570*b24ab676SJeff Bonwick 		} else {
2571*b24ab676SJeff Bonwick 			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2572*b24ab676SJeff Bonwick 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2573*b24ab676SJeff Bonwick 		}
2574*b24ab676SJeff Bonwick 	}
2575*b24ab676SJeff Bonwick 
2576e14bb325SJeff Bonwick 	return (ZIO_PIPELINE_CONTINUE);
2577fa9e4066Sahrens }
2578fa9e4066Sahrens 
/*
 * Final pipeline stage.  Waits for all child I/Os to complete,
 * propagates child errors to this zio, emits FMA/checksum reports,
 * decides whether the I/O must be reexecuted or suspended, and finally
 * notifies parents and either wakes a synchronous waiter or destroys
 * the zio.
 */
2579e14bb325SJeff Bonwick static int
2580e14bb325SJeff Bonwick zio_done(zio_t *zio)
2581d63d470bSgw {
2582e14bb325SJeff Bonwick 	spa_t *spa = zio->io_spa;
2583e14bb325SJeff Bonwick 	zio_t *lio = zio->io_logical;
2584e14bb325SJeff Bonwick 	blkptr_t *bp = zio->io_bp;
2585e14bb325SJeff Bonwick 	vdev_t *vd = zio->io_vd;
2586e14bb325SJeff Bonwick 	uint64_t psize = zio->io_size;
2587a3f829aeSBill Moore 	zio_t *pio, *pio_next;
2588d63d470bSgw 
2589e14bb325SJeff Bonwick 	/*
2590f5383399SBill Moore 	 * If our children haven't all completed,
2591e14bb325SJeff Bonwick 	 * wait for them and then repeat this pipeline stage.
2592e14bb325SJeff Bonwick 	 */
2593e14bb325SJeff Bonwick 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2594e14bb325SJeff Bonwick 	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2595*b24ab676SJeff Bonwick 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
2596e14bb325SJeff Bonwick 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2597e14bb325SJeff Bonwick 		return (ZIO_PIPELINE_STOP);
2598d63d470bSgw 
	/* All children of every type must now be fully accounted for. */
2599e14bb325SJeff Bonwick 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
2600e14bb325SJeff Bonwick 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
2601e14bb325SJeff Bonwick 			ASSERT(zio->io_children[c][w] == 0);
2602e14bb325SJeff Bonwick 
2603e14bb325SJeff Bonwick 	if (bp != NULL) {
2604e14bb325SJeff Bonwick 		ASSERT(bp->blk_pad[0] == 0);
2605e14bb325SJeff Bonwick 		ASSERT(bp->blk_pad[1] == 0);
2606e14bb325SJeff Bonwick 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
2607a3f829aeSBill Moore 		    (bp == zio_unique_parent(zio)->io_bp));
2608e14bb325SJeff Bonwick 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
2609*b24ab676SJeff Bonwick 		    zio->io_bp_override == NULL &&
2610e14bb325SJeff Bonwick 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2611e14bb325SJeff Bonwick 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
2612*b24ab676SJeff Bonwick 			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
2613e14bb325SJeff Bonwick 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
2614e14bb325SJeff Bonwick 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
2615e14bb325SJeff Bonwick 		}
2616e14bb325SJeff Bonwick 	}
2617fa9e4066Sahrens 
2618e14bb325SJeff Bonwick 	/*
2619*b24ab676SJeff Bonwick 	 * If there were child vdev/gang/ddt errors, they apply to us now.
2620e14bb325SJeff Bonwick 	 */
2621e14bb325SJeff Bonwick 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
2622e14bb325SJeff Bonwick 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
2623*b24ab676SJeff Bonwick 	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
2624*b24ab676SJeff Bonwick 
2625*b24ab676SJeff Bonwick 	/*
2626*b24ab676SJeff Bonwick 	 * If the I/O on the transformed data was successful, generate any
2627*b24ab676SJeff Bonwick 	 * checksum reports now while we still have the transformed data.
2628*b24ab676SJeff Bonwick 	 */
2629*b24ab676SJeff Bonwick 	if (zio->io_error == 0) {
2630*b24ab676SJeff Bonwick 		while (zio->io_cksum_report != NULL) {
2631*b24ab676SJeff Bonwick 			zio_cksum_report_t *zcr = zio->io_cksum_report;
2632*b24ab676SJeff Bonwick 			uint64_t align = zcr->zcr_align;
2633*b24ab676SJeff Bonwick 			uint64_t asize = P2ROUNDUP(psize, align);
2634*b24ab676SJeff Bonwick 			char *abuf = zio->io_data;
2635*b24ab676SJeff Bonwick 
			/*
			 * Pad the data out to the report's alignment with
			 * a zero-filled temporary copy if necessary.
			 */
2636*b24ab676SJeff Bonwick 			if (asize != psize) {
2637*b24ab676SJeff Bonwick 				abuf = zio_buf_alloc(asize);
2638*b24ab676SJeff Bonwick 				bcopy(zio->io_data, abuf, psize);
2639*b24ab676SJeff Bonwick 				bzero(abuf + psize, asize - psize);
2640*b24ab676SJeff Bonwick 			}
2641*b24ab676SJeff Bonwick 
2642*b24ab676SJeff Bonwick 			zio->io_cksum_report = zcr->zcr_next;
2643*b24ab676SJeff Bonwick 			zcr->zcr_next = NULL;
2644*b24ab676SJeff Bonwick 			zcr->zcr_finish(zcr, abuf);
2645*b24ab676SJeff Bonwick 			zfs_ereport_free_checksum(zcr);
2646*b24ab676SJeff Bonwick 
2647*b24ab676SJeff Bonwick 			if (asize != psize)
2648*b24ab676SJeff Bonwick 				zio_buf_free(abuf, asize);
2649*b24ab676SJeff Bonwick 		}
2650*b24ab676SJeff Bonwick 	}
2651e14bb325SJeff Bonwick 
2652e14bb325SJeff Bonwick 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
2653e14bb325SJeff Bonwick 
2654e14bb325SJeff Bonwick 	vdev_stat_update(zio, psize);
2655e14bb325SJeff Bonwick 
2656e14bb325SJeff Bonwick 	if (zio->io_error) {
2657e14bb325SJeff Bonwick 		/*
2658e14bb325SJeff Bonwick 		 * If this I/O is attached to a particular vdev,
2659e14bb325SJeff Bonwick 		 * generate an error message describing the I/O failure
2660e14bb325SJeff Bonwick 		 * at the block level.  We ignore these errors if the
2661e14bb325SJeff Bonwick 		 * device is currently unavailable.
2662e14bb325SJeff Bonwick 		 */
2663e14bb325SJeff Bonwick 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
2664e14bb325SJeff Bonwick 			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
2665e14bb325SJeff Bonwick 
26668f18d1faSGeorge Wilson 		if ((zio->io_error == EIO || !(zio->io_flags &
26678f18d1faSGeorge Wilson 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
26688f18d1faSGeorge Wilson 		    zio == lio) {
2669e14bb325SJeff Bonwick 			/*
2670e14bb325SJeff Bonwick 			 * For logical I/O requests, tell the SPA to log the
2671e14bb325SJeff Bonwick 			 * error and generate a logical data ereport.
2672e14bb325SJeff Bonwick 			 */
2673e14bb325SJeff Bonwick 			spa_log_error(spa, zio);
2674e14bb325SJeff Bonwick 			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
2675e14bb325SJeff Bonwick 			    0, 0);
2676e14bb325SJeff Bonwick 		}
2677e14bb325SJeff Bonwick 	}
2678fa9e4066Sahrens 
2679e14bb325SJeff Bonwick 	if (zio->io_error && zio == lio) {
2680e14bb325SJeff Bonwick 		/*
2681e14bb325SJeff Bonwick 		 * Determine whether zio should be reexecuted.  This will
2682e14bb325SJeff Bonwick 		 * propagate all the way to the root via zio_notify_parent().
2683e14bb325SJeff Bonwick 		 */
2684e14bb325SJeff Bonwick 		ASSERT(vd == NULL && bp != NULL);
2685*b24ab676SJeff Bonwick 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2686e14bb325SJeff Bonwick 
2687*b24ab676SJeff Bonwick 		if (IO_IS_ALLOCATING(zio) &&
2688*b24ab676SJeff Bonwick 		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
2689e14bb325SJeff Bonwick 			if (zio->io_error != ENOSPC)
2690e14bb325SJeff Bonwick 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
2691e14bb325SJeff Bonwick 			else
2692e14bb325SJeff Bonwick 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2693*b24ab676SJeff Bonwick 		}
2694e14bb325SJeff Bonwick 
2695e14bb325SJeff Bonwick 		if ((zio->io_type == ZIO_TYPE_READ ||
2696e14bb325SJeff Bonwick 		    zio->io_type == ZIO_TYPE_FREE) &&
2697e14bb325SJeff Bonwick 		    zio->io_error == ENXIO &&
26988ad4d6ddSJeff Bonwick 		    spa->spa_load_state == SPA_LOAD_NONE &&
2699e14bb325SJeff Bonwick 		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
2700e14bb325SJeff Bonwick 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2701e14bb325SJeff Bonwick 
2702e14bb325SJeff Bonwick 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
2703e14bb325SJeff Bonwick 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
270422fe2c88SJonathan Adams 
270522fe2c88SJonathan Adams 		/*
270622fe2c88SJonathan Adams 		 * Here is a possibly good place to attempt to do
270722fe2c88SJonathan Adams 		 * either combinatorial reconstruction or error correction
270822fe2c88SJonathan Adams 		 * based on checksums.  It also might be a good place
270922fe2c88SJonathan Adams 		 * to send out preliminary ereports before we suspend
271022fe2c88SJonathan Adams 		 * processing.
271122fe2c88SJonathan Adams 		 */
2712d63d470bSgw 	}
2713d63d470bSgw 
271467bd71c6Sperrin 	/*
2715e14bb325SJeff Bonwick 	 * If there were logical child errors, they apply to us now.
2716e14bb325SJeff Bonwick 	 * We defer this until now to avoid conflating logical child
2717e14bb325SJeff Bonwick 	 * errors with errors that happened to the zio itself when
2718e14bb325SJeff Bonwick 	 * updating vdev stats and reporting FMA events above.
271967bd71c6Sperrin 	 */
2720e14bb325SJeff Bonwick 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
27218654d025Sperrin 
	/* A failed allocating gang leader must give back its DVAs. */
2722*b24ab676SJeff Bonwick 	if ((zio->io_error || zio->io_reexecute) &&
2723*b24ab676SJeff Bonwick 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
2724*b24ab676SJeff Bonwick 	    !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
2725f5383399SBill Moore 		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
2726f5383399SBill Moore 
2727f5383399SBill Moore 	zio_gang_tree_free(&zio->io_gang_tree);
2728f5383399SBill Moore 
272933a372edSGeorge Wilson 	/*
273033a372edSGeorge Wilson 	 * Godfather I/Os should never suspend.
273133a372edSGeorge Wilson 	 */
273233a372edSGeorge Wilson 	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
273333a372edSGeorge Wilson 	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
273433a372edSGeorge Wilson 		zio->io_reexecute = 0;
273533a372edSGeorge Wilson 
273633a372edSGeorge Wilson 	if (zio->io_reexecute) {
2737e14bb325SJeff Bonwick 		/*
2738e14bb325SJeff Bonwick 		 * This is a logical I/O that wants to reexecute.
2739e14bb325SJeff Bonwick 		 *
2740e14bb325SJeff Bonwick 		 * Reexecute is top-down.  When an i/o fails, if it's not
2741e14bb325SJeff Bonwick 		 * the root, it simply notifies its parent and sticks around.
2742e14bb325SJeff Bonwick 		 * The parent, seeing that it still has children in zio_done(),
2743e14bb325SJeff Bonwick 		 * does the same.  This percolates all the way up to the root.
2744e14bb325SJeff Bonwick 		 * The root i/o will reexecute or suspend the entire tree.
2745e14bb325SJeff Bonwick 		 *
2746e14bb325SJeff Bonwick 		 * This approach ensures that zio_reexecute() honors
2747e14bb325SJeff Bonwick 		 * all the original i/o dependency relationships, e.g.
2748e14bb325SJeff Bonwick 		 * parents not executing until children are ready.
2749e14bb325SJeff Bonwick 		 */
2750e14bb325SJeff Bonwick 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2751fa9e4066Sahrens 
2752f5383399SBill Moore 		zio->io_gang_leader = NULL;
2753e14bb325SJeff Bonwick 
2754a3f829aeSBill Moore 		mutex_enter(&zio->io_lock);
2755a3f829aeSBill Moore 		zio->io_state[ZIO_WAIT_DONE] = 1;
2756a3f829aeSBill Moore 		mutex_exit(&zio->io_lock);
2757a3f829aeSBill Moore 
275854d692b7SGeorge Wilson 		/*
275954d692b7SGeorge Wilson 		 * "The Godfather" I/O monitors its children but is
276054d692b7SGeorge Wilson 		 * not a true parent to them. It will track them through
276154d692b7SGeorge Wilson 		 * the pipeline but severs its ties whenever they get into
276254d692b7SGeorge Wilson 		 * trouble (e.g. suspended). This allows "The Godfather"
276354d692b7SGeorge Wilson 		 * I/O to return status without blocking.
276454d692b7SGeorge Wilson 		 */
276554d692b7SGeorge Wilson 		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
276654d692b7SGeorge Wilson 			zio_link_t *zl = zio->io_walk_link;
276754d692b7SGeorge Wilson 			pio_next = zio_walk_parents(zio);
276854d692b7SGeorge Wilson 
276954d692b7SGeorge Wilson 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
277054d692b7SGeorge Wilson 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
277154d692b7SGeorge Wilson 				zio_remove_child(pio, zio, zl);
277254d692b7SGeorge Wilson 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
277354d692b7SGeorge Wilson 			}
277454d692b7SGeorge Wilson 		}
277554d692b7SGeorge Wilson 
2776a3f829aeSBill Moore 		if ((pio = zio_unique_parent(zio)) != NULL) {
2777e14bb325SJeff Bonwick 			/*
2778e14bb325SJeff Bonwick 			 * We're not a root i/o, so there's nothing to do
2779e14bb325SJeff Bonwick 			 * but notify our parent.  Don't propagate errors
2780e14bb325SJeff Bonwick 			 * upward since we haven't permanently failed yet.
2781e14bb325SJeff Bonwick 			 */
278233a372edSGeorge Wilson 			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
2783e14bb325SJeff Bonwick 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
2784e14bb325SJeff Bonwick 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2785e14bb325SJeff Bonwick 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
2786e14bb325SJeff Bonwick 			/*
2787e14bb325SJeff Bonwick 			 * We'd fail again if we reexecuted now, so suspend
2788e14bb325SJeff Bonwick 			 * until conditions improve (e.g. device comes online).
2789e14bb325SJeff Bonwick 			 */
2790e14bb325SJeff Bonwick 			zio_suspend(spa, zio);
2791e14bb325SJeff Bonwick 		} else {
2792e14bb325SJeff Bonwick 			/*
2793e14bb325SJeff Bonwick 			 * Reexecution is potentially a huge amount of work.
2794e14bb325SJeff Bonwick 			 * Hand it off to the otherwise-unused claim taskq.
2795e14bb325SJeff Bonwick 			 */
2796e14bb325SJeff Bonwick 			(void) taskq_dispatch(
2797e14bb325SJeff Bonwick 			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
2798e14bb325SJeff Bonwick 			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
2799e14bb325SJeff Bonwick 		}
2800e14bb325SJeff Bonwick 		return (ZIO_PIPELINE_STOP);
2801fa9e4066Sahrens 	}
2802fa9e4066Sahrens 
2803*b24ab676SJeff Bonwick 	ASSERT(zio->io_child_count == 0);
280433a372edSGeorge Wilson 	ASSERT(zio->io_reexecute == 0);
2805e14bb325SJeff Bonwick 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
2806fa9e4066Sahrens 
2807*b24ab676SJeff Bonwick 	/*
2808*b24ab676SJeff Bonwick 	 * Report any checksum errors, since the I/O is complete.
2809*b24ab676SJeff Bonwick 	 */
281022fe2c88SJonathan Adams 	while (zio->io_cksum_report != NULL) {
2811*b24ab676SJeff Bonwick 		zio_cksum_report_t *zcr = zio->io_cksum_report;
2812*b24ab676SJeff Bonwick 		zio->io_cksum_report = zcr->zcr_next;
2813*b24ab676SJeff Bonwick 		zcr->zcr_next = NULL;
2814*b24ab676SJeff Bonwick 		zcr->zcr_finish(zcr, NULL);
2815*b24ab676SJeff Bonwick 		zfs_ereport_free_checksum(zcr);
281622fe2c88SJonathan Adams 	}
281722fe2c88SJonathan Adams 
2818a3f829aeSBill Moore 	/*
2819a3f829aeSBill Moore 	 * It is the responsibility of the done callback to ensure that this
2820a3f829aeSBill Moore 	 * particular zio is no longer discoverable for adoption, and as
2821a3f829aeSBill Moore 	 * such, cannot acquire any new parents.
2822a3f829aeSBill Moore 	 */
2823e14bb325SJeff Bonwick 	if (zio->io_done)
2824e14bb325SJeff Bonwick 		zio->io_done(zio);
2825fa9e4066Sahrens 
2826a3f829aeSBill Moore 	mutex_enter(&zio->io_lock);
2827a3f829aeSBill Moore 	zio->io_state[ZIO_WAIT_DONE] = 1;
2828a3f829aeSBill Moore 	mutex_exit(&zio->io_lock);
2829fa9e4066Sahrens 
	/* Detach from and notify every remaining parent. */
2830a3f829aeSBill Moore 	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
2831a3f829aeSBill Moore 		zio_link_t *zl = zio->io_walk_link;
2832a3f829aeSBill Moore 		pio_next = zio_walk_parents(zio);
2833a3f829aeSBill Moore 		zio_remove_child(pio, zio, zl);
2834e14bb325SJeff Bonwick 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2835e14bb325SJeff Bonwick 	}
2836fa9e4066Sahrens 
	/* Wake any synchronous waiter; otherwise free the zio ourselves. */
2837e14bb325SJeff Bonwick 	if (zio->io_waiter != NULL) {
2838e14bb325SJeff Bonwick 		mutex_enter(&zio->io_lock);
2839e14bb325SJeff Bonwick 		zio->io_executor = NULL;
2840e14bb325SJeff Bonwick 		cv_broadcast(&zio->io_cv);
2841e14bb325SJeff Bonwick 		mutex_exit(&zio->io_lock);
2842e14bb325SJeff Bonwick 	} else {
2843e14bb325SJeff Bonwick 		zio_destroy(zio);
2844e14bb325SJeff Bonwick 	}
2845fa9e4066Sahrens 
2846e14bb325SJeff Bonwick 	return (ZIO_PIPELINE_STOP);
2847fa9e4066Sahrens }
284846341222Sperrin 
284946341222Sperrin /*
2850e14bb325SJeff Bonwick  * ==========================================================================
2851e14bb325SJeff Bonwick  * I/O pipeline definition
2852e14bb325SJeff Bonwick  * ==========================================================================
285346341222Sperrin  */
/*
 * The canonical I/O pipeline, one function per stage.
 * NOTE(review): the ordering presumably must mirror the ZIO_STAGE_*
 * enumeration, with the leading NULL standing in for the open stage --
 * confirm against zio_impl.h before reordering anything here.
 */
2854*b24ab676SJeff Bonwick static zio_pipe_stage_t *zio_pipeline[] = {
2855e14bb325SJeff Bonwick 	NULL,
2856e14bb325SJeff Bonwick 	zio_read_bp_init,
2857*b24ab676SJeff Bonwick 	zio_free_bp_init,
2858*b24ab676SJeff Bonwick 	zio_issue_async,
2859e14bb325SJeff Bonwick 	zio_write_bp_init,
2860e14bb325SJeff Bonwick 	zio_checksum_generate,
2861*b24ab676SJeff Bonwick 	zio_ddt_read_start,
2862*b24ab676SJeff Bonwick 	zio_ddt_read_done,
2863*b24ab676SJeff Bonwick 	zio_ddt_write,
2864*b24ab676SJeff Bonwick 	zio_ddt_free,
2865e14bb325SJeff Bonwick 	zio_gang_assemble,
2866e14bb325SJeff Bonwick 	zio_gang_issue,
2867e14bb325SJeff Bonwick 	zio_dva_allocate,
2868e14bb325SJeff Bonwick 	zio_dva_free,
2869e14bb325SJeff Bonwick 	zio_dva_claim,
2870e14bb325SJeff Bonwick 	zio_ready,
2871e14bb325SJeff Bonwick 	zio_vdev_io_start,
2872e14bb325SJeff Bonwick 	zio_vdev_io_done,
2873e14bb325SJeff Bonwick 	zio_vdev_io_assess,
2874e14bb325SJeff Bonwick 	zio_checksum_verify,
2875e14bb325SJeff Bonwick 	zio_done
2876e14bb325SJeff Bonwick };
2877