/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2017, Intel Corporation.
 * Copyright 2020 Joyent, Inc.
 */

#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
#include <sys/time.h>
#include <sys/dsl_scan.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
#include <sys/cityhash.h>
#include <sys/dsl_crypt.h>

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl", "z_trim"
};

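/*
 * When B_TRUE, writes pass through the ZIO_STAGE_DVA_THROTTLE pipeline
 * stage, which limits how many zios may be allocating DVAs concurrently
 * (see zio_dva_throttle()).
 */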
boolean_t zio_dva_throttle_enabled = B_TRUE;

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

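/*
 * Sentinel values returned by pipeline stage functions: CONTINUE tells
 * zio_execute() to advance to the next stage; STOP means the zio was
 * handed off (e.g. requeued or made to wait) and will resume later.
 */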
#define	ZIO_PIPELINE_CONTINUE		0x100
#define	ZIO_PIPELINE_STOP		0x101

/* Mark IOs as "slow" if they take longer than 30 seconds */
int zio_slow_io_ms = (30 * MILLISEC);

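/*
 * An indirect block with shift 'indblkshift' holds
 * 2^(indblkshift - SPA_BLKPTRSHIFT) block pointers, so BP_SPANB()
 * yields the number of level-0 blocks spanned by a single bp at
 * 'level' (e.g. with 128K indirect blocks, each level multiplies
 * the span by 1024).  COMPARE_META_LEVEL is an artificially high
 * level used when comparing meta-dnode bookmarks against ordinary
 * object bookmarks.
 */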
#define	BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define	COMPARE_META_LEVEL	0x80000000ul
/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

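/*
 * If a zio is requeued while still in one of its issue (_START) stages,
 * dispatch it to the front of its taskq (TQ_FRONT) so that I/Os already
 * in progress aren't starved by newly issued ones.
 */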
boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;

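/*
 * Buffers larger than this limit get KMC_NODEBUG caches in zio_init(),
 * since kmem debugging of large caches is expensive (see the cflags
 * computation below).
 */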
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
	 * for each quarter-power of 2.
	 */
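	/*
	 * For example, with SPA_MINBLOCKSIZE == 512 this creates caches
	 * at 512, 1024, 1536 and 2048 bytes, but between 8K and 16K only
	 * at 10K, 12K, 14K and 16K; the in-between sizes fall through to
	 * the next larger cache (see the fix-up loop below).
	 */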
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (!ISP2(p2))
			p2 &= p2 - 1;
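		/* p2 is now the largest power of two <= size */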

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = MIN(p2 >> 2, PAGESIZE);
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    cflags | KMC_NOTOUCH);
		}
	}

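	/*
	 * Fill the gaps: any size without a dedicated cache shares the
	 * cache of the next larger size, so every slot in both arrays is
	 * populated.
	 */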
	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we limit the amount
 * of ZFS data that shows up in a kernel crashdump (thus reducing the amount
 * of kernel heap dumped to disk when the kernel panics).
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/* ARGSUSED */
static void
zio_abd_free(void *abd, size_t size)
{
	abd_free((abd_t *)abd);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
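/*
 * Illustrative example: when reading a compressed block, the logical
 * buffer is lsize bytes but the device I/O is psize bytes, so the read
 * path pushes a transform roughly like
 *
 *	zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 *	    psize, psize, zio_decompress);
 *
 * The device then fills the temporary psize ABD, and zio_pop_transforms()
 * runs zio_decompress() into the original ABD and frees the temporary
 * one (zt_bufsize != 0 marks it as owned by the transform stack).
 */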
void
zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	/*
	 * Ensure that anyone expecting this zio to contain a linear ABD isn't
	 * going to get a nasty surprise when they try to access the data.
	 */
	IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));

	zt->zt_orig_abd = zio->io_abd;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_abd = data;
	zio->io_size = size;
}

void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_abd, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			abd_free(zio->io_abd);

		zio->io_abd = zt->zt_orig_abd;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks, decompression, and decryption
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		abd_copy(data, zio->io_abd, size);
}

static void
zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{
	if (zio->io_error == 0) {
		void *tmp = abd_borrow_buf(data, size);
		int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
		    zio->io_abd, tmp, zio->io_size, size);
		abd_return_buf_copy(data, tmp, size);

		if (ret != 0)
			zio->io_error = SET_ERROR(EIO);
	}
}

static void
zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
{
	int ret;
	void *tmp;
	blkptr_t *bp = zio->io_bp;
	spa_t *spa = zio->io_spa;
	uint64_t dsobj = zio->io_bookmark.zb_objset;
	uint64_t lsize = BP_GET_LSIZE(bp);
	dmu_object_type_t ot = BP_GET_TYPE(bp);
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];
	boolean_t no_crypt = B_FALSE;

	ASSERT(BP_USES_CRYPT(bp));
	ASSERT3U(size, !=, 0);

	if (zio->io_error != 0)
		return;

	/*
	 * Verify the cksum of MACs stored in an indirect bp. It will always
	 * be possible to verify this since it does not require an encryption
	 * key.
	 */
	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
		zio_crypt_decode_mac_bp(bp, mac);

		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
			/*
			 * We haven't decompressed the data yet, but
			 * zio_crypt_do_indirect_mac_checksum() requires
			 * decompressed data to be able to parse out the MACs
			 * from the indirect block. We decompress it now and
			 * throw away the result after we are finished.
			 */
			tmp = zio_buf_alloc(lsize);
			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
			    zio->io_abd, tmp, zio->io_size, lsize);
			if (ret != 0) {
				ret = SET_ERROR(EIO);
				goto error;
			}
			ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
			    tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
			zio_buf_free(tmp, lsize);
		} else {
			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
		}
		abd_copy(data, zio->io_abd, size);

		if (ret != 0)
			goto error;

		return;
	}

	/*
	 * If this is an authenticated block, just check the MAC. It would be
	 * nice to separate this out into its own flag, but for the moment
	 * enum zio_flag is out of bits.
	 */
	if (BP_IS_AUTHENTICATED(bp)) {
		if (ot == DMU_OT_OBJSET) {
			ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
			    dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
		} else {
			zio_crypt_decode_mac_bp(bp, mac);
			ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
			    zio->io_abd, size, mac);
		}
		abd_copy(data, zio->io_abd, size);

		if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
			ret = zio_handle_decrypt_injection(spa,
			    &zio->io_bookmark, ot, ECKSUM);
		}
		if (ret != 0)
			goto error;

		return;
	}

	zio_crypt_decode_params_bp(bp, salt, iv);

	if (ot == DMU_OT_INTENT_LOG) {
		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
		zio_crypt_decode_mac_zil(tmp, mac);
		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
	} else {
		zio_crypt_decode_mac_bp(bp, mac);
	}

	ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
	    BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
	    zio->io_abd, &no_crypt);
	if (no_crypt)
		abd_copy(data, zio->io_abd, size);

	if (ret != 0)
		goto error;

	return;

error:
	/* assert that the key was found unless this was speculative */
	ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));

	/*
	 * If there was a decryption / authentication error return EIO as
	 * the io_error. If this was not a speculative zio, create an ereport.
	 */
	if (ret == ECKSUM) {
		zio->io_error = SET_ERROR(EIO);
		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
			spa_log_error(spa, &zio->io_bookmark);
			(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
			    spa, NULL, &zio->io_bookmark, zio, 0, 0);
		}
	} else {
		zio->io_error = ret;
	}
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
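/*
 * Walker usage (illustrative): callers hold a zio_link_t cursor so that
 * no iteration state lives in the zio itself, e.g.
 *
 *	zio_link_t *zl = NULL;
 *	zio_t *pio;
 *	while ((pio = zio_walk_parents(cio, &zl)) != NULL)
 *		<visit pio>;
 */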
zio_t *
zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
	list_t *pl = &cio->io_parent_list;

	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
	if (*zl == NULL)
		return (NULL);

	ASSERT((*zl)->zl_child == cio);
	return ((*zl)->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio, zio_link_t **zl)
{
	list_t *cl = &pio->io_child_list;

	ASSERT(MUTEX_HELD(&pio->io_lock));

	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
	if (*zl == NULL)
		return (NULL);

	ASSERT((*zl)->zl_parent == pio);
	return ((*zl)->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_link_t *zl = NULL;
	zio_t *pio = zio_walk_parents(cio, &zl);

	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&pio->io_lock);
	mutex_enter(&cio->io_lock);

	ASSERT(pio->io_state[