1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5ea8dc4beschrock * Common Development and Distribution License (the "License").
6ea8dc4beschrock * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21fa9e406ahrens/*
223f9d6adLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23f78cdc3Paul Dagnelie * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
245aeb947Garrett D'Amore * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25c3d26abMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
26663207aDon Brady * Copyright (c) 2017, Intel Corporation.
27fa9e406ahrens */
28fa9e406ahrens
29de710d2Josef 'Jeff' Sipek#include <sys/sysmacros.h>
30fa9e406ahrens#include <sys/zfs_context.h>
31ea8dc4beschrock#include <sys/fm/fs/zfs.h>
32fa9e406ahrens#include <sys/spa.h>
33fa9e406ahrens#include <sys/txg.h>
34fa9e406ahrens#include <sys/spa_impl.h>
35fa9e406ahrens#include <sys/vdev_impl.h>
36084fd14Brian Behlendorf#include <sys/vdev_trim.h>
37fa9e406ahrens#include <sys/zio_impl.h>
38fa9e406ahrens#include <sys/zio_compress.h>
39fa9e406ahrens#include <sys/zio_checksum.h>
40b24ab67Jeff Bonwick#include <sys/dmu_objset.h>
41b24ab67Jeff Bonwick#include <sys/arc.h>
42b24ab67Jeff Bonwick#include <sys/ddt.h>
435d7b4d4Matthew Ahrens#include <sys/blkptr.h>
4443466aaMax Grossman#include <sys/zfeature.h>
45a3874b8Toomas Soome#include <sys/dsl_scan.h>
460f7643cGeorge Wilson#include <sys/metaslab_impl.h>
47770499eDan Kimmel#include <sys/abd.h>
48f78cdc3Paul Dagnelie#include <sys/cityhash.h>
49eb63303Tom Caputi#include <sys/dsl_crypt.h>
50fa9e406ahrens
51fa9e406ahrens/*
52fa9e406ahrens * ==========================================================================
53fa9e406ahrens * I/O type descriptions
54fa9e406ahrens * ==========================================================================
55fa9e406ahrens */
/*
 * Human-readable name for each I/O type, indexed by zio_type_t.
 * NOTE(review): "z_trim" is deliberately shorter than the other entries —
 * presumably to keep a name derived from it within a length limit; confirm
 * before renaming.
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl", "z_trim"
};
60fa9e406ahrens
610f7643cGeorge Wilsonboolean_t zio_dva_throttle_enabled = B_TRUE;
620f7643cGeorge Wilson
63fa9e406ahrens/*
64fa9e406ahrens * ==========================================================================
65fa9e406ahrens * I/O kmem caches
66fa9e406ahrens * ==========================================================================
67fa9e406ahrens */
/*
 * Kmem caches: one for zio_t itself, one for parent/child link records,
 * and one per buffer size class for metadata (zio_buf_cache) and data
 * (zio_data_buf_cache) buffers.  Size classes without their own cache
 * share the next larger cache (see zio_init()).
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
/* Arena that backs data-buffer allocations (kernel builds only). */
extern vmem_t *zio_alloc_arena;
#endif
76fa9e406ahrens
/*
 * NOTE(review): names suggest these are sentinel results returned by
 * pipeline-stage functions — confirm against the stage implementations.
 */
#define	ZIO_PIPELINE_CONTINUE		0x100
#define	ZIO_PIPELINE_STOP		0x101

/*
 * BP_SPANB(indblkshift, level) ==
 * 2^(level * (indblkshift - SPA_BLKPTRSHIFT)): the span covered by one
 * block pointer at the given level of indirection.
 */
#define	BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define	COMPARE_META_LEVEL	0x80000000ul
/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
9801f55e4George Wilson
/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

/*
 * NOTE(review): name suggests requeued zio_start() work is dispatched at
 * the head of its taskq rather than the tail — confirm at the dispatch site.
 */
boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;

/*
 * Buffers larger than this limit skip kmem debugging: zio_init() passes
 * KMC_NODEBUG when creating caches for sizes above it.  Zero (non-debug
 * builds) disables debugging for all zio buffer caches.
 */
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
1140f7643cGeorge Wilson
115fa9e406ahrensvoid
116fa9e406ahrenszio_init(void)
117fa9e406ahrens{
118fa9e406ahrens	size_t c;
119ad23a2djohansen	vmem_t *data_alloc_arena = NULL;
120ad23a2djohansen
121ad23a2djohansen#ifdef _KERNEL
122ad23a2djohansen	data_alloc_arena = zio_alloc_arena;
123ad23a2djohansen#endif
124a3f829aBill Moore	zio_cache = kmem_cache_create("zio_cache",
125a3f829aBill Moore	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
126a3f829aBill Moore	zio_link_cache = kmem_cache_create("zio_link_cache",
127a3f829aBill Moore	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
128ccae0b5eschrock
129fa9e406ahrens	/*
130fa9e406ahrens	 * For small buffers, we want a cache for each multiple of
131b515258Matthew Ahrens	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
132b515258Matthew Ahrens	 * for each quarter-power of 2.
133fa9e406ahrens	 */
134fa9e406ahrens	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
135fa9e406ahrens		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
136fa9e406ahrens		size_t p2 = size;
137fa9e406ahrens		size_t align = 0;
138e291592Jonathan Adams		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
139fa9e406ahrens
140de710d2Josef 'Jeff' Sipek		while (!ISP2(p2))
141fa9e406ahrens			p2 &= p2 - 1;
142fa9e406ahrens
143cd1c8b8Matthew Ahrens#ifndef _KERNEL
144cd1c8b8Matthew Ahrens		/*
145cd1c8b8Matthew Ahrens		 * If we are using watchpoints, put each buffer on its own page,
146cd1c8b8Matthew Ahrens		 * to eliminate the performance overhead of trapping to the
147cd1c8b8Matthew Ahrens		 * kernel when modifying a non-watched buffer that shares the
148cd1c8b8Matthew Ahrens		 * page with a watched buffer.
149cd1c8b8Matthew Ahrens		 */
150cd1c8b8Matthew Ahrens		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
151cd1c8b8Matthew Ahrens			continue;
152cd1c8b8Matthew Ahrens#endif
153fa9e406ahrens		if (size <= 4 * SPA_MINBLOCKSIZE) {
154fa9e406ahrens			align = SPA_MINBLOCKSIZE;
155cd1c8b8Matthew Ahrens		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
156b515258Matthew Ahrens			align = MIN(p2 >> 2, PAGESIZE);
157fa9e406ahrens		}
158fa9e406ahrens
159fa9e406ahrens		if (align != 0) {
160ad23a2djohansen			char name[36];
1615ad8204nd			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
162fa9e406ahrens			zio_buf_cache[c] = kmem_cache_create(name, size,
163e291592Jonathan Adams			    align, NULL, NULL, NULL, NULL, NULL, cflags);
164ad23a2djohansen
165e291592Jonathan Adams			/*
166e291592Jonathan Adams			 * Since zio_data bufs do not appear in crash dumps, we
167e291592Jonathan Adams			 * pass KMC_NOTOUCH so that no allocator metadata is
168e291592Jonathan Adams			 * stored with the buffers.
169e291592Jonathan Adams			 */
170ad23a2djohansen			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
171ad23a2djohansen			zio_data_buf_cache[c] = kmem_cache_create(name, size,
172ad23a2djohansen			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
173e291592Jonathan Adams			    cflags | KMC_NOTOUCH);
174fa9e406ahrens		}
175fa9e406ahrens	}
176fa9e406ahrens
177fa9e406ahrens	while (--c != 0) {
178fa9e406ahrens		ASSERT(zio_buf_cache[c] != NULL);
179fa9e406ahrens		if (zio_buf_cache[c - 1] == NULL)
180fa9e406ahrens			zio_buf_cache[c - 1] = zio_buf_cache[c];
181ad23a2djohansen
182ad23a2djohansen		ASSERT(zio_data_buf_cache[c] != NULL);
183ad23a2djohansen		if (zio_data_buf_cache[c - 1] == NULL)
184ad23a2djohansen			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
185fa9e406ahrens	}
186ea8dc4beschrock
187ea8dc4beschrock	zio_inject_init();
188fa9e406ahrens}
189fa9e406ahrens
190fa9e406ahrensvoid
191fa9e406ahrenszio_fini(void)
192fa9e406ahrens{
193fa9e406ahrens	size_t c;
194fa9e406ahrens	kmem_cache_t *last_cache = NULL;
195ad23a2djohansen	kmem_cache_t *last_data_cache = NULL;
196fa9e406ahrens
197fa9e406ahrens	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
198fa9e406ahrens		if (zio_buf_cache[c] != last_cache) {
199fa9e406ahrens			last_cache = zio_buf_cache[c];
200fa9e406ahrens			kmem_cache_destroy(zio_buf_cache[c]);
201fa9e406ahrens		}
202fa9e406ahrens		zio_buf_cache[c] = NULL;
203ad23a2djohansen
204ad23a2djohansen		if (zio_data_buf_cache[c] != last_data_cache) {
205ad23a2djohansen			last_data_cache = zio_data_buf_cache[c];
206ad23a2djohansen			kmem_cache_destroy(zio_data_buf_cache[c]);
207ad23a2djohansen		}
208ad23a2djohansen		zio_data_buf_cache[c] = NULL;
209fa9e406ahrens	}
210ea8dc4beschrock
211a3f829aBill Moore	kmem_cache_destroy(zio_link_cache);
212ccae0b5eschrock	kmem_cache_destroy(zio_cache);
213ccae0b5eschrock
214ea8dc4beschrock	zio_inject_fini();
215fa9e406ahrens}
216fa9e406ahrens
217fa9e406ahrens/*
218fa9e406ahrens * ==========================================================================
219fa9e406ahrens * Allocate and free I/O buffers
220fa9e406ahrens * ==========================================================================
221fa9e406ahrens */
222ad23a2djohansen
223ad23a2djohansen/*
224ad23a2djohansen * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
225ad23a2djohansen * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
226ad23a2djohansen * useful to inspect ZFS metadata, but if possible, we should avoid keeping
227ad23a2djohansen * excess / transient data in-core during a crashdump.
228ad23a2djohansen */
229fa9e406ahrensvoid *
230fa9e406ahrenszio_buf_alloc(size_t size)
231fa9e406ahrens{
232fa9e406ahrens	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
233fa9e406ahrens
234f63ab3dMatthew Ahrens	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
235fa9e406ahrens
2361ab7f2dmaybee	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
237fa9e406ahrens}
238fa9e406ahrens
239ad23a2djohansen/*
240ad23a2djohansen * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
241ad23a2djohansen * crashdump if the kernel panics.  This exists so that we will limit the amount
242ad23a2djohansen * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
243ad23a2djohansen * of kernel heap dumped to disk when the kernel panics)
244ad23a2djohansen */
245ad23a2djohansenvoid *
246ad23a2djohansenzio_data_buf_alloc(size_t size)
247ad23a2djohansen{
248ad23a2djohansen	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
249ad23a2djohansen
250f63ab3dMatthew Ahrens	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
251ad23a2djohansen
2521ab7f2dmaybee	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
253ad23a2djohansen}
254ad23a2djohansen
255fa9e406ahrensvoid
256fa9e406ahrenszio_buf_free(void *buf, size_t size)
257fa9e406ahrens{
258fa9e406ahrens	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
259fa9e406ahrens
260f63ab3dMatthew Ahrens	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
261fa9e406ahrens
262fa9e406ahrens	kmem_cache_free(zio_buf_cache[c], buf);
263fa9e406ahrens}
264fa9e406ahrens
265ad23a2djohansenvoid
266ad23a2djohansenzio_data_buf_free(void *buf, size_t size)
267ad23a2djohansen{
268ad23a2djohansen	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
269ad23a2djohansen
270f63ab3dMatthew Ahrens	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
271ad23a2djohansen
272ad23a2djohansen	kmem_cache_free(zio_data_buf_cache[c], buf);
273ad23a2djohansen}
274b3995adahrens
/* ARGSUSED */
/*
 * Callback-style wrapper around abd_free(); the size argument is unused
 * but required so the signature matches the free-callback contract.
 */
static void
zio_abd_free(void *abd, size_t size)
{
	abd_free((abd_t *)abd);
}
281eb63303Tom Caputi
282fa9e406ahrens/*
283fa9e406ahrens * ==========================================================================
284fa9e406ahrens * Push and pop I/O transform buffers
285fa9e406ahrens * ==========================================================================
286fa9e406ahrens */
287dcbf3bdGeorge Wilsonvoid
288770499eDan Kimmelzio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
2899a686fbPaul Dagnelie    zio_transform_func_t *transform)
290fa9e406ahrens{
291fa9e406ahrens	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
292fa9e406ahrens
293770499eDan Kimmel	/*
294770499eDan Kimmel	 * Ensure that anyone expecting this zio to contain a linear ABD isn't
295770499eDan Kimmel	 * going to get a nasty surprise when they try to access the data.
296770499eDan Kimmel	 */
297770499eDan Kimmel	IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
298770499eDan Kimmel
299770499eDan Kimmel	zt->zt_orig_abd = zio->io_abd;
300e14bb32Jeff Bonwick	zt->zt_orig_size = zio->io_size;
301fa9e406ahrens	zt->zt_bufsize = bufsize;
302e14bb32Jeff Bonwick	zt->zt_transform = transform;
303fa9e406ahrens
304fa9e406ahrens	zt->zt_next = zio->io_transform_stack;
305fa9e406ahrens	zio->io_transform_stack = zt;
306fa9e406ahrens
307770499eDan Kimmel	zio->io_abd = data;
308fa9e406ahrens	zio->io_size = size;
309fa9e406ahrens}
310fa9e406ahrens
311dcbf3bdGeorge Wilsonvoid
312e14bb32Jeff Bonwickzio_pop_transforms(zio_t *zio)
313fa9e406ahrens{
314e14bb32Jeff Bonwick	zio_transform_t *zt;
315e14bb32Jeff Bonwick
316e14bb32Jeff Bonwick	while ((zt = zio->io_transform_stack) != NULL) {
317e14bb32Jeff Bonwick		if (zt->zt_transform != NULL)
318e14bb32Jeff Bonwick			zt->zt_transform(zio,
319770499eDan Kimmel			    zt->zt_orig_abd, zt->zt_orig_size);
320fa9e406ahrens
321b24ab67Jeff Bonwick		if (zt->zt_bufsize != 0)
322770499eDan Kimmel			abd_free(zio->io_abd);
323fa9e406ahrens
324770499eDan Kimmel		zio->io_abd = zt->zt_orig_abd;
325e14bb32Jeff Bonwick		zio->io_size = zt->zt_orig_size;
326e14bb32Jeff Bonwick		zio->io_transform_stack = zt->zt_next;
327fa9e406ahrens
328e14bb32Jeff Bonwick		kmem_free(zt, sizeof (zio_transform_t));
329fa9e406ahrens	}
330fa9e406ahrens}
331fa9e406ahrens
332e14bb32Jeff Bonwick/*
333e14bb32Jeff Bonwick * ==========================================================================
334eb63303Tom Caputi * I/O transform callbacks for subblocks, decompression, and decryption
335e14bb32Jeff Bonwick * ==========================================================================
336e14bb32Jeff Bonwick */
337e14bb32Jeff Bonwickstatic void
338770499eDan Kimmelzio_subblock(zio_t *zio, abd_t *data, uint64_t size)
339e14bb32Jeff Bonwick{
340e14bb32Jeff Bonwick	ASSERT(zio->io_size > size);
341e14bb32Jeff Bonwick
342e14bb32Jeff Bonwick	if (zio->io_type == ZIO_TYPE_READ)
343770499eDan Kimmel		abd_copy(data, zio->io_abd, size);
344e14bb32Jeff Bonwick}
345e14bb32Jeff Bonwick
346e14bb32Jeff Bonwickstatic void
347770499eDan Kimmelzio_decompress(zio_t *zio, abd_t *data, uint64_t size)
348e14bb32Jeff Bonwick{
349770499eDan Kimmel	if (zio->io_error == 0) {
350770499eDan Kimmel		void *tmp = abd_borrow_buf(data, size);
351770499eDan Kimmel		int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
352770499eDan Kimmel		    zio->io_abd, tmp, zio->io_size, size);
353770499eDan Kimmel		abd_return_buf_copy(data, tmp, size);
354770499eDan Kimmel
355770499eDan Kimmel		if (ret != 0)
356770499eDan Kimmel			zio->io_error = SET_ERROR(EIO);
357770499eDan Kimmel	}
358e14bb32Jeff Bonwick}
359e14bb32Jeff Bonwick
/*
 * Transform callback that authenticates and/or decrypts zio->io_abd into
 * 'data'.  Three cases are handled: indirect blocks whose MACs are covered
 * by a checksum, authenticated-but-unencrypted blocks, and fully encrypted
 * blocks.  On failure, checksum/authentication errors (ECKSUM) are mapped
 * to EIO in zio->io_error — posting an ereport unless the zio is
 * speculative — while other errors are stored as-is.
 */
static void
zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
{
	int ret;
	void *tmp;
	blkptr_t *bp = zio->io_bp;
	spa_t *spa = zio->io_spa;
	uint64_t dsobj = zio->io_bookmark.zb_objset;
	uint64_t lsize = BP_GET_LSIZE(bp);
	dmu_object_type_t ot = BP_GET_TYPE(bp);
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];
	boolean_t no_crypt = B_FALSE;

	ASSERT(BP_USES_CRYPT(bp));
	ASSERT3U(size, !=, 0);

	/* A zio that already failed is passed through untouched. */
	if (zio->io_error != 0)
		return;

	/*
	 * Verify the cksum of MACs stored in an indirect bp. It will always
	 * be possible to verify this since it does not require an encryption
	 * key.
	 */
	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
		zio_crypt_decode_mac_bp(bp, mac);

		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
			/*
			 * We haven't decompressed the data yet, but
			 * zio_crypt_do_indirect_mac_checksum() requires
			 * decompressed data to be able to parse out the MACs
			 * from the indirect block. We decompress it now and
			 * throw away the result after we are finished.
			 */
			tmp = zio_buf_alloc(lsize);
			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
			    zio->io_abd, tmp, zio->io_size, lsize);
			if (ret != 0) {
				ret = SET_ERROR(EIO);
				goto error;
			}
			ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
			    tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
			zio_buf_free(tmp, lsize);
		} else {
			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
		}
		/* Indirect blocks are not encrypted; copy them through. */
		abd_copy(data, zio->io_abd, size);

		if (ret != 0)
			goto error;

		return;
	}

	/*
	 * If this is an authenticated block, just check the MAC. It would be
	 * nice to separate this out into its own flag, but for the moment
	 * enum zio_flag is out of bits.
	 */
	if (BP_IS_AUTHENTICATED(bp)) {
		if (ot == DMU_OT_OBJSET) {
			ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
			    dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
		} else {
			zio_crypt_decode_mac_bp(bp, mac);
			ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
			    zio->io_abd, size, mac);
		}
		/* Authenticated-only blocks are plaintext; copy through. */
		abd_copy(data, zio->io_abd, size);

		if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
			ret = zio_handle_decrypt_injection(spa,
			    &zio->io_bookmark, ot, ECKSUM);
		}
		if (ret != 0)
			goto error;

		return;
	}

	/* Fully encrypted block: recover salt/IV/MAC, then decrypt. */
	zio_crypt_decode_params_bp(bp, salt, iv);

	if (ot == DMU_OT_INTENT_LOG) {
		/* ZIL blocks carry their MAC in the embedded zil_chain_t. */
		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
		zio_crypt_decode_mac_zil(tmp, mac);
		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
	} else {
		zio_crypt_decode_mac_bp(bp, mac);
	}

	ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
	    BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
	    zio->io_abd, &no_crypt);
	/* no_crypt means nothing actually needed decrypting; copy through. */
	if (no_crypt)
		abd_copy(data, zio->io_abd, size);

	if (ret != 0)
		goto error;

	return;

error:
	/* assert that the key was found unless this was speculative */
	ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));

	/*
	 * If there was a decryption / authentication error return EIO as
	 * the io_error. If this was not a speculative zio, create an ereport.
	 */
	if (ret == ECKSUM) {
		zio->io_error = SET_ERROR(EIO);
		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
			spa_log_error(spa, &zio->io_bookmark);
			zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
			    spa, NULL, &zio->io_bookmark, zio, 0, 0);
		}
	} else {
		zio->io_error = ret;
	}
}
485eb63303Tom Caputi
486e14bb32Jeff Bonwick/*
487e14bb32Jeff Bonwick * ==========================================================================
488e14bb32Jeff Bonwick * I/O parent/child relationships and pipeline interlocks
489e14bb32Jeff Bonwick * ==========================================================================
490e14bb32Jeff Bonwick */
491a3f829aBill Moorezio_t *
4920f7643cGeorge Wilsonzio_walk_parents(zio_t *cio, zio_link_t **zl)
493a3f829aBill Moore{
494a3f829aBill Moore	list_t *pl = &cio->io_parent_list;
495e14bb32Jeff Bonwick
4960f7643cGeorge Wilson	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
4970f7643cGeorge Wilson	if (*zl == NULL)
498a3f829aBill Moore		return (NULL);
499a3f829aBill Moore
5000f7643cGeorge Wilson	ASSERT((*zl)->zl_child == cio);
5010f7643cGeorge Wilson	return ((*zl)->zl_parent);
502a3f829aBill Moore}
503a3f829aBill Moore
504a3f829aBill Moorezio_t *
5050f7643cGeorge Wilsonzio_walk_children(zio_t *pio, zio_link_t **zl)
506a3f829aBill Moore{
507a3f829aBill Moore	list_t *cl = &pio->io_child_list;
508a3f829aBill Moore
509a3874b8Toomas Soome	ASSERT(MUTEX_HELD(&pio->io_lock));
510a3874b8Toomas Soome
5110f7643cGeorge Wilson	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
5120f7643cGeorge Wilson	if (*zl == NULL)
513a3f829aBill Moore		return (NULL);
514a3f829aBill Moore
5150f7643cGeorge Wilson	ASSERT((*zl)->zl_parent == pio);
5160f7643cGeorge Wilson	return ((*zl)->zl_child);
517a3f829aBill Moore}
518a3f829aBill Moore
519a3f829aBill Moorezio_t *
520a3f829aBill Moorezio_unique_parent(zio_t *cio)
521a3f829aBill Moore{
5220f7643cGeorge Wilson	zio_link_t *zl = NULL;
5230f7643cGeorge Wilson	zio_t *pio = zio_walk_parents(cio, &zl);
524a3f829aBill Moore
5250f7643cGeorge Wilson	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
526a3f829aBill Moore	return (pio);
527a3f829aBill Moore}
528a3f829aBill Moore
/*
 * Link child zio 'cio' under parent 'pio': allocate a link record, insert
 * it on both zios' lists, and bump the parent's per-wait-type child counts
 * for any wait types the child has not already passed.
 */
void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	/* Lock ordering: parent before child, released in reverse order. */
	mutex_enter(&pio->io_lock);
	mutex_enter(&cio->io_lock);

	/* The parent must not already have completed. */
	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	/* Count the child only for wait types it has not yet passed. */
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&cio->io_lock);
	mutex_exit(&pio->io_lock);
}
562