1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5ea8dc4beschrock * Common Development and Distribution License (the "License").
6ea8dc4beschrock * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21fa9e406ahrens/*
223f9d6adLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
238671400Serapheim Dimitropoulos * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
24fa9e406ahrens */
25fa9e406ahrens
26fa9e406ahrens#include <sys/zfs_context.h>
27fa9e406ahrens#include <sys/dmu_objset.h>
28fa9e406ahrens#include <sys/dmu_traverse.h>
29fa9e406ahrens#include <sys/dsl_dataset.h>
30fa9e406ahrens#include <sys/dsl_dir.h>
31fa9e406ahrens#include <sys/dsl_pool.h>
32fa9e406ahrens#include <sys/dnode.h>
33fa9e406ahrens#include <sys/spa.h>
348671400Serapheim Dimitropoulos#include <sys/spa_impl.h>
35fa9e406ahrens#include <sys/zio.h>
36fa9e406ahrens#include <sys/dmu_impl.h>
370a586ceMark Shellenbaum#include <sys/sa.h>
380a586ceMark Shellenbaum#include <sys/sa_impl.h>
3988b7b0fMatthew Ahrens#include <sys/callb.h>
4043466aaMax Grossman#include <sys/zfeature.h>
4188b7b0fMatthew Ahrens
4234d7ce0George Wilsonint32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
43f76886dPaul Dagnelieboolean_t send_holes_without_birth_time = B_TRUE;
4444f92b7Chris Kirby
456e0cbcaMatthew Ahrenstypedef struct prefetch_data {
4688b7b0fMatthew Ahrens	kmutex_t pd_mtx;
4788b7b0fMatthew Ahrens	kcondvar_t pd_cv;
4834d7ce0George Wilson	int32_t pd_bytes_fetched;
4988b7b0fMatthew Ahrens	int pd_flags;
5088b7b0fMatthew Ahrens	boolean_t pd_cancel;
5188b7b0fMatthew Ahrens	boolean_t pd_exited;
529c3fd12Matthew Ahrens	zbookmark_phys_t pd_resume;
536e0cbcaMatthew Ahrens} prefetch_data_t;
5488b7b0fMatthew Ahrens
556e0cbcaMatthew Ahrenstypedef struct traverse_data {
5688b7b0fMatthew Ahrens	spa_t *td_spa;
5788b7b0fMatthew Ahrens	uint64_t td_objset;
5888b7b0fMatthew Ahrens	blkptr_t *td_rootbp;
5988b7b0fMatthew Ahrens	uint64_t td_min_txg;
607802d7bMatthew Ahrens	zbookmark_phys_t *td_resume;
6188b7b0fMatthew Ahrens	int td_flags;
626e0cbcaMatthew Ahrens	prefetch_data_t *td_pfd;
637fd05acMatthew Ahrens	boolean_t td_paused;
64f7950bfMatthew Ahrens	uint64_t td_hole_birth_enabled_txg;
6588b7b0fMatthew Ahrens	blkptr_cb_t *td_func;
6688b7b0fMatthew Ahrens	void *td_arg;
67286ef71Paul Dagnelie	boolean_t td_realloc_possible;
686e0cbcaMatthew Ahrens} traverse_data_t;
69fa9e406ahrens
706e0cbcaMatthew Ahrensstatic int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
711b912ecGeorge Wilson    uint64_t objset, uint64_t object);
72b470933Matthew Ahrensstatic void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
731b912ecGeorge Wilson    uint64_t objset, uint64_t object);
741484342Matthew Ahrens
75b24ab67Jeff Bonwickstatic int
765dabedebonwicktraverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
77ea8dc4beschrock{
786e0cbcaMatthew Ahrens	traverse_data_t *td = arg;
797802d7bMatthew Ahrens	zbookmark_phys_t zb;
80ea8dc4beschrock
8143466aaMax Grossman	if (BP_IS_HOLE(bp))
82b24ab67Jeff Bonwick		return (0);
835dabedebonwick
848671400Serapheim Dimitropoulos	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
858671400Serapheim Dimitropoulos		return (-1);
8688b7b0fMatthew Ahrens
87b24ab67Jeff Bonwick	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
88b24ab67Jeff Bonwick	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
89b24ab67Jeff Bonwick
901b912ecGeorge Wilson	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
91b24ab67Jeff Bonwick
92b24ab67Jeff Bonwick	return (0);
93ea8dc4beschrock}
94ea8dc4beschrock
95b24ab67Jeff Bonwickstatic int
965dabedebonwicktraverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
97ea8dc4beschrock{
986e0cbcaMatthew Ahrens	traverse_data_t *td = arg;
99ea8dc4beschrock
100ea8dc4beschrock	if (lrc->lrc_txtype == TX_WRITE) {
101ea8dc4beschrock		lr_write_t *lr = (lr_write_t *)lrc;
102ea8dc4beschrock		blkptr_t *bp = &lr->lr_blkptr;
1037802d7bMatthew Ahrens		zbookmark_phys_t zb;
104ea8dc4beschrock
10543466aaMax Grossman		if (BP_IS_HOLE(bp))
106b24ab67Jeff Bonwick			return (0);
1075dabedebonwick
10888b7b0fMatthew Ahrens		if (claim_txg == 0 || bp->blk_birth < claim_txg)
109b24ab67Jeff Bonwick			return (0);
110b24ab67Jeff Bonwick
1116e0cbcaMatthew Ahrens		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
1126e0cbcaMatthew Ahrens		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
11388b7b0fMatthew Ahrens
1141b912ecGeorge Wilson		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
115b24ab67Jeff Bonwick		    td->td_arg);
116ea8dc4beschrock	}
117b24ab67Jeff Bonwick	return (0);
118ea8dc4beschrock}
119ea8dc4beschrock
120ea8dc4beschrockstatic void
1216e0cbcaMatthew Ahrenstraverse_zil(traverse_data_t *td, zil_header_t *zh)
122ea8dc4beschrock{
1235dabedebonwick	uint64_t claim_txg = zh->zh_claim_txg;
124ea8dc4beschrock
1255dabedebonwick	/*
1265dabedebonwick	 * We only want to visit blocks that have been claimed but not yet
1278671400Serapheim Dimitropoulos	 * replayed; plus blocks that are already stable in read-only mode.
1285dabedebonwick	 */
1298ad4d6dJeff Bonwick	if (claim_txg == 0 && spa_writeable(td->td_spa))
1305dabedebonwick		return;
1315dabedebonwick
1328671400Serapheim Dimitropoulos	zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
13388b7b0fMatthew Ahrens	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
134eb63303Tom Caputi	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
135ea8dc4beschrock	zil_free(zilog);
136ea8dc4beschrock}
137ea8dc4beschrock
/* Verdicts produced by resume_skip_check(). */
typedef enum resume_skip {
	RESUME_SKIP_ALL = 0,		/* skip the block and all below it */
	RESUME_SKIP_NONE = 1,		/* visit the block normally */
	RESUME_SKIP_CHILDREN = 2	/* visit the block, not its children */
} resume_skip_t;
143ad135b5Christopher Siden
144ad135b5Christopher Siden/*
145ad135b5Christopher Siden * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
146ad135b5Christopher Siden * the block indicated by zb does not need to be visited at all. Returns
147ad135b5Christopher Siden * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
148ad135b5Christopher Siden * resume point. This indicates that this block should be visited but not its
149ad135b5Christopher Siden * children (since they must have been visited in a previous traversal).
150ad135b5Christopher Siden * Otherwise returns RESUME_SKIP_NONE.
151ad135b5Christopher Siden */
152ad135b5Christopher Sidenstatic resume_skip_t
153ad135b5Christopher Sidenresume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
1547802d7bMatthew Ahrens    const zbookmark_phys_t *zb)
155ad135b5Christopher Siden{
156ad135b5Christopher Siden	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
157ad135b5Christopher Siden		/*
158ad135b5Christopher Siden		 * If we already visited this bp & everything below,
159ad135b5Christopher Siden		 * don't bother doing it again.
160ad135b5Christopher Siden		 */
161a2cdcddPaul Dagnelie		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
162ad135b5Christopher Siden			return (RESUME_SKIP_ALL);
163ad135b5Christopher Siden
164ad135b5Christopher Siden		/*
165ad135b5Christopher Siden		 * If we found the block we're trying to resume from, zero
166ad135b5Christopher Siden		 * the bookmark out to indicate that we have resumed.
167ad135b5Christopher Siden		 */
168ad135b5Christopher Siden		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
169ad135b5Christopher Siden			bzero(td->td_resume, sizeof (*zb));
170ad135b5Christopher Siden			if (td->td_flags & TRAVERSE_POST)
171ad135b5Christopher Siden				return (RESUME_SKIP_CHILDREN);
172ad135b5Christopher Siden		}
173ad135b5Christopher Siden	}
174ad135b5Christopher Siden	return (RESUME_SKIP_NONE);
175ad135b5Christopher Siden}
176ad135b5Christopher Siden
177ad135b5Christopher Sidenstatic void
178b470933Matthew Ahrenstraverse_prefetch_metadata(traverse_data_t *td,
1797802d7bMatthew Ahrens    const blkptr_t *bp, const zbookmark_phys_t *zb)
180b470933Matthew Ahrens{
1817adb730George Wilson	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
182eb63303Tom Caputi	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
183b470933Matthew Ahrens
184b470933Matthew Ahrens	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
185b470933Matthew Ahrens		return;
186b470933Matthew Ahrens	/*
187b470933Matthew Ahrens	 * If we are in the process of resuming, don't prefetch, because
188b470933Matthew Ahrens	 * some children will not be needed (and in fact may have already
189b470933Matthew Ahrens	 * been freed).
190b470933Matthew Ahrens	 */
191b470933Matthew Ahrens	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
192b470933Matthew Ahrens		return;
193b470933Matthew Ahrens	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
194b470933Matthew Ahrens		return;
195b470933Matthew Ahrens	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
196b470933Matthew Ahrens		return;
197b470933Matthew Ahrens
198eb63303Tom Caputi	if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
199eb63303Tom Caputi		zio_flags |= ZIO_FLAG_RAW;
200eb63303Tom Caputi
2011b912ecGeorge Wilson	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
202eb63303Tom Caputi	    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
203b470933Matthew Ahrens}
204b470933Matthew Ahrens
20506315b7Matthew Ahrensstatic boolean_t
20606315b7Matthew Ahrensprefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
20706315b7Matthew Ahrens{
20806315b7Matthew Ahrens	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
20906315b7Matthew Ahrens	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
21006315b7Matthew Ahrens	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
21106315b7Matthew Ahrens		return (B_FALSE);
21206315b7Matthew Ahrens	return (B_TRUE);
21306315b7Matthew Ahrens}
21406315b7Matthew Ahrens
215fa9e406ahrensstatic int
2166e0cbcaMatthew Ahrenstraverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
2177802d7bMatthew Ahrens    const blkptr_t *bp, const zbookmark_phys_t *zb)
218fa9e406ahrens{
2197802d7bMatthew Ahrens	zbookmark_phys_t czb;
2207fd05acMatthew Ahrens	int err = 0;
22188b7b0fMatthew Ahrens	arc_buf_t *buf = NULL;
2226e0cbcaMatthew Ahrens	prefetch_data_t *pd = td->td_pfd;
223cd088eaVictor Latushkin	boolean_t hard = td->td_flags & TRAVERSE_HARD;
224ad135b5Christopher Siden
225ad135b5Christopher Siden	switch (resume_skip_check(td, dnp, zb)) {
226ad135b5Christopher Siden	case RESUME_SKIP_ALL:
227ad135b5Christopher Siden		return (0);
228ad135b5Christopher Siden	case RESUME_SKIP_CHILDREN:
229ad135b5Christopher Siden		goto post;
230ad135b5Christopher Siden	case RESUME_SKIP_NONE:
231ad135b5Christopher Siden		break;
232ad135b5Christopher Siden	default:
233ad135b5Christopher Siden		ASSERT(0);
234ad135b5Christopher Siden	}
235fa9e406ahrens
23643466aaMax Grossman	if (bp->blk_birth == 0) {
237f7950bfMatthew Ahrens		/*
238286ef71Paul Dagnelie		 * Since this block has a birth time of 0 it must be one of
239286ef71Paul Dagnelie		 * two things: a hole created before the
240286ef71Paul Dagnelie		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
241286ef71Paul Dagnelie		 * which has always been a hole in an object.
242286ef71Paul Dagnelie		 *
243286ef71Paul Dagnelie		 * If a file is written sparsely, then the unwritten parts of
244286ef71Paul Dagnelie		 * the file were "always holes" -- that is, they have been
245286ef71Paul Dagnelie		 * holes since this object was allocated.  However, we (and
246286ef71Paul Dagnelie		 * our callers) can not necessarily tell when an object was
247286ef71Paul Dagnelie		 * allocated.  Therefore, if it's possible that this object
248286ef71Paul Dagnelie		 * was freed and then its object number reused, we need to
249286ef71Paul Dagnelie		 * visit all the holes with birth==0.
250286ef71Paul Dagnelie		 *
251286ef71Paul Dagnelie		 * If it isn't possible that the object number was reused,
252286ef71Paul Dagnelie		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
253286ef71Paul Dagnelie		 * all the blocks we will visit as part of this traversal,
254286ef71Paul Dagnelie		 * then this hole must have always existed, so we can skip
255286ef71Paul Dagnelie		 * it.  We visit blocks born after (exclusive) td_min_txg.
256286ef71Paul Dagnelie		 *
257286ef71Paul Dagnelie		 * Note that the meta-dnode cannot be reallocated.
258f7950bfMatthew Ahrens		 */
259f76886dPaul Dagnelie		if (!send_holes_without_birth_time &&
260f76886dPaul Dagnelie		    (!td->td_realloc_possible ||
261286ef71Paul Dagnelie		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
262286ef71Paul Dagnelie		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
263f7950bfMatthew Ahrens			return (0);
26443466aaMax Grossman	} else if (bp->blk_birth <= td->td_min_txg) {
26543466aaMax Grossman		return (0);
26643466aaMax Grossman	}
26743466aaMax Grossman
26806315b7Matthew Ahrens	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
26934d7ce0George Wilson		uint64_t size = BP_GET_LSIZE(bp);
27088b7b0fMatthew Ahrens		mutex_enter(&pd->pd_mtx);
27134d7ce0George Wilson		ASSERT(pd->pd_bytes_fetched >= 0);
27234d7ce0George Wilson		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
27388b7b0fMatthew Ahrens			cv_wait(&pd->pd_cv, &pd->pd_mtx);
27434d7ce0George Wilson		pd->pd_bytes_fetched -= size;
27588b7b0fMatthew Ahrens		cv_broadcast(&pd->pd_cv);
27688b7b0fMatthew Ahrens		mutex_exit(&pd->pd_mtx);
277fa9e406ahrens	}
278fa9e406ahrens
27906315b7Matthew Ahrens	if (BP_IS_HOLE(bp)) {
28006315b7Matthew Ahrens		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
28106315b7Matthew Ahrens		if (err != 0)
28206315b7Matthew Ahrens			goto post;
28306315b7Matthew Ahrens		return (0);
28406315b7Matthew Ahrens	}
28506315b7Matthew Ahrens
28688b7b0fMatthew Ahrens	if (td->td_flags & TRAVERSE_PRE) {
2871b912ecGeorge Wilson		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
2883f9d6adLin Ling		    td->td_arg);
28999d5e17Tim Haley		if (err == TRAVERSE_VISIT_NO_CHILDREN)
29099d5e17Tim Haley			return (0);
291ad135b5Christopher Siden		if (err != 0)
292ad135b5Christopher Siden			goto post;
293fa9e406ahrens	}
294fa9e406ahrens
29588b7b0fMatthew Ahrens	if (BP_GET_LEVEL(bp) > 0) {
2967adb730George Wilson		arc_flags_t flags = ARC_FLAG_WAIT;
29788b7b0fMatthew Ahrens		int i;
29888b7b0fMatthew Ahrens		blkptr_t *cbp;
29988b7b0fMatthew Ahrens		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
30088b7b0fMatthew Ahrens
301eb63303Tom Caputi		ASSERT(!BP_IS_PROTECTED(bp));
302eb63303Tom Caputi
3031b912ecGeorge Wilson		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
30488b7b0fMatthew Ahrens		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
3053b2aab1Matthew Ahrens		if (err != 0)
3067fd05acMatthew Ahrens			goto post;
307b470933Matthew Ahrens		cbp = buf->b_data;
308b470933Matthew Ahrens
309b470933Matthew Ahrens		for (i = 0; i < epb; i++) {
310b470933Matthew Ahrens			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
311b470933Matthew Ahrens			    zb->zb_level - 1,
312b470933Matthew Ahrens			    zb->zb_blkid * epb + i);
3131b912ecGeorge Wilson			traverse_prefetch_metadata(td, &cbp[i], &czb);
314b470933Matthew Ahrens		}
31588b7b0fMatthew Ahrens
31688b7b0fMatthew Ahrens		/* recursively visitbp() blocks below this */
317b470933Matthew Ahrens		for (i = 0; i < epb; i++) {
31888b7b0fMatthew Ahrens			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
31988b7b0fMatthew Ahrens			    zb->zb_level - 1,
32088b7b0fMatthew Ahrens			    zb->zb_blkid * epb + i);
3211b912ecGeorge Wilson			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
3227fd05acMatthew Ahrens			if (err != 0)
3237fd05acMatthew Ahrens				break;
32488b7b0fMatthew Ahrens		}
32588b7b0fMatthew Ahrens	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
3267adb730George Wilson		arc_flags_t flags = ARC_FLAG_WAIT;
327eb63303Tom Caputi		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
3281484342Matthew Ahrens		int i;
32988b7b0fMatthew Ahrens		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
33088b7b0fMatthew Ahrens
331eb63303Tom Caputi		/*
332eb63303Tom Caputi		 * dnode blocks might have their bonus buffers encrypted, so
333eb63303Tom Caputi		 * we must be careful to honor TRAVERSE_NO_DECRYPT
334eb63303Tom Caputi		 */
335eb63303Tom Caputi		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
336eb63303Tom Caputi			zio_flags |= ZIO_FLAG_RAW;
3371b912ecGeorge Wilson		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
338eb63303Tom Caputi		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
3393b2aab1Matthew Ahrens		if (err != 0)
3407fd05acMatthew Ahrens			goto post;
3419c3fd12Matthew Ahrens		dnode_phys_t *child_dnp = buf->b_data;
342b470933Matthew Ahrens
34354811daToomas Soome		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
3449c3fd12Matthew Ahrens			prefetch_dnode_metadata(td, &child_dnp[i],
3459c3fd12Matthew Ahrens			    zb->zb_objset, zb->zb_blkid * epb + i);
346b470933Matthew Ahrens		}
34788b7b0fMatthew Ahrens
34888b7b0fMatthew Ahrens		/* recursively visitbp() blocks below this */
34954811daToomas Soome		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
3509c3fd12Matthew Ahrens			err = traverse_dnode(td, &child_dnp[i],
3519c3fd12Matthew Ahrens			    zb->zb_objset, zb->zb_blkid * epb + i);
3527fd05acMatthew Ahrens			if (err != 0)
3537fd05acMatthew Ahrens				break;
354fa9e406ahrens		}
35588b7b0fMatthew Ahrens	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
356eb63303Tom Caputi		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
3577adb730George Wilson		arc_flags_t flags = ARC_FLAG_WAIT;
35888b7b0fMatthew Ahrens
359eb63303Tom Caputi		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
360eb63303Tom Caputi			zio_flags |= ZIO_FLAG_RAW;
361eb63303Tom Caputi
3621b912ecGeorge Wilson		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
363eb63303Tom Caputi		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
3643b2aab1Matthew Ahrens		if (err != 0)
3657fd05acMatthew Ahrens			goto post;
36688b7b0fMatthew Ahrens
3679c3fd12Matthew Ahrens		objset_phys_t *osp = buf->b_data;
3689c3fd12Matthew Ahrens		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
369b470933Matthew Ahrens		    DMU_META_DNODE_OBJECT);
370286ef71Paul Dagnelie		/*
371286ef71Paul Dagnelie		 * See the block comment above for the goal of this variable.
372286ef71Paul Dagnelie		 * If the maxblkid of the meta-dnode is 0, then we know that
373286ef71Paul Dagnelie		 * we've never had more than DNODES_PER_BLOCK objects in the
374286ef71Paul Dagnelie		 * dataset, which means we can't have reused any object ids.
375286ef71Paul Dagnelie		 */
376286ef71Paul Dagnelie		if (osp->os_meta_dnode.dn_maxblkid == 0)
377286ef71Paul Dagnelie			td->td_realloc_possible = B_FALSE;
378286ef71Paul Dagnelie
379f67950bNasf-Fan		if (OBJSET_BUF_HAS_USERUSED(buf)) {
380f67950bNasf-Fan			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
381f67950bNasf-Fan				prefetch_dnode_metadata(td,
382f67950bNasf-Fan				    &osp->os_projectused_dnode,
383f67950bNasf-Fan				    zb->zb_objset, DMU_PROJECTUSED_OBJECT);
384b470933Matthew Ahrens			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
38548f1b90Matthew Ahrens			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
38648f1b90Matthew Ahrens			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
3871b912ecGeorge Wilson			    zb->zb_objset, DMU_USERUSED_OBJECT);
388b470933Matthew Ahrens		}
389b470933Matthew Ahrens
3909c3fd12Matthew Ahrens		err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
391b24ab67Jeff Bonwick		    DMU_META_DNODE_OBJECT);
392f67950bNasf-Fan		if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
393f67950bNasf-Fan			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
394f67950bNasf-Fan				err = traverse_dnode(td,
395f67950bNasf-Fan				    &osp->os_projectused_dnode, zb->zb_objset,
396f67950bNasf-Fan				    DMU_PROJECTUSED_OBJECT);
397f67950bNasf-Fan			if (err == 0)
398f67950bNasf-Fan				err = traverse_dnode(td,
399f67950bNasf-Fan				    &osp->os_groupused_dnode, zb->zb_objset,
400f67950bNasf-Fan				    DMU_GROUPUSED_OBJECT);
401f67950bNasf-Fan			if (err == 0)
402f67950bNasf-Fan				err = traverse_dnode(td,
403f67950bNasf-Fan				    &osp->os_userused_dnode, zb->zb_objset,
404f67950bNasf-Fan				    DMU_USERUSED_OBJECT);
40588b7b0fMatthew Ahrens		}
40688b7b0fMatthew Ahrens	}
407fa9e406ahrens
40888b7b0fMatthew Ahrens	if (buf)
409dcbf3bdGeorge Wilson		arc_buf_destroy(buf, &buf);
410fa9e406ahrens
411ad135b5Christopher Sidenpost:
4127fd05acMatthew Ahrens	if (err == 0 && (td->td_flags & TRAVERSE_POST))
4131b912ecGeorge Wilson		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
4147fd05acMatthew Ahrens
4157fd05acMatthew Ahrens	if (hard && (err == EIO || err == ECKSUM)) {
4167fd05acMatthew Ahrens		/*
4177fd05acMatthew Ahrens		 * Ignore this disk error as requested by the HARD flag,
4187fd05acMatthew Ahrens		 * and continue traversal.
4197fd05acMatthew Ahrens		 */
4207fd05acMatthew Ahrens		err = 0;
421ad135b5Christopher Siden	}
422ad135b5Christopher Siden
4237fd05acMatthew Ahrens	/*
4247fd05acMatthew Ahrens	 * If we are stopping here, set td_resume.
4257fd05acMatthew Ahrens	 */
4267fd05acMatthew Ahrens	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
4277fd05acMatthew Ahrens		td->td_resume->zb_objset = zb->zb_objset;
4287fd05acMatthew Ahrens		td->td_resume->zb_object = zb->zb_object;
4297fd05acMatthew Ahrens		td->td_resume->zb_level = 0;
4307fd05acMatthew Ahrens		/*
4317fd05acMatthew Ahrens		 * If we have stopped on an indirect block (e.g. due to
4327fd05acMatthew Ahrens		 * i/o error), we have not visited anything below it.
4337fd05acMatthew Ahrens		 * Set the bookmark to the first level-0 block that we need
4347fd05acMatthew Ahrens		 * to visit.  This way, the resuming code does not need to
4357fd05acMatthew Ahrens		 * deal with resuming from indirect blocks.
4369c3fd12Matthew Ahrens		 *
4379c3fd12Matthew Ahrens		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
4389c3fd12Matthew Ahrens		 * to dereference it.
4397fd05acMatthew Ahrens		 */
4409c3fd12Matthew Ahrens		td->td_resume->zb_blkid = zb->zb_blkid;
4419c3fd12Matthew Ahrens		if (zb->zb_level > 0) {
4429c3fd12Matthew Ahrens			td->td_resume->zb_blkid <<= zb->zb_level *
4439c3fd12Matthew Ahrens			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
4449c3fd12Matthew Ahrens		}
4457fd05acMatthew Ahrens		td->td_paused = B_TRUE;
4463f9d6adLin Ling	}
447fa9e406ahrens
4487fd05acMatthew Ahrens	return (err);
449fa9e406ahrens}
450fa9e406ahrens
451b470933Matthew Ahrensstatic void
452b470933Matthew Ahrensprefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
4531b912ecGeorge Wilson    uint64_t objset, uint64_t object)
454b470933Matthew Ahrens{
455b470933Matthew Ahrens	int j;
4567802d7bMatthew Ahrens	zbookmark_phys_t czb;
457b470933Matthew Ahrens
458b470933Matthew Ahrens	for (j = 0; j < dnp->dn_nblkptr; j++) {
459b470933Matthew Ahrens		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
4601b912ecGeorge Wilson		traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
461b470933Matthew Ahrens	}
462b470933Matthew Ahrens
463b470933Matthew Ahrens	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
464b470933Matthew Ahrens		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
46554811daToomas Soome		traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
466b470933Matthew Ahrens	}
467b470933Matthew Ahrens}
468b470933Matthew Ahrens
4691484342Matthew Ahrensstatic int
4706e0cbcaMatthew Ahrenstraverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
4711b912ecGeorge Wilson    uint64_t objset, uint64_t object)
4721484342Matthew Ahrens{
4737fd05acMatthew Ahrens	int j, err = 0;
4747802d7bMatthew Ahrens	zbookmark_phys_t czb;
4751484342Matthew Ahrens
4769c3fd12Matthew Ahrens	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
4779c3fd12Matthew Ahrens	    object < td->td_resume->zb_object)
4789c3fd12Matthew Ahrens		return (0);
4799c3fd12Matthew Ahrens
480a2cdcddPaul Dagnelie	if (td->td_flags & TRAVERSE_PRE) {
481a2cdcddPaul Dagnelie		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
482a2cdcddPaul Dagnelie		    ZB_DNODE_BLKID);
483a2cdcddPaul Dagnelie		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
484a2cdcddPaul Dagnelie		    td->td_arg);
485a2cdcddPaul Dagnelie		if (err == TRAVERSE_VISIT_NO_CHILDREN)
486a2cdcddPaul Dagnelie			return (0);
487a2cdcddPaul Dagnelie		if (err != 0)
488a2cdcddPaul Dagnelie			return (err);
489a2cdcddPaul Dagnelie	}
490a2cdcddPaul Dagnelie
4911484342Matthew Ahrens	for (j = 0; j < dnp->dn_nblkptr; j++) {
4921484342Matthew Ahrens		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
4931b912ecGeorge Wilson		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
4947fd05acMatthew Ahrens		if (err != 0)
4957fd05acMatthew Ahrens			break;
4963f9d6adLin Ling	}
4973f9d6adLin Ling
498a2cdcddPaul Dagnelie	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
499b470933Matthew Ahrens		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
50054811daToomas Soome		err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
5011484342Matthew Ahrens	}
502a2cdcddPaul Dagnelie
503a2cdcddPaul Dagnelie	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
504a2cdcddPaul Dagnelie		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
505a2cdcddPaul Dagnelie		    ZB_DNODE_BLKID);
506a2cdcddPaul Dagnelie		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
507a2cdcddPaul Dagnelie		    td->td_arg);
508a2cdcddPaul Dagnelie		if (err == TRAVERSE_VISIT_NO_CHILDREN)
509a2cdcddPaul Dagnelie			return (0);
510a2cdcddPaul Dagnelie		if (err != 0)
511a2cdcddPaul Dagnelie			return (err);
512a2cdcddPaul Dagnelie	}
5137fd05acMatthew Ahrens	return (err);
5141484342Matthew Ahrens}
5151484342Matthew Ahrens
51688b7b0fMatthew Ahrens/* ARGSUSED */
51788b7b0fMatthew Ahrensstatic int
518b24ab67Jeff Bonwicktraverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
5197802d7bMatthew Ahrens    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
520e7cbe64gw{
5216e0cbcaMatthew Ahrens	prefetch_data_t *pfd = arg;
522eb63303Tom Caputi	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
523a3874b8Toomas Soome	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
524a3874b8Toomas Soome	    ARC_FLAG_PRESCIENT_PREFETCH;
525e7cbe64gw
52634d7ce0George Wilson	ASSERT(pfd->pd_bytes_fetched >= 0);
527a2cdcddPaul Dagnelie	if (bp == NULL)
528a2cdcddPaul Dagnelie		return (0);
52988b7b0fMatthew Ahrens	if (pfd->pd_cancel)
530be6fd75Matthew Ahrens		return (SET_ERROR(EINTR));
531e7cbe64gw
53206315b7Matthew Ahrens	if (!prefetch_needed(pfd, bp))
533fa9e406ahrens		return (0);
534fa9e406ahrens
53588b7b0fMatthew Ahrens	mutex_enter(&pfd->pd_mtx);
53634d7ce0George Wilson	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
53788b7b0fMatthew Ahrens		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
53834d7ce0George Wilson	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
53988b7b0fMatthew Ahrens	cv_broadcast(&pfd->pd_cv);
54088b7b0fMatthew Ahrens	mutex_exit(&pfd->pd_mtx);
541fa9e406ahrens
542eb63303Tom Caputi	if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
543eb63303Tom Caputi		zio_flags |= ZIO_FLAG_RAW;
544eb63303Tom Caputi
5451b912ecGeorge Wilson	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
546eb63303Tom Caputi	    zio_flags, &aflags, zb);
547fa9e406ahrens
54888b7b0fMatthew Ahrens	return (0);
549fa9e406ahrens}
550fa9e406ahrens
551fa9e406ahrensstatic void
55288b7b0fMatthew Ahrenstraverse_prefetch_thread(void *arg)
553fa9e406ahrens{
5546e0cbcaMatthew Ahrens	traverse_data_t *td_main = arg;
5556e0cbcaMatthew Ahrens	traverse_data_t td = *td_main;
5567802d7bMatthew Ahrens	zbookmark_phys_t czb;
557fa9e406ahrens
55888b7b0fMatthew Ahrens	td.td_func = traverse_prefetcher;
55988b7b0fMatthew Ahrens	td.td_arg = td_main->td_pfd;
56088b7b0fMatthew Ahrens	td.td_pfd = NULL;
5619c3fd12Matthew Ahrens	td.td_resume = &td_main->td_pfd->pd_resume;
562fa9e406ahrens
563b24ab67Jeff Bonwick	SET_BOOKMARK(&czb, td.td_objset,
564b24ab67Jeff Bonwick	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
5651b912ecGeorge Wilson	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
566fa9e406ahrens
56788b7b0fMatthew Ahrens	mutex_enter(&td_main->td_pfd->pd_mtx);
56888b7b0fMatthew Ahrens	td_main->td_pfd->pd_exited = B_TRUE;
56988b7b0fMatthew Ahrens	cv_broadcast(&td_main->td_pfd->pd_cv);
57088b7b0fMatthew Ahrens	mutex_exit(&td_main->td_pfd->pd_mtx);
571fa9e406ahrens}
572fa9e406ahrens
57388b7b0fMatthew Ahrens/*
57488b7b0fMatthew Ahrens * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
57588b7b0fMatthew Ahrens * in syncing context).
57688b7b0fMatthew Ahrens */
57788b7b0fMatthew Ahrensstatic int
578ad135b5Christopher Sidentraverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
5797802d7bMatthew Ahrens    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
580ad135b5Christopher Siden    blkptr_cb_t func, void *arg)
581fa9e406ahrens{
5826e0cbcaMatthew Ahrens	traverse_data_t td;
5836e0cbcaMatthew Ahrens	prefetch_data_t pd = { 0 };
5847802d7bMatthew Ahrens	zbookmark_phys_t czb;
58588b7b0fMatthew Ahrens	int err;
586fa9e406ahrens
587ad135b5Christopher Siden	ASSERT(ds == NULL || objset == ds->ds_object);
588ad135b5Christopher Siden	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
589ad135b5Christopher Siden
59088b7b0fMatthew Ahrens	td.td_spa = spa;
591ad135b5Christopher Siden	td.td_objset = objset;
59288b7b0fMatthew Ahrens	td.td_rootbp = rootbp;
59388b7b0fMatthew Ahrens	td.td_min_txg = txg_start;
594ad135b5Christopher Siden	td.td_resume = resume;
59588b7b0fMatthew Ahrens	td.td_func = func;
59688b7b0fMatthew Ahrens	td.td_arg = arg;
59788b7b0fMatthew Ahrens	td.td_pfd = &pd;
59888b7b0fMatthew Ahrens	td.td_flags = flags;
5997fd05acMatthew Ahrens	td.td_paused = B_FALSE;
600286ef71Paul Dagnelie	td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
60188b7b0fMatthew Ahrens
602f7950bfMatthew Ahrens	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
603f7950bfMatthew Ahrens		VERIFY(spa_feature_enabled_txg(spa,
604f7950bfMatthew Ahrens		    SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
605f7950bfMatthew Ahrens	} else {
606286ef71Paul Dagnelie		td.td_hole_birth_enabled_txg = UINT64_MAX;
607f7950bfMatthew Ahrens	}
608f7950bfMatthew Ahrens
60988b7b0fMatthew Ahrens	pd.pd_flags = flags;
6109c3fd12Matthew Ahrens	if (resume != NULL)
6119c3fd12Matthew Ahrens		pd.pd_resume = *resume;
61288b7b0fMatthew Ahrens	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
61388b7b0fMatthew Ahrens	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
61488b7b0fMatthew Ahrens
615eb63303Tom Caputi	SET_BOOKMARK(&czb, td.td_objset,
616eb63303Tom Caputi	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
617eb63303Tom Caputi
6186e0cbcaMatthew Ahrens	/* See comment on ZIL traversal in dsl_scan_visitds. */
619bc9014eJustin Gibbs	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
620eb63303Tom Caputi		enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
6217adb730George Wilson		arc_flags_t flags = ARC_FLAG_WAIT;
6223b2aab1Matthew Ahrens		objset_phys_t *