/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>
#include <sys/zfeature.h>

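/*
 * zfs_pd_bytes_max caps how many bytes of prefetched data the prefetch
 * thread may have in flight ahead of the traversal thread (see
 * traverse_prefetcher()).  send_holes_without_birth_time, when set (the
 * default), makes the traversal visit holes whose birth time is zero
 * instead of skipping them via the hole_birth optimization (see the
 * block comment in traverse_visitbp()).
 */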
int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
boolean_t send_holes_without_birth_time = B_TRUE;

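/*
 * State shared between the traversal thread and the prefetch thread,
 * protected by pd_mtx.  pd_bytes_fetched tracks how far the prefetcher
 * has run ahead of the traversal; the two threads signal each other
 * through pd_cv (see traverse_visitbp() and traverse_prefetcher()).
 */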
typedef struct prefetch_data {
	kmutex_t pd_mtx;
	kcondvar_t pd_cv;
	int32_t pd_bytes_fetched;
	int pd_flags;
	boolean_t pd_cancel;
	boolean_t pd_exited;
	zbookmark_phys_t pd_resume;
} prefetch_data_t;

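/*
 * Per-traversal state shared by the functions below.  td_resume, when
 * non-NULL, points at the caller's resume bookmark; it is zeroed by
 * resume_skip_check() once the resume point has been reached.
 */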
typedef struct traverse_data {
	spa_t *td_spa;
	uint64_t td_objset;
	blkptr_t *td_rootbp;
	uint64_t td_min_txg;
	zbookmark_phys_t *td_resume;
	int td_flags;
	prefetch_data_t *td_pfd;
	boolean_t td_paused;
	uint64_t td_hole_birth_enabled_txg;
	blkptr_cb_t *td_func;
	void *td_arg;
	boolean_t td_realloc_possible;
} traverse_data_t;

static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
    uint64_t objset, uint64_t object);

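/*
 * zil_parse() callback for each ZIL block.  If the log has not been
 * claimed (claim_txg == 0), blocks born at or after the pool's minimum
 * claim txg are unstable, so returning nonzero stops the log traversal
 * at that point.
 */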
static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	traverse_data_t *td = arg;
	zbookmark_phys_t zb;

	if (BP_IS_HOLE(bp))
		return (0);

	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
		return (-1);

	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);

	return (0);
}

static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	traverse_data_t *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_phys_t zb;

		if (BP_IS_HOLE(bp))
			return (0);

		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}

static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed; plus blocks that are already stable in read-only mode.
	 */
	if (claim_txg == 0 && spa_writeable(td->td_spa))
		return;

	zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
	zil_free(zilog);
}

typedef enum resume_skip {
	RESUME_SKIP_ALL,
	RESUME_SKIP_NONE,
	RESUME_SKIP_CHILDREN
} resume_skip_t;

/*
 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
 * the block indicated by zb does not need to be visited at all. Returns
 * RESUME_SKIP_CHILDREN if we are resuming a post-order traversal and we reach
 * the resume point. This indicates that this block should be visited but not
 * its children (since they must have been visited in a previous traversal).
 * Otherwise returns RESUME_SKIP_NONE.
 */
static resume_skip_t
resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
		/*
		 * If we already visited this bp & everything below,
		 * don't bother doing it again.
		 */
		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
			return (RESUME_SKIP_ALL);

		/*
		 * If we found the block we're trying to resume from, zero
		 * the bookmark out to indicate that we have resumed.
		 */
		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
			bzero(td->td_resume, sizeof (*zb));
			if (td->td_flags & TRAVERSE_POST)
				return (RESUME_SKIP_CHILDREN);
		}
	}
	return (RESUME_SKIP_NONE);
}

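/*
 * Issue a speculative, no-wait arc_read() of a metadata block (an
 * indirect block or a dnode block) so that it is likely to be cached
 * by the time the traversal visits it.
 */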
static void
traverse_prefetch_metadata(traverse_data_t *td,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;

	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
		return;
	/*
	 * If we are in the process of resuming, don't prefetch, because
	 * some children will not be needed (and in fact may have already
	 * been freed).
	 */
	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
		return;
	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
		return;
	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
		return;

	if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
}

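/*
 * Decide whether a block is worth handing to the data prefetcher.
 * Holes and embedded block pointers have no on-disk data to read, and
 * intent log blocks are visited separately via traverse_zil().
 */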
static boolean_t
prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
{
	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
		return (B_FALSE);
	return (B_TRUE);
}

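/*
 * Visit bp and, recursively, everything below it: indirect blocks
 * descend into their children, dnode blocks descend into each of their
 * dnodes, and objset blocks descend into their special dnodes.  The
 * callback is invoked before and/or after the children according to
 * TRAVERSE_PRE/TRAVERSE_POST, and the traversal throttles itself
 * against the prefetch thread through td_pfd.
 */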
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	zbookmark_phys_t czb;
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (bp->blk_birth == 0) {
		/*
		 * Since this block has a birth time of 0 it must be one of
		 * two things: a hole created before the
		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
		 * which has always been a hole in an object.
		 *
		 * If a file is written sparsely, then the unwritten parts of
		 * the file were "always holes" -- that is, they have been
		 * holes since this object was allocated.  However, we (and
		 * our callers) cannot necessarily tell when an object was
		 * allocated.  Therefore, if it's possible that this object
		 * was freed and then its object number reused, we need to
		 * visit all the holes with birth==0.
		 *
		 * If it isn't possible that the object number was reused,
		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
		 * all the blocks we will visit as part of this traversal,
		 * then this hole must have always existed, so we can skip
		 * it.  We visit blocks born after (exclusive) td_min_txg.
		 *
		 * Note that the meta-dnode cannot be reallocated.
		 */
		if (!send_holes_without_birth_time &&
		    (!td->td_realloc_possible ||
		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
			return (0);
	} else if (bp->blk_birth <= td->td_min_txg) {
		return (0);
	}

	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		uint64_t size = BP_GET_LSIZE(bp);
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_bytes_fetched >= 0);
		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_bytes_fetched -= size;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp)) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		ASSERT(!BP_IS_PROTECTED(bp));

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;
		cbp = buf->b_data;

		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			traverse_prefetch_metadata(td, &cbp[i], &czb);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
		int i;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		/*
		 * dnode blocks might have their bonus buffers encrypted, so
		 * we must be careful to honor TRAVERSE_NO_DECRYPT
		 */
		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;
		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;
		dnode_phys_t *child_dnp = buf->b_data;

		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			prefetch_dnode_metadata(td, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			err = traverse_dnode(td, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t flags = ARC_FLAG_WAIT;

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		objset_phys_t *osp = buf->b_data;
		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		/*
		 * See the block comment above for the goal of this variable.
		 * If the maxblkid of the meta-dnode is 0, then we know that
		 * we've never had more than DNODES_PER_BLOCK objects in the
		 * dataset, which means we can't have reused any object ids.
		 */
		if (osp->os_meta_dnode.dn_maxblkid == 0)
			td->td_realloc_possible = B_FALSE;

		if (OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				prefetch_dnode_metadata(td,
				    &osp->os_projectused_dnode,
				    zb->zb_objset, DMU_PROJECTUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
			    zb->zb_objset, DMU_USERUSED_OBJECT);
		}

		err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				err = traverse_dnode(td,
				    &osp->os_projectused_dnode, zb->zb_objset,
				    DMU_PROJECTUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td,
				    &osp->os_groupused_dnode, zb->zb_objset,
				    DMU_GROUPUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td,
				    &osp->os_userused_dnode, zb->zb_objset,
				    DMU_USERUSED_OBJECT);
		}
	}

	if (buf)
		arc_buf_destroy(buf, &buf);

post:
	if (err == 0 && (td->td_flags & TRAVERSE_POST))
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);

	if (hard && (err == EIO || err == ECKSUM)) {
		/*
		 * Ignore this disk error as requested by the HARD flag,
		 * and continue traversal.
		 */
		err = 0;
	}

	/*
	 * If we are stopping here, set td_resume.
	 */
	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
		td->td_resume->zb_objset = zb->zb_objset;
		td->td_resume->zb_object = zb->zb_object;
		td->td_resume->zb_level = 0;
		/*
		 * If we have stopped on an indirect block (e.g. due to
		 * i/o error), we have not visited anything below it.
		 * Set the bookmark to the first level-0 block that we need
		 * to visit.  This way, the resuming code does not need to
		 * deal with resuming from indirect blocks.
		 *
		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
		 * to dereference it.
		 */
		td->td_resume->zb_blkid = zb->zb_blkid;
		if (zb->zb_level > 0) {
			td->td_resume->zb_blkid <<= zb->zb_level *
			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
		}
		td->td_paused = B_TRUE;
	}

	return (err);
}

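/*
 * Prefetch every indirect tree rooted in a dnode, including the spill
 * block if one is present.
 */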
static void
prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j;
	zbookmark_phys_t czb;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
	}
}

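/*
 * Visit all blocks referenced by a single dnode: each of its block
 * pointers and, if present, its spill block.  The dnode itself is
 * passed to the callback (with a ZB_DNODE_LEVEL/ZB_DNODE_BLKID
 * bookmark and bp == NULL) in pre- and/or post-order as requested.
 */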
static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j, err = 0;
	zbookmark_phys_t czb;

	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
	    object < td->td_resume->zb_object)
		return (0);

	if (td->td_flags & TRAVERSE_PRE) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
		if (err != 0)
			break;
	}

	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
	}

	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}
	return (err);
}

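/*
 * blkptr_cb_t used by the prefetch thread.  Issues no-wait prefetch
 * reads, blocking whenever more than zfs_pd_bytes_max bytes of
 * prefetched data have not yet been consumed by the traversal thread.
 */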
/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	prefetch_data_t *pfd = arg;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_PRESCIENT_PREFETCH;

	ASSERT(pfd->pd_bytes_fetched >= 0);
	if (bp == NULL)
		return (0);
	if (pfd->pd_cancel)
		return (SET_ERROR(EINTR));

	if (!prefetch_needed(pfd, bp))
		return (0);

	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    zio_flags, &aflags, zb);

	return (0);
}

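/*
 * Body of the prefetch taskq thread: re-run the same traversal with
 * traverse_prefetcher() as the callback, then signal the traversal
 * thread that the prefetcher has exited.
 */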
static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	zbookmark_phys_t czb;

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;
	td.td_pfd = NULL;
	td.td_resume = &td_main->td_pfd->pd_resume;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);

	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
}

/*
 * NB: dataset must not be changing on-disk (e.g., is a snapshot or we are
 * in syncing context).
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	traverse_data_t td;
	prefetch_data_t pd = { 0 };
	zbookmark_phys_t czb;
	int err;

	ASSERT(ds == NULL || objset == ds->ds_object);
	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));

	td.td_spa = spa;
	td.td_objset = objset;
	td.td_rootbp = rootbp;
	td.td_min_txg = txg_start;
	td.td_resume = resume;
	td.td_func = func;
	td.td_arg = arg;
	td.td_pfd = &pd;
	td.td_flags = flags;
	td.td_paused = B_FALSE;
	td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);

	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
		VERIFY(spa_feature_enabled_txg(spa,
		    SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
	} else {
		td.td_hole_birth_enabled_txg = UINT64_MAX;
	}

	pd.pd_flags = flags;
	if (resume != NULL)
		pd.pd_resume = *resume;
	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
		enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;

		if ((td.td_flags & TRAVERSE_NO_DECRYPT) &&
		    BP_IS_PROTECTED(rootbp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td.td_spa, rootbp, arc_getbuf_func,
		    &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &czb);
		if (err != 0) {
			/* clean up the mutex and cv initialized above */
			mutex_destroy(&pd.pd_mtx);
			cv_destroy(&pd.pd_cv);
			return (err);
		}

		osp = buf->b_data;
		traverse_zil(&td, &osp->os_zil_header);
		arc_buf_destroy(buf, &buf);
	}

	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
	    taskq_dispatch(system_taskq, traverse_prefetch_thread,
	    &td, TQ_NOQUEUE) == TASKQID_INVALID)
		pd.pd_exited = B_TRUE;

	err = traverse_visitbp(&td, NULL, rootbp, &czb);

	mutex_enter(&pd.pd_mtx);
	pd.pd_cancel = B_TRUE;
	cv_broadcast(&pd.pd_cv);
	while (!pd.pd_exited)
		cv_wait(&pd.pd_cv, &pd.pd_mtx);
	mutex_exit(&pd.pd_mtx);

	mutex_destroy(&pd.pd_mtx);
	cv_destroy(&pd.pd_cv);

	return (err);
}

/*
 * NB: dataset must not be changing on-disk (e.g., is a snapshot or we are
 * in syncing context).
 */
int
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
    zbookmark_phys_t *resume,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}

int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}
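
/*
 * Illustrative sketch (not part of this file's interfaces): a minimal
 * blkptr_cb_t that counts level-0 blocks via traverse_dataset().  The
 * callback name and its counter are hypothetical; only the callback
 * signature and the flags come from the interfaces above.  Note that
 * bp may be NULL when the callback is invoked for a dnode itself.
 *
 *	static int
 *	count_blocks_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 *	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 *	{
 *		uint64_t *count = arg;
 *
 *		if (bp != NULL && !BP_IS_HOLE(bp) && BP_GET_LEVEL(bp) == 0)
 *			(*count)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0;
 *	int err = traverse_dataset(ds, 0, TRAVERSE_PRE, count_blocks_cb,
 *	    &count);
 */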

int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
	    blkptr, txg_start, resume, flags, func, arg));
}

/*
 * NB: pool must not be changing on-disk (e.g., from zdb or sync context).
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
	    txg_start, NULL, flags, func, arg);
	if (err != 0)
		return (err);

	/* visit each dataset */
	for (uint64_t obj = 1; err == 0;
	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err != 0) {
			if (hard)
				continue;
			break;
		}

		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			dsl_pool_config_enter(dp, FTAG);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				if (hard)
					continue;
				break;
			}
			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err != 0)
				break;
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err);
}