zdb.c revision 5dafeea3ebd2dd77affc802bcb90f63faf01589f
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc.
27 * Copyright 2017 RackTop Systems.
28 */
29
30#include <stdio.h>
31#include <unistd.h>
32#include <stdio_ext.h>
33#include <stdlib.h>
34#include <ctype.h>
35#include <sys/zfs_context.h>
36#include <sys/spa.h>
37#include <sys/spa_impl.h>
38#include <sys/dmu.h>
39#include <sys/zap.h>
40#include <sys/fs/zfs.h>
41#include <sys/zfs_znode.h>
42#include <sys/zfs_sa.h>
43#include <sys/sa.h>
44#include <sys/sa_impl.h>
45#include <sys/vdev.h>
46#include <sys/vdev_impl.h>
47#include <sys/metaslab_impl.h>
48#include <sys/dmu_objset.h>
49#include <sys/dsl_dir.h>
50#include <sys/dsl_dataset.h>
51#include <sys/dsl_pool.h>
52#include <sys/dbuf.h>
53#include <sys/zil.h>
54#include <sys/zil_impl.h>
55#include <sys/stat.h>
56#include <sys/resource.h>
57#include <sys/dmu_traverse.h>
58#include <sys/zio_checksum.h>
59#include <sys/zio_compress.h>
60#include <sys/zfs_fuid.h>
61#include <sys/arc.h>
62#include <sys/ddt.h>
63#include <sys/zfeature.h>
64#include <sys/abd.h>
65#include <sys/blkptr.h>
66#include <zfs_comutil.h>
67#include <libcmdutils.h>
68#undef verify
69#include <libzfs.h>
70
71#include "zdb.h"
72
/*
 * Name lookups that tolerate out-of-range indices: each macro yields the
 * table entry's name when idx is below the table's element count and the
 * literal "UNKNOWN" otherwise.
 */
#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
	zio_compress_table[(idx)].ci_name : "UNKNOWN")
#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
/*
 * Object type name: taken from dmu_ot[] for idx < DMU_OT_NUMTYPES, from
 * dmu_ot_byteswap[] when DMU_OT_IS_VALID(idx) holds, else "UNKNOWN".
 */
#define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ?	\
	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ?	\
	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
/*
 * Collapse an object type to a value no larger than DMU_OT_NUMTYPES:
 * idx itself when it is below DMU_OT_NUMTYPES, DMU_OT_ZAP_OTHER for the
 * two DMU_OTN_ZAP_* types, DMU_OT_UINT64_OTHER for the two
 * DMU_OTN_UINT64_* types, and DMU_OT_NUMTYPES for anything else.
 */
#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
	(idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ?	\
	DMU_OT_ZAP_OTHER : \
	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)
85
/*
 * Variables this file shares with other translation units.  In a normal
 * build they are only declared here (extern); when `lint` is defined the
 * same names are defined in this file instead.
 * NOTE(review): presumably so lint can analyze this file on its own --
 * confirm.
 */
#ifndef lint
extern int reference_tracking_enable;
extern boolean_t zfs_recover;
extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
extern int zfs_vdev_async_read_max_active;
extern int aok;
extern boolean_t spa_load_verify_dryrun;
#else
int reference_tracking_enable;
boolean_t zfs_recover;
uint64_t zfs_arc_max, zfs_arc_meta_limit;
int zfs_vdev_async_read_max_active;
int aok;
boolean_t spa_load_verify_dryrun;
#endif
101
/* Program name; used as the prefix of usage and fatal() messages. */
static const char cmdname[] = "zdb";
/* Per-option state, indexed by option character (e.g. dump_opt['G']). */
uint8_t dump_opt[256];

/* Signature shared by the per-object dump_*() routines in this file. */
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

/*
 * Numeric operands and their count.  dump_metaslabs() reads entry 0 as a
 * vdev id and the remaining entries as metaslab numbers.
 * NOTE(review): presumably filled in from the command line -- confirm.
 */
uint64_t *zopt_object = NULL;
static unsigned zopt_objects = 0;
libzfs_handle_t *g_zfs;
/*
 * NOTE(review): presumably the limit that -I overrides (usage() describes
 * -I as the maximum number of checksumming I/Os) -- confirm against the
 * option parser.
 */
uint64_t max_inflight = 1000;

/* Defined later in the file; declared here because dump_bpobj() uses it. */
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
113
/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */

/*
 * Returns the string to be used as the $UMEM_DEBUG setting.  Declared with
 * an explicit (void) parameter list so the definition is a real prototype,
 * matching _umem_logging_init().
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}
123
/*
 * Returns the string to be used as the $UMEM_LOGGING setting.
 */
const char *
_umem_logging_init(void)
{
	static const char umem_logging[] = "fail,contents";

	return (umem_logging);
}
129
130static void
131usage(void)
132{
133	(void) fprintf(stderr,
134	    "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
135	    "[-I <inflight I/Os>]\n"
136	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
137	    "\t\t[<poolname> [<object> ...]]\n"
138	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset> "
139	    "[<object> ...]\n"
140	    "\t%s -C [-A] [-U <cache>]\n"
141	    "\t%s -l [-Aqu] <device>\n"
142	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
143	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
144	    "\t%s -O <dataset> <path>\n"
145	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
146	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
147	    "\t%s -E [-A] word0:word1:...:word15\n"
148	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
149	    "<poolname>\n\n",
150	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
151	    cmdname, cmdname);
152
153	(void) fprintf(stderr, "    Dataset name must include at least one "
154	    "separator character '/' or '@'\n");
155	(void) fprintf(stderr, "    If dataset name is specified, only that "
156	    "dataset is dumped\n");
157	(void) fprintf(stderr, "    If object numbers are specified, only "
158	    "those objects are dumped\n\n");
159	(void) fprintf(stderr, "    Options to control amount of output:\n");
160	(void) fprintf(stderr, "        -b block statistics\n");
161	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
162	    "all data) blocks\n");
163	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
164	(void) fprintf(stderr, "        -d dataset(s)\n");
165	(void) fprintf(stderr, "        -D dedup statistics\n");
166	(void) fprintf(stderr, "        -E decode and display block from an "
167	    "embedded block pointer\n");
168	(void) fprintf(stderr, "        -h pool history\n");
169	(void) fprintf(stderr, "        -i intent logs\n");
170	(void) fprintf(stderr, "        -l read label contents\n");
171	(void) fprintf(stderr, "        -k examine the checkpointed state "
172	    "of the pool\n");
173	(void) fprintf(stderr, "        -L disable leak tracking (do not "
174	    "load spacemaps)\n");
175	(void) fprintf(stderr, "        -m metaslabs\n");
176	(void) fprintf(stderr, "        -M metaslab groups\n");
177	(void) fprintf(stderr, "        -O perform object lookups by path\n");
178	(void) fprintf(stderr, "        -R read and display block from a "
179	    "device\n");
180	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
181	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
182	(void) fprintf(stderr, "        -v verbose (applies to all "
183	    "others)\n\n");
184	(void) fprintf(stderr, "    Below options are intended for use "
185	    "with other options:\n");
186	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
187	    "panic recovery (-AA) or both (-AAA)\n");
188	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
189	    "has altroot/not in a cachefile\n");
190	(void) fprintf(stderr, "        -F attempt automatic rewind within "
191	    "safe range of transaction groups\n");
192	(void) fprintf(stderr, "        -G dump zfs_dbgmsg buffer before "
193	    "exiting\n");
194	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
195	    "specify the maximum number of "
196	    "checksumming I/Os [default is 200]\n");
197	(void) fprintf(stderr, "        -o <variable>=<value> set global "
198	    "variable to an unsigned 32-bit integer value\n");
199	(void) fprintf(stderr, "        -p <path> -- use one or more with "
200	    "-e to specify path to vdev dir\n");
201	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
202	(void) fprintf(stderr, "        -q don't print label contents\n");
203	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
204	    "searching for uberblocks\n");
205	(void) fprintf(stderr, "        -u uberblock\n");
206	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
207	    "cachefile\n");
208	(void) fprintf(stderr, "        -V do verbatim import\n");
209	(void) fprintf(stderr, "        -x <dumpdir> -- "
210	    "dump all read blocks into specified directory\n");
211	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
212	    "work with dataset)\n\n");
213	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
214	    "to make only that option verbose\n");
215	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
216	exit(1);
217}
218
219static void
220dump_debug_buffer()
221{
222	if (dump_opt['G']) {
223		(void) printf("\n");
224		zfs_dbgmsg_print("zdb");
225	}
226}
227
228/*
229 * Called for usage errors that are discovered after a call to spa_open(),
230 * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
231 */
232
233static void
234fatal(const char *fmt, ...)
235{
236	va_list ap;
237
238	va_start(ap, fmt);
239	(void) fprintf(stderr, "%s: ", cmdname);
240	(void) vfprintf(stderr, fmt, ap);
241	va_end(ap);
242	(void) fprintf(stderr, "\n");
243
244	dump_debug_buffer();
245
246	exit(1);
247}
248
249/* ARGSUSED */
250static void
251dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
252{
253	nvlist_t *nv;
254	size_t nvsize = *(uint64_t *)data;
255	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
256
257	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
258
259	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
260
261	umem_free(packed, nvsize);
262
263	dump_nvlist(nv, 8);
264
265	nvlist_free(nv);
266}
267
268/* ARGSUSED */
269static void
270dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
271{
272	spa_history_phys_t *shp = data;
273
274	if (shp == NULL)
275		return;
276
277	(void) printf("\t\tpool_create_len = %llu\n",
278	    (u_longlong_t)shp->sh_pool_create_len);
279	(void) printf("\t\tphys_max_off = %llu\n",
280	    (u_longlong_t)shp->sh_phys_max_off);
281	(void) printf("\t\tbof = %llu\n",
282	    (u_longlong_t)shp->sh_bof);
283	(void) printf("\t\teof = %llu\n",
284	    (u_longlong_t)shp->sh_eof);
285	(void) printf("\t\trecords_lost = %llu\n",
286	    (u_longlong_t)shp->sh_records_lost);
287}
288
289static void
290zdb_nicenum(uint64_t num, char *buf, size_t buflen)
291{
292	if (dump_opt['P'])
293		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
294	else
295		nicenum(num, buf, sizeof (buf));
296}
297
298static const char histo_stars[] = "****************************************";
299static const uint64_t histo_width = sizeof (histo_stars) - 1;
300
301static void
302dump_histogram(const uint64_t *histo, int size, int offset)
303{
304	int i;
305	int minidx = size - 1;
306	int maxidx = 0;
307	uint64_t max = 0;
308
309	for (i = 0; i < size; i++) {
310		if (histo[i] > max)
311			max = histo[i];
312		if (histo[i] > 0 && i > maxidx)
313			maxidx = i;
314		if (histo[i] > 0 && i < minidx)
315			minidx = i;
316	}
317
318	if (max < histo_width)
319		max = histo_width;
320
321	for (i = minidx; i <= maxidx; i++) {
322		(void) printf("\t\t\t%3u: %6llu %s\n",
323		    i + offset, (u_longlong_t)histo[i],
324		    &histo_stars[(max - histo[i]) * histo_width / max]);
325	}
326}
327
328static void
329dump_zap_stats(objset_t *os, uint64_t object)
330{
331	int error;
332	zap_stats_t zs;
333
334	error = zap_get_stats(os, object, &zs);
335	if (error)
336		return;
337
338	if (zs.zs_ptrtbl_len == 0) {
339		ASSERT(zs.zs_num_blocks == 1);
340		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
341		    (u_longlong_t)zs.zs_blocksize,
342		    (u_longlong_t)zs.zs_num_entries);
343		return;
344	}
345
346	(void) printf("\tFat ZAP stats:\n");
347
348	(void) printf("\t\tPointer table:\n");
349	(void) printf("\t\t\t%llu elements\n",
350	    (u_longlong_t)zs.zs_ptrtbl_len);
351	(void) printf("\t\t\tzt_blk: %llu\n",
352	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
353	(void) printf("\t\t\tzt_numblks: %llu\n",
354	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
355	(void) printf("\t\t\tzt_shift: %llu\n",
356	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
357	(void) printf("\t\t\tzt_blks_copied: %llu\n",
358	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
359	(void) printf("\t\t\tzt_nextblk: %llu\n",
360	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
361
362	(void) printf("\t\tZAP entries: %llu\n",
363	    (u_longlong_t)zs.zs_num_entries);
364	(void) printf("\t\tLeaf blocks: %llu\n",
365	    (u_longlong_t)zs.zs_num_leafs);
366	(void) printf("\t\tTotal blocks: %llu\n",
367	    (u_longlong_t)zs.zs_num_blocks);
368	(void) printf("\t\tzap_block_type: 0x%llx\n",
369	    (u_longlong_t)zs.zs_block_type);
370	(void) printf("\t\tzap_magic: 0x%llx\n",
371	    (u_longlong_t)zs.zs_magic);
372	(void) printf("\t\tzap_salt: 0x%llx\n",
373	    (u_longlong_t)zs.zs_salt);
374
375	(void) printf("\t\tLeafs with 2^n pointers:\n");
376	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
377
378	(void) printf("\t\tBlocks with n*5 entries:\n");
379	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
380
381	(void) printf("\t\tBlocks n/10 full:\n");
382	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
383
384	(void) printf("\t\tEntries with n chunks:\n");
385	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
386
387	(void) printf("\t\tBuckets with n entries:\n");
388	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
389}
390
/*
 * Object viewer with an empty body: prints nothing for the object.
 * All arguments are unused.
 */
/*ARGSUSED*/
static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
}
396
/*
 * Object viewer that prints a fixed "UNKNOWN OBJECT TYPE" line.
 * All arguments are unused.
 */
/*ARGSUSED*/
static void
dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) printf("\tUNKNOWN OBJECT TYPE\n");
}
403
/*
 * Object viewer with an empty body: prints nothing.
 * NOTE(review): presumably registered for uint8-array object types whose
 * contents are not decoded here -- confirm against the viewer table.
 */
/*ARGSUSED*/
static void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
}
409
/*
 * Object viewer with an empty body: prints nothing.
 * NOTE(review): presumably registered for uint64-array object types whose
 * contents are not decoded here -- confirm against the viewer table.
 */
/*ARGSUSED*/
static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
}
415
416/*ARGSUSED*/
417static void
418dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
419{
420	zap_cursor_t zc;
421	zap_attribute_t attr;
422	void *prop;
423	unsigned i;
424
425	dump_zap_stats(os, object);
426	(void) printf("\n");
427
428	for (zap_cursor_init(&zc, os, object);
429	    zap_cursor_retrieve(&zc, &attr) == 0;
430	    zap_cursor_advance(&zc)) {
431		(void) printf("\t\t%s = ", attr.za_name);
432		if (attr.za_num_integers == 0) {
433			(void) printf("\n");
434			continue;
435		}
436		prop = umem_zalloc(attr.za_num_integers *
437		    attr.za_integer_length, UMEM_NOFAIL);
438		(void) zap_lookup(os, object, attr.za_name,
439		    attr.za_integer_length, attr.za_num_integers, prop);
440		if (attr.za_integer_length == 1) {
441			(void) printf("%s", (char *)prop);
442		} else {
443			for (i = 0; i < attr.za_num_integers; i++) {
444				switch (attr.za_integer_length) {
445				case 2:
446					(void) printf("%u ",
447					    ((uint16_t *)prop)[i]);
448					break;
449				case 4:
450					(void) printf("%u ",
451					    ((uint32_t *)prop)[i]);
452					break;
453				case 8:
454					(void) printf("%lld ",
455					    (u_longlong_t)((int64_t *)prop)[i]);
456					break;
457				}
458			}
459		}
460		(void) printf("\n");
461		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
462	}
463	zap_cursor_fini(&zc);
464}
465
466static void
467dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
468{
469	bpobj_phys_t *bpop = data;
470	char bytes[32], comp[32], uncomp[32];
471
472	/* make sure the output won't get truncated */
473	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
474	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
475	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
476
477	if (bpop == NULL)
478		return;
479
480	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
481	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
482	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
483
484	(void) printf("\t\tnum_blkptrs = %llu\n",
485	    (u_longlong_t)bpop->bpo_num_blkptrs);
486	(void) printf("\t\tbytes = %s\n", bytes);
487	if (size >= BPOBJ_SIZE_V1) {
488		(void) printf("\t\tcomp = %s\n", comp);
489		(void) printf("\t\tuncomp = %s\n", uncomp);
490	}
491	if (size >= sizeof (*bpop)) {
492		(void) printf("\t\tsubobjs = %llu\n",
493		    (u_longlong_t)bpop->bpo_subobjs);
494		(void) printf("\t\tnum_subobjs = %llu\n",
495		    (u_longlong_t)bpop->bpo_num_subobjs);
496	}
497
498	if (dump_opt['d'] < 5)
499		return;
500
501	for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) {
502		char blkbuf[BP_SPRINTF_LEN];
503		blkptr_t bp;
504
505		int err = dmu_read(os, object,
506		    i * sizeof (bp), sizeof (bp), &bp, 0);
507		if (err != 0) {
508			(void) printf("got error %u from dmu_read\n", err);
509			break;
510		}
511		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
512		(void) printf("\t%s\n", blkbuf);
513	}
514}
515
516/* ARGSUSED */
517static void
518dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
519{
520	dmu_object_info_t doi;
521
522	VERIFY0(dmu_object_info(os, object, &doi));
523	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
524
525	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
526	if (err != 0) {
527		(void) printf("got error %u from dmu_read\n", err);
528		kmem_free(subobjs, doi.doi_max_offset);
529		return;
530	}
531
532	int64_t last_nonzero = -1;
533	for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) {
534		if (subobjs[i] != 0)
535			last_nonzero = i;
536	}
537
538	for (int64_t i = 0; i <= last_nonzero; i++) {
539		(void) printf("\t%llu\n", (longlong_t)subobjs[i]);
540	}
541	kmem_free(subobjs, doi.doi_max_offset);
542}
543
/*
 * Object viewer for a DDT ZAP: prints only the ZAP statistics.
 * `data` and `size` are unused.
 */
/*ARGSUSED*/
static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	dump_zap_stats(os, object);
	/* contents are printed elsewhere, properly decoded */
}
551
552/*ARGSUSED*/
553static void
554dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
555{
556	zap_cursor_t zc;
557	zap_attribute_t attr;
558
559	dump_zap_stats(os, object);
560	(void) printf("\n");
561
562	for (zap_cursor_init(&zc, os, object);
563	    zap_cursor_retrieve(&zc, &attr) == 0;
564	    zap_cursor_advance(&zc)) {
565		(void) printf("\t\t%s = ", attr.za_name);
566		if (attr.za_num_integers == 0) {
567			(void) printf("\n");
568			continue;
569		}
570		(void) printf(" %llx : [%d:%d:%d]\n",
571		    (u_longlong_t)attr.za_first_integer,
572		    (int)ATTR_LENGTH(attr.za_first_integer),
573		    (int)ATTR_BSWAP(attr.za_first_integer),
574		    (int)ATTR_NUM(attr.za_first_integer));
575	}
576	zap_cursor_fini(&zc);
577}
578
579/*ARGSUSED*/
580static void
581dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
582{
583	zap_cursor_t zc;
584	zap_attribute_t attr;
585	uint16_t *layout_attrs;
586	unsigned i;
587
588	dump_zap_stats(os, object);
589	(void) printf("\n");
590
591	for (zap_cursor_init(&zc, os, object);
592	    zap_cursor_retrieve(&zc, &attr) == 0;
593	    zap_cursor_advance(&zc)) {
594		(void) printf("\t\t%s = [", attr.za_name);
595		if (attr.za_num_integers == 0) {
596			(void) printf("\n");
597			continue;
598		}
599
600		VERIFY(attr.za_integer_length == 2);
601		layout_attrs = umem_zalloc(attr.za_num_integers *
602		    attr.za_integer_length, UMEM_NOFAIL);
603
604		VERIFY(zap_lookup(os, object, attr.za_name,
605		    attr.za_integer_length,
606		    attr.za_num_integers, layout_attrs) == 0);
607
608		for (i = 0; i != attr.za_num_integers; i++)
609			(void) printf(" %d ", (int)layout_attrs[i]);
610		(void) printf("]\n");
611		umem_free(layout_attrs,
612		    attr.za_num_integers * attr.za_integer_length);
613	}
614	zap_cursor_fini(&zc);
615}
616
617/*ARGSUSED*/
618static void
619dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
620{
621	zap_cursor_t zc;
622	zap_attribute_t attr;
623	const char *typenames[] = {
624		/* 0 */ "not specified",
625		/* 1 */ "FIFO",
626		/* 2 */ "Character Device",
627		/* 3 */ "3 (invalid)",
628		/* 4 */ "Directory",
629		/* 5 */ "5 (invalid)",
630		/* 6 */ "Block Device",
631		/* 7 */ "7 (invalid)",
632		/* 8 */ "Regular File",
633		/* 9 */ "9 (invalid)",
634		/* 10 */ "Symbolic Link",
635		/* 11 */ "11 (invalid)",
636		/* 12 */ "Socket",
637		/* 13 */ "Door",
638		/* 14 */ "Event Port",
639		/* 15 */ "15 (invalid)",
640	};
641
642	dump_zap_stats(os, object);
643	(void) printf("\n");
644
645	for (zap_cursor_init(&zc, os, object);
646	    zap_cursor_retrieve(&zc, &attr) == 0;
647	    zap_cursor_advance(&zc)) {
648		(void) printf("\t\t%s = %lld (type: %s)\n",
649		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
650		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
651	}
652	zap_cursor_fini(&zc);
653}
654
655static int
656get_dtl_refcount(vdev_t *vd)
657{
658	int refcount = 0;
659
660	if (vd->vdev_ops->vdev_op_leaf) {
661		space_map_t *sm = vd->vdev_dtl_sm;
662
663		if (sm != NULL &&
664		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
665			return (1);
666		return (0);
667	}
668
669	for (unsigned c = 0; c < vd->vdev_children; c++)
670		refcount += get_dtl_refcount(vd->vdev_child[c]);
671	return (refcount);
672}
673
674static int
675get_metaslab_refcount(vdev_t *vd)
676{
677	int refcount = 0;
678
679	if (vd->vdev_top == vd) {
680		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
681			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
682
683			if (sm != NULL &&
684			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
685				refcount++;
686		}
687	}
688	for (unsigned c = 0; c < vd->vdev_children; c++)
689		refcount += get_metaslab_refcount(vd->vdev_child[c]);
690
691	return (refcount);
692}
693
694static int
695get_obsolete_refcount(vdev_t *vd)
696{
697	int refcount = 0;
698
699	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
700	if (vd->vdev_top == vd && obsolete_sm_obj != 0) {
701		dmu_object_info_t doi;
702		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
703		    obsolete_sm_obj, &doi));
704		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
705			refcount++;
706		}
707	} else {
708		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
709		ASSERT3U(obsolete_sm_obj, ==, 0);
710	}
711	for (unsigned c = 0; c < vd->vdev_children; c++) {
712		refcount += get_obsolete_refcount(vd->vdev_child[c]);
713	}
714
715	return (refcount);
716}
717
718static int
719get_prev_obsolete_spacemap_refcount(spa_t *spa)
720{
721	uint64_t prev_obj =
722	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
723	if (prev_obj != 0) {
724		dmu_object_info_t doi;
725		VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
726		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
727			return (1);
728		}
729	}
730	return (0);
731}
732
733static int
734get_checkpoint_refcount(vdev_t *vd)
735{
736	int refcount = 0;
737
738	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
739	    zap_contains(spa_meta_objset(vd->vdev_spa),
740	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
741		refcount++;
742
743	for (uint64_t c = 0; c < vd->vdev_children; c++)
744		refcount += get_checkpoint_refcount(vd->vdev_child[c]);
745
746	return (refcount);
747}
748
749static int
750verify_spacemap_refcounts(spa_t *spa)
751{
752	uint64_t expected_refcount = 0;
753	uint64_t actual_refcount;
754
755	(void) feature_get_refcount(spa,
756	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
757	    &expected_refcount);
758	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
759	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
760	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
761	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
762	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
763
764	if (expected_refcount != actual_refcount) {
765		(void) printf("space map refcount mismatch: expected %lld != "
766		    "actual %lld\n",
767		    (longlong_t)expected_refcount,
768		    (longlong_t)actual_refcount);
769		return (2);
770	}
771	return (0);
772}
773
774static void
775dump_spacemap(objset_t *os, space_map_t *sm)
776{
777	uint64_t alloc, offset, entry;
778	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
779	    "INVALID", "INVALID", "INVALID", "INVALID" };
780
781	if (sm == NULL)
782		return;
783
784	(void) printf("space map object %llu:\n",
785	    (longlong_t)sm->sm_phys->smp_object);
786	(void) printf("  smp_objsize = 0x%llx\n",
787	    (longlong_t)sm->sm_phys->smp_objsize);
788	(void) printf("  smp_alloc = 0x%llx\n",
789	    (longlong_t)sm->sm_phys->smp_alloc);
790
791	/*
792	 * Print out the freelist entries in both encoded and decoded form.
793	 */
794	alloc = 0;
795	for (offset = 0; offset < space_map_length(sm);
796	    offset += sizeof (entry)) {
797		uint8_t mapshift = sm->sm_shift;
798
799		VERIFY0(dmu_read(os, space_map_object(sm), offset,
800		    sizeof (entry), &entry, DMU_READ_PREFETCH));
801		if (SM_DEBUG_DECODE(entry)) {
802
803			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
804			    (u_longlong_t)(offset / sizeof (entry)),
805			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
806			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
807			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
808		} else {
809			(void) printf("\t    [%6llu]    %c  range:"
810			    " %010llx-%010llx  size: %06llx\n",
811			    (u_longlong_t)(offset / sizeof (entry)),
812			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
813			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
814			    mapshift) + sm->sm_start),
815			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
816			    mapshift) + sm->sm_start +
817			    (SM_RUN_DECODE(entry) << mapshift)),
818			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
819			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
820				alloc += SM_RUN_DECODE(entry) << mapshift;
821			else
822				alloc -= SM_RUN_DECODE(entry) << mapshift;
823		}
824	}
825	if (alloc != space_map_allocated(sm)) {
826		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
827		    "with space map summary (%llu)\n",
828		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
829	}
830}
831
832static void
833dump_metaslab_stats(metaslab_t *msp)
834{
835	char maxbuf[32];
836	range_tree_t *rt = msp->ms_allocatable;
837	avl_tree_t *t = &msp->ms_allocatable_by_size;
838	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
839
840	/* max sure nicenum has enough space */
841	CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
842
843	zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf));
844
845	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
846	    "segments", avl_numnodes(t), "maxsize", maxbuf,
847	    "freepct", free_pct);
848	(void) printf("\tIn-memory histogram:\n");
849	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
850}
851
/*
 * Print one metaslab.  A summary line (id, start offset, space map object,
 * free space) is always printed; further sections depend on verbosity:
 *   - dump_opt['m'] > 2 without -L: in-memory statistics
 *   - dump_opt['m'] > 1 with a space map and the spacemap-histogram feature
 *     active: the on-disk histogram
 *   - dump_opt['d'] > 5 or dump_opt['m'] > 3: the full space map
 */
static void
dump_metaslab(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	/*
	 * NOTE(review): unlike the nicenum buffers in dump_bpobj() and
	 * dump_metaslab_stats(), this one has no CTASSERT against
	 * NN_NUMBUF_SZ.
	 */
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	    sizeof (freebuf));

	(void) printf(
	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	    (u_longlong_t)space_map_object(sm), freebuf);

	if (dump_opt['m'] > 2 && !dump_opt['L']) {
		/* Load (if needed), report and unload, all under ms_lock. */
		mutex_enter(&msp->ms_lock);
		metaslab_load_wait(msp);
		if (!msp->ms_loaded) {
			VERIFY0(metaslab_load(msp));
			range_tree_stat_verify(msp->ms_allocatable);
		}
		dump_metaslab_stats(msp);
		/* Unloaded unconditionally, even if it was already loaded. */
		metaslab_unload(msp);
		mutex_exit(&msp->ms_lock);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		/*
		 * The space map histogram represents free space in chunks
		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
		 */
		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
		    (u_longlong_t)msp->ms_fragmentation);
		dump_histogram(sm->sm_phys->smp_histogram,
		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
		ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));

		/* dump_spacemap() itself ignores a NULL space map. */
		dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
	}
}
898
899static void
900print_vdev_metaslab_header(vdev_t *vd)
901{
902	(void) printf("\tvdev %10llu\n\t%-10s%5llu   %-19s   %-15s   %-10s\n",
903	    (u_longlong_t)vd->vdev_id,
904	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
905	    "offset", "spacemap", "free");
906	(void) printf("\t%15s   %19s   %15s   %10s\n",
907	    "---------------", "-------------------",
908	    "---------------", "-------------");
909}
910
911static void
912dump_metaslab_groups(spa_t *spa)
913{
914	vdev_t *rvd = spa->spa_root_vdev;
915	metaslab_class_t *mc = spa_normal_class(spa);
916	uint64_t fragmentation;
917
918	metaslab_class_histogram_verify(mc);
919
920	for (unsigned c = 0; c < rvd->vdev_children; c++) {
921		vdev_t *tvd = rvd->vdev_child[c];
922		metaslab_group_t *mg = tvd->vdev_mg;
923
924		if (mg->mg_class != mc)
925			continue;
926
927		metaslab_group_histogram_verify(mg);
928		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
929
930		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
931		    "fragmentation",
932		    (u_longlong_t)tvd->vdev_id,
933		    (u_longlong_t)tvd->vdev_ms_count);
934		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
935			(void) printf("%3s\n", "-");
936		} else {
937			(void) printf("%3llu%%\n",
938			    (u_longlong_t)mg->mg_fragmentation);
939		}
940		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
941	}
942
943	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
944	fragmentation = metaslab_class_fragmentation(mc);
945	if (fragmentation == ZFS_FRAG_INVALID)
946		(void) printf("\t%3s\n", "-");
947	else
948		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
949	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
950}
951
/*
 * Print the indirect-vdev state of `vd`: its births object and entries and
 * a summary of its mapping object.  At high verbosity (dump_opt['d'] > 5 or
 * dump_opt['m'] > 3) every mapping entry is printed together with its
 * obsolete count, followed by the obsolete space map if the vdev has one.
 * Prints nothing for a vdev without an indirect mapping.
 */
static void
print_vdev_indirect(vdev_t *vd)
{
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	vdev_indirect_births_t *vib = vd->vdev_indirect_births;

	/* No mapping: asserted to imply no births object either. */
	if (vim == NULL) {
		ASSERT3P(vib, ==, NULL);
		return;
	}

	/* The in-core objects must agree with the vdev's indirect config. */
	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
	    vic->vic_mapping_object);
	ASSERT3U(vdev_indirect_births_object(vib), ==,
	    vic->vic_births_object);

	(void) printf("indirect births obj %llu:\n",
	    (longlong_t)vic->vic_births_object);
	(void) printf("    vib_count = %llu\n",
	    (longlong_t)vdev_indirect_births_count(vib));
	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
		vdev_indirect_birth_entry_phys_t *cur_vibe =
		    &vib->vib_entries[i];
		(void) printf("\toffset %llx -> txg %llu\n",
		    (longlong_t)cur_vibe->vibe_offset,
		    (longlong_t)cur_vibe->vibe_phys_birth_txg);
	}
	(void) printf("\n");

	(void) printf("indirect mapping obj %llu:\n",
	    (longlong_t)vic->vic_mapping_object);
	(void) printf("    vim_max_offset = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
	(void) printf("    vim_bytes_mapped = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
	(void) printf("    vim_count = %llu\n",
	    (longlong_t)vdev_indirect_mapping_num_entries(vim));

	/* Per-entry detail is only printed at high verbosity. */
	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
		return;

	/*
	 * NOTE(review): `counts` is never released in this function.  If
	 * vdev_indirect_mapping_load_obsolete_counts() returns an allocation
	 * owned by the caller, this is a leak -- confirm and free it.
	 */
	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);

	/* Each row: <src vdev:src offset:asize> -> <dst vdev:offset:asize> */
	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[i];
		(void) printf("\t<%llx:%llx:%llx> -> "
		    "<%llx:%llx:%llx> (%x obsolete)\n",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    counts[i]);
	}
	(void) printf("\n");

	uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
	if (obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		(void) printf("obsolete space map object %llu:\n",
		    (u_longlong_t)obsolete_sm_object);
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
		    obsolete_sm_object);
		dump_spacemap(mos, vd->vdev_obsolete_sm);
		(void) printf("\n");
	}
}
1023
1024static void
1025dump_metaslabs(spa_t *spa)
1026{
1027	vdev_t *vd, *rvd = spa->spa_root_vdev;
1028	uint64_t m, c = 0, children = rvd->vdev_children;
1029
1030	(void) printf("\nMetaslabs:\n");
1031
1032	if (!dump_opt['d'] && zopt_objects > 0) {
1033		c = zopt_object[0];
1034
1035		if (c >= children)
1036			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
1037
1038		if (zopt_objects > 1) {
1039			vd = rvd->vdev_child[c];
1040			print_vdev_metaslab_header(vd);
1041
1042			for (m = 1; m < zopt_objects; m++) {
1043				if (zopt_object[m] < vd->vdev_ms_count)
1044					dump_metaslab(
1045					    vd->vdev_ms[zopt_object[m]]);
1046				else
1047					(void) fprintf(stderr, "bad metaslab "
1048					    "number %llu\n",
1049					    (u_longlong_t)zopt_object[m]);
1050			}
1051			(void) printf("\n");
1052			return;
1053		}
1054		children = c + 1;
1055	}
1056	for (; c < children; c++) {
1057		vd = rvd->vdev_child[c];
1058		print_vdev_metaslab_header(vd);
1059
1060		print_vdev_indirect(vd);
1061
1062		for (m = 0; m < vd->vdev_ms_count; m++)
1063			dump_metaslab(vd->vdev_ms[m]);
1064		(void) printf("\n");
1065	}
1066}
1067
1068static void
1069dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
1070{
1071	const ddt_phys_t *ddp = dde->dde_phys;
1072	const ddt_key_t *ddk = &dde->dde_key;
1073	const char *types[4] = { "ditto", "single", "double", "triple" };
1074	char blkbuf[BP_SPRINTF_LEN];
1075	blkptr_t blk;
1076
1077	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1078		if (ddp->ddp_phys_birth == 0)
1079			continue;
1080		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
1081		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
1082		(void) printf("index %llx refcnt %llu %s %s\n",
1083		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
1084		    types[p], blkbuf);
1085	}
1086}
1087
1088static void
1089dump_dedup_ratio(const ddt_stat_t *dds)
1090{
1091	double rL, rP, rD, D, dedup, compress, copies;
1092
1093	if (dds->dds_blocks == 0)
1094		return;
1095
1096	rL = (double)dds->dds_ref_lsize;
1097	rP = (double)dds->dds_ref_psize;
1098	rD = (double)dds->dds_ref_dsize;
1099	D = (double)dds->dds_dsize;
1100
1101	dedup = rD / D;
1102	compress = rL / rP;
1103	copies = rD / rP;
1104
1105	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
1106	    "dedup * compress / copies = %.2f\n\n",
1107	    dedup, compress, copies, dedup * compress / copies);
1108}
1109
1110static void
1111dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
1112{
1113	char name[DDT_NAMELEN];
1114	ddt_entry_t dde;
1115	uint64_t walk = 0;
1116	dmu_object_info_t doi;
1117	uint64_t count, dspace, mspace;
1118	int error;
1119
1120	error = ddt_object_info(ddt, type, class, &doi);
1121
1122	if (error == ENOENT)
1123		return;
1124	ASSERT(error == 0);
1125
1126	if ((count = ddt_object_count(ddt, type, class)) == 0)
1127		return;
1128
1129	dspace = doi.doi_physical_blocks_512 << 9;
1130	mspace = doi.doi_fill_count * doi.doi_data_block_size;
1131
1132	ddt_object_name(ddt, type, class, name);
1133
1134	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
1135	    name,
1136	    (u_longlong_t)count,
1137	    (u_longlong_t)(dspace / count),
1138	    (u_longlong_t)(mspace / count));
1139
1140	if (dump_opt['D'] < 3)
1141		return;
1142
1143	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
1144
1145	if (dump_opt['D'] < 4)
1146		return;
1147
1148	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
1149		return;
1150
1151	(void) printf("%s contents:\n\n", name);
1152
1153	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
1154		dump_dde(ddt, &dde, walk);
1155
1156	ASSERT(error == ENOENT);
1157
1158	(void) printf("\n");
1159}
1160
1161static void
1162dump_all_ddts(spa_t *spa)
1163{
1164	ddt_histogram_t ddh_total;
1165	ddt_stat_t dds_total;
1166
1167	bzero(&ddh_total, sizeof (ddh_total));
1168	bzero(&dds_total, sizeof (dds_total));
1169
1170	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1171		ddt_t *ddt = spa->spa_ddt[c];
1172		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1173			for (enum ddt_class class = 0; class < DDT_CLASSES;
1174			    class++) {
1175				dump_ddt(ddt, type, class);
1176			}
1177		}
1178	}
1179
1180	ddt_get_dedup_stats(spa, &dds_total);
1181
1182	if (dds_total.dds_blocks == 0) {
1183		(void) printf("All DDTs are empty\n");
1184		return;
1185	}
1186
1187	(void) printf("\n");
1188
1189	if (dump_opt['D'] > 1) {
1190		(void) printf("DDT histogram (aggregated over all DDTs):\n");
1191		ddt_get_dedup_histogram(spa, &ddh_total);
1192		zpool_dump_ddt(&dds_total, &ddh_total);
1193	}
1194
1195	dump_dedup_ratio(&dds_total);
1196}
1197
1198static void
1199dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
1200{
1201	char *prefix = arg;
1202
1203	(void) printf("%s [%llu,%llu) length %llu\n",
1204	    prefix,
1205	    (u_longlong_t)start,
1206	    (u_longlong_t)(start + size),
1207	    (u_longlong_t)(size));
1208}
1209
1210static void
1211dump_dtl(vdev_t *vd, int indent)
1212{
1213	spa_t *spa = vd->vdev_spa;
1214	boolean_t required;
1215	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
1216		"outage" };
1217	char prefix[256];
1218
1219	spa_vdev_state_enter(spa, SCL_NONE);
1220	required = vdev_dtl_required(vd);
1221	(void) spa_vdev_state_exit(spa, NULL, 0);
1222
1223	if (indent == 0)
1224		(void) printf("\nDirty time logs:\n\n");
1225
1226	(void) printf("\t%*s%s [%s]\n", indent, "",
1227	    vd->vdev_path ? vd->vdev_path :
1228	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
1229	    required ? "DTL-required" : "DTL-expendable");
1230
1231	for (int t = 0; t < DTL_TYPES; t++) {
1232		range_tree_t *rt = vd->vdev_dtl[t];
1233		if (range_tree_space(rt) == 0)
1234			continue;
1235		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
1236		    indent + 2, "", name[t]);
1237		range_tree_walk(rt, dump_dtl_seg, prefix);
1238		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
1239			dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
1240	}
1241
1242	for (unsigned c = 0; c < vd->vdev_children; c++)
1243		dump_dtl(vd->vdev_child[c], indent + 4);
1244}
1245
1246static void
1247dump_history(spa_t *spa)
1248{
1249	nvlist_t **events = NULL;
1250	uint64_t resid, len, off = 0;
1251	uint_t num = 0;
1252	int error;
1253	time_t tsec;
1254	struct tm t;
1255	char tbuf[30];
1256	char internalstr[MAXPATHLEN];
1257
1258	char *buf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
1259	do {
1260		len = SPA_MAXBLOCKSIZE;
1261
1262		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
1263			(void) fprintf(stderr, "Unable to read history: "
1264			    "error %d\n", error);
1265			umem_free(buf, SPA_MAXBLOCKSIZE);
1266			return;
1267		}
1268
1269		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
1270			break;
1271
1272		off -= resid;
1273	} while (len != 0);
1274	umem_free(buf, SPA_MAXBLOCKSIZE);
1275
1276	(void) printf("\nHistory:\n");
1277	for (unsigned i = 0; i < num; i++) {
1278		uint64_t time, txg, ievent;
1279		char *cmd, *intstr;
1280		boolean_t printed = B_FALSE;
1281
1282		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
1283		    &time) != 0)
1284			goto next;
1285		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
1286		    &cmd) != 0) {
1287			if (nvlist_lookup_uint64(events[i],
1288			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
1289				goto next;
1290			verify(nvlist_lookup_uint64(events[i],
1291			    ZPOOL_HIST_TXG, &txg) == 0);
1292			verify(nvlist_lookup_string(events[i],
1293			    ZPOOL_HIST_INT_STR, &intstr) == 0);
1294			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
1295				goto next;
1296
1297			(void) snprintf(internalstr,
1298			    sizeof (internalstr),
1299			    "[internal %s txg:%ju] %s",
1300			    zfs_history_event_names[ievent], (uintmax_t)txg,
1301			    intstr);
1302			cmd = internalstr;
1303		}
1304		tsec = time;
1305		(void) localtime_r(&tsec, &t);
1306		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
1307		(void) printf("%s %s\n", tbuf, cmd);
1308		printed = B_TRUE;
1309
1310next:
1311		if (dump_opt['h'] > 1) {
1312			if (!printed)
1313				(void) printf("unrecognized record:\n");
1314			dump_nvlist(events[i], 2);
1315		}
1316	}
1317}
1318
1319/*ARGSUSED*/
1320static void
1321dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
1322{
1323}
1324
1325static uint64_t
1326blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
1327    const zbookmark_phys_t *zb)
1328{
1329	if (dnp == NULL) {
1330		ASSERT(zb->zb_level < 0);
1331		if (zb->zb_object == 0)
1332			return (zb->zb_blkid);
1333		return (zb->zb_blkid * BP_GET_LSIZE(bp));
1334	}
1335
1336	ASSERT(zb->zb_level >= 0);
1337
1338	return ((zb->zb_blkid <<
1339	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
1340	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
1341}
1342
1343static void
1344snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
1345{
1346	const dva_t *dva = bp->blk_dva;
1347	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
1348
1349	if (dump_opt['b'] >= 6) {
1350		snprintf_blkptr(blkbuf, buflen, bp);
1351		return;
1352	}
1353
1354	if (BP_IS_EMBEDDED(bp)) {
1355		(void) sprintf(blkbuf,
1356		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
1357		    (int)BPE_GET_ETYPE(bp),
1358		    (u_longlong_t)BPE_GET_LSIZE(bp),
1359		    (u_longlong_t)BPE_GET_PSIZE(bp),
1360		    (u_longlong_t)bp->blk_birth);
1361		return;
1362	}
1363
1364	blkbuf[0] = '\0';
1365	for (int i = 0; i < ndvas; i++)
1366		(void) snprintf(blkbuf + strlen(blkbuf),
1367		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
1368		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
1369		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
1370		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
1371
1372	if (BP_IS_HOLE(bp)) {
1373		(void) snprintf(blkbuf + strlen(blkbuf),
1374		    buflen - strlen(blkbuf),
1375		    "%llxL B=%llu",
1376		    (u_longlong_t)BP_GET_LSIZE(bp),
1377		    (u_longlong_t)bp->blk_birth);
1378	} else {
1379		(void) snprintf(blkbuf + strlen(blkbuf),
1380		    buflen - strlen(blkbuf),
1381		    "%llxL/%llxP F=%llu B=%llu/%llu",
1382		    (u_longlong_t)BP_GET_LSIZE(bp),
1383		    (u_longlong_t)BP_GET_PSIZE(bp),
1384		    (u_longlong_t)BP_GET_FILL(bp),
1385		    (u_longlong_t)bp->blk_birth,
1386		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
1387	}
1388}
1389
1390static void
1391print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
1392    const dnode_phys_t *dnp)
1393{
1394	char blkbuf[BP_SPRINTF_LEN];
1395	int l;
1396
1397	if (!BP_IS_EMBEDDED(bp)) {
1398		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
1399		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
1400	}
1401
1402	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
1403
1404	ASSERT(zb->zb_level >= 0);
1405
1406	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
1407		if (l == zb->zb_level) {
1408			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
1409		} else {
1410			(void) printf(" ");
1411		}
1412	}
1413
1414	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
1415	(void) printf("%s\n", blkbuf);
1416}
1417
1418static int
1419visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
1420    blkptr_t *bp, const zbookmark_phys_t *zb)
1421{
1422	int err = 0;
1423
1424	if (bp->blk_birth == 0)
1425		return (0);
1426
1427	print_indirect(bp, zb, dnp);
1428
1429	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
1430		arc_flags_t flags = ARC_FLAG_WAIT;
1431		int i;
1432		blkptr_t *cbp;
1433		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
1434		arc_buf_t *buf;
1435		uint64_t fill = 0;
1436
1437		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
1438		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
1439		if (err)
1440			return (err);
1441		ASSERT(buf->b_data);
1442
1443		/* recursively visit blocks below this */
1444		cbp = buf->b_data;
1445		for (i = 0; i < epb; i++, cbp++) {
1446			zbookmark_phys_t czb;
1447
1448			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
1449			    zb->zb_level - 1,
1450			    zb->zb_blkid * epb + i);
1451			err = visit_indirect(spa, dnp, cbp, &czb);
1452			if (err)
1453				break;
1454			fill += BP_GET_FILL(cbp);
1455		}
1456		if (!err)
1457			ASSERT3U(fill, ==, BP_GET_FILL(bp));
1458		arc_buf_destroy(buf, &buf);
1459	}
1460
1461	return (err);
1462}
1463
1464/*ARGSUSED*/
1465static void
1466dump_indirect(dnode_t *dn)
1467{
1468	dnode_phys_t *dnp = dn->dn_phys;
1469	int j;
1470	zbookmark_phys_t czb;
1471
1472	(void) printf("Indirect blocks:\n");
1473
1474	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
1475	    dn->dn_object, dnp->dn_nlevels - 1, 0);
1476	for (j = 0; j < dnp->dn_nblkptr; j++) {
1477		czb.zb_blkid = j;
1478		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
1479		    &dnp->dn_blkptr[j], &czb);
1480	}
1481
1482	(void) printf("\n");
1483}
1484
1485/*ARGSUSED*/
1486static void
1487dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
1488{
1489	dsl_dir_phys_t *dd = data;
1490	time_t crtime;
1491	char nice[32];
1492
1493	/* make sure nicenum has enough space */
1494	CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);
1495
1496	if (dd == NULL)
1497		return;
1498
1499	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
1500
1501	crtime = dd->dd_creation_time;
1502	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
1503	(void) printf("\t\thead_dataset_obj = %llu\n",
1504	    (u_longlong_t)dd->dd_head_dataset_obj);
1505	(void) printf("\t\tparent_dir_obj = %llu\n",
1506	    (u_longlong_t)dd->dd_parent_obj);
1507	(void) printf("\t\torigin_obj = %llu\n",
1508	    (u_longlong_t)dd->dd_origin_obj);
1509	(void) printf("\t\tchild_dir_zapobj = %llu\n",
1510	    (u_longlong_t)dd->dd_child_dir_zapobj);
1511	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
1512	(void) printf("\t\tused_bytes = %s\n", nice);
1513	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
1514	(void) printf("\t\tcompressed_bytes = %s\n", nice);
1515	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
1516	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
1517	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
1518	(void) printf("\t\tquota = %s\n", nice);
1519	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
1520	(void) printf("\t\treserved = %s\n", nice);
1521	(void) printf("\t\tprops_zapobj = %llu\n",
1522	    (u_longlong_t)dd->dd_props_zapobj);
1523	(void) printf("\t\tdeleg_zapobj = %llu\n",
1524	    (u_longlong_t)dd->dd_deleg_zapobj);
1525	(void) printf("\t\tflags = %llx\n",
1526	    (u_longlong_t)dd->dd_flags);
1527
1528#define	DO(which) \
1529	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
1530	    sizeof (nice)); \
1531	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
1532	DO(HEAD);
1533	DO(SNAP);
1534	DO(CHILD);
1535	DO(CHILD_RSRV);
1536	DO(REFRSRV);
1537#undef DO
1538}
1539
1540/*ARGSUSED*/
1541static void
1542dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
1543{
1544	dsl_dataset_phys_t *ds = data;
1545	time_t crtime;
1546	char used[32], compressed[32], uncompressed[32], unique[32];
1547	char blkbuf[BP_SPRINTF_LEN];
1548
1549	/* make sure nicenum has enough space */
1550	CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
1551	CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
1552	CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
1553	CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);
1554
1555	if (ds == NULL)
1556		return;
1557
1558	ASSERT(size == sizeof (*ds));
1559	crtime = ds->ds_creation_time;
1560	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
1561	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
1562	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
1563	    sizeof (uncompressed));
1564	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
1565	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
1566
1567	(void) printf("\t\tdir_obj = %llu\n",
1568	    (u_longlong_t)ds->ds_dir_obj);
1569	(void) printf("\t\tprev_snap_obj = %llu\n",
1570	    (u_longlong_t)ds->ds_prev_snap_obj);
1571	(void) printf("\t\tprev_snap_txg = %llu\n",
1572	    (u_longlong_t)ds->ds_prev_snap_txg);
1573	(void) printf("\t\tnext_snap_obj = %llu\n",
1574	    (u_longlong_t)ds->ds_next_snap_obj);
1575	(void) printf("\t\tsnapnames_zapobj = %llu\n",
1576	    (u_longlong_t)ds->ds_snapnames_zapobj);
1577	(void) printf("\t\tnum_children = %llu\n",
1578	    (u_longlong_t)ds->ds_num_children);
1579	(void) printf("\t\tuserrefs_obj = %llu\n",
1580	    (u_longlong_t)ds->ds_userrefs_obj);
1581	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
1582	(void) printf("\t\tcreation_txg = %llu\n",
1583	    (u_longlong_t)ds->ds_creation_txg);
1584	(void) printf("\t\tdeadlist_obj = %llu\n",
1585	    (u_longlong_t)ds->ds_deadlist_obj);
1586	(void) printf("\t\tused_bytes = %s\n", used);
1587	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
1588	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
1589	(void) printf("\t\tunique = %s\n", unique);
1590	(void) printf("\t\tfsid_guid = %llu\n",
1591	    (u_longlong_t)ds->ds_fsid_guid);
1592	(void) printf("\t\tguid = %llu\n",
1593	    (u_longlong_t)ds->ds_guid);
1594	(void) printf("\t\tflags = %llx\n",
1595	    (u_longlong_t)ds->ds_flags);
1596	(void) printf("\t\tnext_clones_obj = %llu\n",
1597	    (u_longlong_t)ds->ds_next_clones_obj);
1598	(void) printf("\t\tprops_obj = %llu\n",
1599	    (u_longlong_t)ds->ds_props_obj);
1600	(void) printf("\t\tbp = %s\n", blkbuf);
1601}
1602
1603/* ARGSUSED */
1604static int
1605dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1606{
1607	char blkbuf[BP_SPRINTF_LEN];
1608
1609	if (bp->blk_birth != 0) {
1610		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
1611		(void) printf("\t%s\n", blkbuf);
1612	}
1613	return (0);
1614}
1615
1616static void
1617dump_bptree(objset_t *os, uint64_t obj, const char *name)
1618{
1619	char bytes[32];
1620	bptree_phys_t *bt;
1621	dmu_buf_t *db;
1622
1623	/* make sure nicenum has enough space */
1624	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
1625
1626	if (dump_opt['d'] < 3)
1627		return;
1628
1629	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
1630	bt = db->db_data;
1631	zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
1632	(void) printf("\n    %s: %llu datasets, %s\n",
1633	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
1634	dmu_buf_rele(db, FTAG);
1635
1636	if (dump_opt['d'] < 5)
1637		return;
1638
1639	(void) printf("\n");
1640
1641	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
1642}
1643
1644/* ARGSUSED */
1645static int
1646dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1647{
1648	char blkbuf[BP_SPRINTF_LEN];
1649
1650	ASSERT(bp->blk_birth != 0);
1651	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
1652	(void) printf("\t%s\n", blkbuf);
1653	return (0);
1654}
1655
1656static void
1657dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
1658{
1659	char bytes[32];
1660	char comp[32];
1661	char uncomp[32];
1662
1663	/* make sure nicenum has enough space */
1664	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
1665	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
1666	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
1667
1668	if (dump_opt['d'] < 3)
1669		return;
1670
1671	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
1672	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
1673		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
1674		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
1675		(void) printf("    %*s: object %llu, %llu local blkptrs, "
1676		    "%llu subobjs in object %llu, %s (%s/%s comp)\n",
1677		    indent * 8, name,
1678		    (u_longlong_t)bpo->bpo_object,
1679		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1680		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
1681		    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
1682		    bytes, comp, uncomp);
1683
1684		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
1685			uint64_t subobj;
1686			bpobj_t subbpo;
1687			int error;
1688			VERIFY0(dmu_read(bpo->bpo_os,
1689			    bpo->bpo_phys->bpo_subobjs,
1690			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
1691			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
1692			if (error != 0) {
1693				(void) printf("ERROR %u while trying to open "
1694				    "subobj id %llu\n",
1695				    error, (u_longlong_t)subobj);
1696				continue;
1697			}
1698			dump_full_bpobj(&subbpo, "subobj", indent + 1);
1699			bpobj_close(&subbpo);
1700		}
1701	} else {
1702		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
1703		    indent * 8, name,
1704		    (u_longlong_t)bpo->bpo_object,
1705		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1706		    bytes);
1707	}
1708
1709	if (dump_opt['d'] < 5)
1710		return;
1711
1712
1713	if (indent == 0) {
1714		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
1715		(void) printf("\n");
1716	}
1717}
1718
1719static void
1720dump_deadlist(dsl_deadlist_t *dl)
1721{
1722	dsl_deadlist_entry_t *dle;
1723	uint64_t unused;
1724	char bytes[32];
1725	char comp[32];
1726	char uncomp[32];
1727
1728	/* make sure nicenum has enough space */
1729	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
1730	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
1731	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
1732
1733	if (dump_opt['d'] < 3)
1734		return;
1735
1736	if (dl->dl_oldfmt) {
1737		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
1738		return;
1739	}
1740
1741	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
1742	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
1743	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
1744	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
1745	    bytes, comp, uncomp);
1746
1747	if (dump_opt['d'] < 4)
1748		return;
1749
1750	(void) printf("\n");
1751
1752	/* force the tree to be loaded */
1753	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
1754
1755	for (dle = avl_first(&dl->dl_tree); dle;
1756	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
1757		if (dump_opt['d'] >= 5) {
1758			char buf[128];
1759			(void) snprintf(buf, sizeof (buf),
1760			    "mintxg %llu -> obj %llu",
1761			    (longlong_t)dle->dle_mintxg,
1762			    (longlong_t)dle->dle_bpobj.bpo_object);
1763
1764			dump_full_bpobj(&dle->dle_bpobj, buf, 0);
1765		} else {
1766			(void) printf("mintxg %llu -> obj %llu\n",
1767			    (longlong_t)dle->dle_mintxg,
1768			    (longlong_t)dle->dle_bpobj.bpo_object);
1769
1770		}
1771	}
1772}
1773
1774static avl_tree_t idx_tree;
1775static avl_tree_t domain_tree;
1776static boolean_t fuid_table_loaded;
1777static objset_t *sa_os = NULL;
1778static sa_attr_type_t *sa_attr_table = NULL;
1779
1780static int
1781open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp)
1782{
1783	int err;
1784	uint64_t sa_attrs = 0;
1785	uint64_t version = 0;
1786
1787	VERIFY3P(sa_os, ==, NULL);
1788	err = dmu_objset_own(path, type, B_TRUE, tag, osp);
1789	if (err != 0) {
1790		(void) fprintf(stderr, "failed to own dataset '%s': %s\n", path,
1791		    strerror(err));
1792		return (err);
1793	}
1794
1795	if (dmu_objset_type(*osp) == DMU_OST_ZFS) {
1796		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1797		    8, 1, &version);
1798		if (version >= ZPL_VERSION_SA) {
1799			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
1800			    8, 1, &sa_attrs);
1801		}
1802		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
1803		    &sa_attr_table);
1804		if (err != 0) {
1805			(void) fprintf(stderr, "sa_setup failed: %s\n",
1806			    strerror(err));
1807			dmu_objset_disown(*osp, tag);
1808			*osp = NULL;
1809		}
1810	}
1811	sa_os = *osp;
1812
1813	return (0);
1814}
1815
1816static void
1817close_objset(objset_t *os, void *tag)
1818{
1819	VERIFY3P(os, ==, sa_os);
1820	if (os->os_sa != NULL)
1821		sa_tear_down(os);
1822	dmu_objset_disown(os, tag);
1823	sa_attr_table = NULL;
1824	sa_os = NULL;
1825}
1826
1827static void
1828fuid_table_destroy()
1829{
1830	if (fuid_table_loaded) {
1831		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
1832		fuid_table_loaded = B_FALSE;
1833	}
1834}
1835
1836/*
1837 * print uid or gid information.
1838 * For normal POSIX id just the id is printed in decimal format.
1839 * For CIFS files with FUID the fuid is printed in hex followed by
1840 * the domain-rid string.
1841 */
1842static void
1843print_idstr(uint64_t id, const char *id_type)
1844{
1845	if (FUID_INDEX(id)) {
1846		char *domain;
1847
1848		domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
1849		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
1850		    (u_longlong_t)id, domain, (int)FUID_RID(id));
1851	} else {
1852		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
1853	}
1854
1855}
1856
1857static void
1858dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
1859{
1860	uint32_t uid_idx, gid_idx;
1861
1862	uid_idx = FUID_INDEX(uid);
1863	gid_idx = FUID_INDEX(gid);
1864
1865	/* Load domain table, if not already loaded */
1866	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
1867		uint64_t fuid_obj;
1868
1869		/* first find the fuid object.  It lives in the master node */
1870		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
1871		    8, 1, &fuid_obj) == 0);
1872		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
1873		(void) zfs_fuid_table_load(os, fuid_obj,
1874		    &idx_tree, &domain_tree);
1875		fuid_table_loaded = B_TRUE;
1876	}
1877
1878	print_idstr(uid, "uid");
1879	print_idstr(gid, "gid");
1880}
1881
1882/*ARGSUSED*/
1883static void
1884dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
1885{
1886	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
1887	sa_handle_t *hdl;
1888	uint64_t xattr, rdev, gen;
1889	uint64_t uid, gid, mode, fsize, parent, links;
1890	uint64_t pflags;
1891	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
1892	time_t z_crtime, z_atime, z_mtime, z_ctime;
1893	sa_bulk_attr_t bulk[12];
1894	int idx = 0;
1895	int error;
1896
1897	VERIFY3P(os, ==, sa_os);
1898	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
1899		(void) printf("Failed to get handle for SA znode\n");
1900		return;
1901	}
1902
1903	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
1904	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
1905	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
1906	    &links, 8);
1907	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
1908	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
1909	    &mode, 8);
1910	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
1911	    NULL, &parent, 8);
1912	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
1913	    &fsize, 8);
1914	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
1915	    acctm, 16);
1916	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
1917	    modtm, 16);
1918	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
1919	    crtm, 16);
1920	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
1921	    chgtm, 16);
1922	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
1923	    &pflags, 8);
1924
1925	if (sa_bulk_lookup(hdl, bulk, idx)) {
1926		(void) sa_handle_destroy(hdl);
1927		return;
1928	}
1929
1930	z_crtime = (time_t)crtm[0];
1931	z_atime = (time_t)acctm[0];
1932	z_mtime = (time_t)modtm[0];
1933	z_ctime = (time_t)chgtm[0];
1934
1935	if (dump_opt['d'] > 4) {
1936		error = zfs_obj_to_path(os, object, path, sizeof (path));
1937		if (error != 0) {
1938			(void) snprintf(path, sizeof (path),
1939			    "\?\?\?<object#%llu>", (u_longlong_t)object);
1940		}
1941		(void) printf("\tpath	%s\n", path);
1942	}
1943	dump_uidgid(os, uid, gid);
1944	(void) printf("\tatime	%s", ctime(&z_atime));
1945	(void) printf("\tmtime	%s", ctime(&z_mtime));
1946	(void) printf("\tctime	%s", ctime(&z_ctime));
1947	(void) printf("\tcrtime	%s", ctime(&z_crtime));
1948	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
1949	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
1950	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
1951	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
1952	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
1953	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
1954	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
1955	    sizeof (uint64_t)) == 0)
1956		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
1957	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
1958	    sizeof (uint64_t)) == 0)
1959		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
1960	sa_handle_destroy(hdl);
1961}
1962
1963/*ARGSUSED*/
1964static void
1965dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
1966{
1967}
1968
1969/*ARGSUSED*/
1970static void
1971dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
1972{
1973}
1974
1975static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
1976	dump_none,		/* unallocated			*/
1977	dump_zap,		/* object directory		*/
1978	dump_uint64,		/* object array			*/
1979	dump_none,		/* packed nvlist		*/
1980	dump_packed_nvlist,	/* packed nvlist size		*/
1981	dump_none,		/* bpobj			*/
1982	dump_bpobj,		/* bpobj header			*/
1983	dump_none,		/* SPA space map header		*/
1984	dump_none,		/* SPA space map		*/
1985	dump_none,		/* ZIL intent log		*/
1986	dump_dnode,		/* DMU dnode			*/
1987	dump_dmu_objset,	/* DMU objset			*/
1988	dump_dsl_dir,		/* DSL directory		*/
1989	dump_zap,		/* DSL directory child map	*/
1990	dump_zap,		/* DSL dataset snap map		*/
1991	dump_zap,		/* DSL props			*/
1992	dump_dsl_dataset,	/* DSL dataset			*/
1993	dump_znode,		/* ZFS znode			*/
1994	dump_acl,		/* ZFS V0 ACL			*/
1995	dump_uint8,		/* ZFS plain file		*/
1996	dump_zpldir,		/* ZFS directory		*/
1997	dump_zap,		/* ZFS master node		*/
1998	dump_zap,		/* ZFS delete queue		*/
1999	dump_uint8,		/* zvol object			*/
2000	dump_zap,		/* zvol prop			*/
2001	dump_uint8,		/* other uint8[]		*/
2002	dump_uint64,		/* other uint64[]		*/
2003	dump_zap,		/* other ZAP			*/
2004	dump_zap,		/* persistent error log		*/
2005	dump_uint8,		/* SPA history			*/
2006	dump_history_offsets,	/* SPA history offsets		*/
2007	dump_zap,		/* Pool properties		*/
2008	dump_zap,		/* DSL permissions		*/
2009	dump_acl,		/* ZFS ACL			*/
2010	dump_uint8,		/* ZFS SYSACL			*/
2011	dump_none,		/* FUID nvlist			*/
2012	dump_packed_nvlist,	/* FUID nvlist size		*/
2013	dump_zap,		/* DSL dataset next clones	*/
2014	dump_zap,		/* DSL scrub queue		*/
2015	dump_zap,		/* ZFS user/group used		*/
2016	dump_zap,		/* ZFS user/group quota		*/
2017	dump_zap,		/* snapshot refcount tags	*/
2018	dump_ddt_zap,		/* DDT ZAP object		*/
2019	dump_zap,		/* DDT statistics		*/
2020	dump_znode,		/* SA object			*/
2021	dump_zap,		/* SA Master Node		*/
2022	dump_sa_attrs,		/* SA attribute registration	*/
2023	dump_sa_layouts,	/* SA attribute layouts		*/
2024	dump_zap,		/* DSL scrub translations	*/
2025	dump_none,		/* fake dedup BP		*/
2026	dump_zap,		/* deadlist			*/
2027	dump_none,		/* deadlist hdr			*/
2028	dump_zap,		/* dsl clones			*/
2029	dump_bpobj_subobjs,	/* bpobj subobjs		*/
2030	dump_unknown,		/* Unknown type, must be last	*/
2031};
2032
2033static void
2034dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
2035{
2036	dmu_buf_t *db = NULL;
2037	dmu_object_info_t doi;
2038	dnode_t *dn;
2039	void *bonus = NULL;
2040	size_t bsize = 0;
2041	char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
2042	char bonus_size[32];
2043	char aux[50];
2044	int error;
2045
2046	/* make sure nicenum has enough space */
2047	CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
2048	CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
2049	CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
2050	CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
2051	CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
2052
2053	if (*print_header) {
2054		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
2055		    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
2056		    "%full", "type");
2057		*print_header = 0;
2058	}
2059
2060	if (object == 0) {
2061		dn = DMU_META_DNODE(os);
2062	} else {
2063		error = dmu_bonus_hold(os, object, FTAG, &db);
2064		if (error)
2065			fatal("dmu_bonus_hold(%llu) failed, errno %u",
2066			    object, error);
2067		bonus = db->db_data;
2068		bsize = db->db_size;
2069		dn = DB_DNODE((dmu_buf_impl_t *)db);
2070	}
2071	dmu_object_info_from_dnode(dn, &doi);
2072
2073	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
2074	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
2075	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
2076	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
2077	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
2078	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
2079	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
2080	    doi.doi_max_offset);
2081
2082	aux[0] = '\0';
2083
2084	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
2085		(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
2086		    ZDB_CHECKSUM_NAME(doi.doi_checksum));
2087	}
2088
2089	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
2090		(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
2091		    ZDB_COMPRESS_NAME(doi.doi_compress));
2092	}
2093
2094	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
2095	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
2096	    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
2097
2098	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
2099		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
2100		    "", "", "", "", "", bonus_size, "bonus",
2101		    ZDB_OT_NAME(doi.doi_bonus_type));
2102	}
2103
2104	if (verbosity >= 4) {
2105		(void) printf("\tdnode flags: %s%s%s\n",
2106		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
2107		    "USED_BYTES " : "",
2108		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
2109		    "USERUSED_ACCOUNTED " : "",
2110		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
2111		    "SPILL_BLKPTR" : "");
2112		(void) printf("\tdnode maxblkid: %llu\n",
2113		    (longlong_t)dn->dn_phys->dn_maxblkid);
2114
2115		object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
2116		    bonus, bsize);
2117		object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
2118		*print_header = 1;
2119	}
2120
2121	if (verbosity >= 5)
2122		dump_indirect(dn);
2123
2124	if (verbosity >= 5) {
2125		/*
2126		 * Report the list of segments that comprise the object.
2127		 */
2128		uint64_t start = 0;
2129		uint64_t end;
2130		uint64_t blkfill = 1;
2131		int minlvl = 1;
2132
2133		if (dn->dn_type == DMU_OT_DNODE) {
2134			minlvl = 0;
2135			blkfill = DNODES_PER_BLOCK;
2136		}
2137
2138		for (;;) {
2139			char segsize[32];
2140			/* make sure nicenum has enough space */
2141			CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
2142			error = dnode_next_offset(dn,
2143			    0, &start, minlvl, blkfill, 0);
2144			if (error)
2145				break;
2146			end = start;
2147			error = dnode_next_offset(dn,
2148			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
2149			zdb_nicenum(end - start, segsize, sizeof (segsize));
2150			(void) printf("\t\tsegment [%016llx, %016llx)"
2151			    " size %5s\n", (u_longlong_t)start,
2152			    (u_longlong_t)end, segsize);
2153			if (error)
2154				break;
2155			start = end;
2156		}
2157	}
2158
2159	if (db != NULL)
2160		dmu_buf_rele(db, FTAG);
2161}
2162
2163static const char *objset_types[DMU_OST_NUMTYPES] = {
2164	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
2165
2166static void
2167dump_dir(objset_t *os)
2168{
2169	dmu_objset_stats_t dds;
2170	uint64_t object, object_count;
2171	uint64_t refdbytes, usedobjs, scratch;
2172	char numbuf[32];
2173	char blkbuf[BP_SPRINTF_LEN + 20];
2174	char osname[ZFS_MAX_DATASET_NAME_LEN];
2175	const char *type = "UNKNOWN";
2176	int verbosity = dump_opt['d'];
2177	int print_header = 1;
2178	unsigned i;
2179	int error;
2180
2181	/* make sure nicenum has enough space */
2182	CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
2183
2184	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
2185	dmu_objset_fast_stat(os, &dds);
2186	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
2187
2188	if (dds.dds_type < DMU_OST_NUMTYPES)
2189		type = objset_types[dds.dds_type];
2190
2191	if (dds.dds_type == DMU_OST_META) {
2192		dds.dds_creation_txg = TXG_INITIAL;
2193		usedobjs = BP_GET_FILL(os->os_rootbp);
2194		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
2195		    dd_used_bytes;
2196	} else {
2197		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
2198	}
2199
2200	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
2201
2202	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
2203
2204	if (verbosity >= 4) {
2205		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
2206		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
2207		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
2208	} else {
2209		blkbuf[0] = '\0';
2210	}
2211
2212	dmu_objset_name(os, osname);
2213
2214	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
2215	    "%s, %llu objects%s\n",
2216	    osname, type, (u_longlong_t)dmu_objset_id(os),
2217	    (u_longlong_t)dds.dds_creation_txg,
2218	    numbuf, (u_longlong_t)usedobjs, blkbuf);
2219
2220	if (zopt_objects != 0) {
2221		for (i = 0; i < zopt_objects; i++)
2222			dump_object(os, zopt_object[i], verbosity,
2223			    &print_header);
2224		(void) printf("\n");
2225		return;
2226	}
2227
2228	if (dump_opt['i'] != 0 || verbosity >= 2)
2229		dump_intent_log(dmu_objset_zil(os));
2230
2231	if (dmu_objset_ds(os) != NULL) {
2232		dsl_dataset_t *ds = dmu_objset_ds(os);
2233		dump_deadlist(&ds->ds_deadlist);
2234
2235		if (dsl_dataset_remap_deadlist_exists(ds)) {
2236			(void) printf("ds_remap_deadlist:\n");
2237			dump_deadlist(&ds->ds_remap_deadlist);
2238		}
2239	}
2240
2241	if (verbosity < 2)
2242		return;
2243
2244	if (BP_IS_HOLE(os->os_rootbp))
2245		return;
2246
2247	dump_object(os, 0, verbosity, &print_header);
2248	object_count = 0;
2249	if (DMU_USERUSED_DNODE(os) != NULL &&
2250	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
2251		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
2252		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
2253	}
2254
2255	object = 0;
2256	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
2257		dump_object(os, object, verbosity, &print_header);
2258		object_count++;
2259	}
2260
2261	ASSERT3U(object_count, ==, usedobjs);
2262
2263	(void) printf("\n");
2264
2265	if (error != ESRCH) {
2266		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
2267		abort();
2268	}
2269}
2270
2271static void
2272dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
2273{
2274	time_t timestamp = ub->ub_timestamp;
2275
2276	(void) printf("%s", header ? header : "");
2277	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
2278	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
2279	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
2280	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
2281	(void) printf("\ttimestamp = %llu UTC = %s",
2282	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
2283	if (dump_opt['u'] >= 3) {
2284		char blkbuf[BP_SPRINTF_LEN];
2285		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
2286		(void) printf("\trootbp = %s\n", blkbuf);
2287	}
2288	(void) printf("\tcheckpoint_txg = %llu\n",
2289	    (u_longlong_t)ub->ub_checkpoint_txg);
2290	(void) printf("%s", footer ? footer : "");
2291}
2292
2293static void
2294dump_config(spa_t *spa)
2295{
2296	dmu_buf_t *db;
2297	size_t nvsize = 0;
2298	int error = 0;
2299
2300
2301	error = dmu_bonus_hold(spa->spa_meta_objset,
2302	    spa->spa_config_object, FTAG, &db);
2303
2304	if (error == 0) {
2305		nvsize = *(uint64_t *)db->db_data;
2306		dmu_buf_rele(db, FTAG);
2307
2308		(void) printf("\nMOS Configuration:\n");
2309		dump_packed_nvlist(spa->spa_meta_objset,
2310		    spa->spa_config_object, (void *)&nvsize, 1);
2311	} else {
2312		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
2313		    (u_longlong_t)spa->spa_config_object, error);
2314	}
2315}
2316
2317static void
2318dump_cachefile(const char *cachefile)
2319{
2320	int fd;
2321	struct stat64 statbuf;
2322	char *buf;
2323	nvlist_t *config;
2324
2325	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
2326		(void) printf("cannot open '%s': %s\n", cachefile,
2327		    strerror(errno));
2328		exit(1);
2329	}
2330
2331	if (fstat64(fd, &statbuf) != 0) {
2332		(void) printf("failed to stat '%s': %s\n", cachefile,
2333		    strerror(errno));
2334		exit(1);
2335	}
2336
2337	if ((buf = malloc(statbuf.st_size)) == NULL) {
2338		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
2339		    (u_longlong_t)statbuf.st_size);
2340		exit(1);
2341	}
2342
2343	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
2344		(void) fprintf(stderr, "failed to read %llu bytes\n",
2345		    (u_longlong_t)statbuf.st_size);
2346		exit(1);
2347	}
2348
2349	(void) close(fd);
2350
2351	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
2352		(void) fprintf(stderr, "failed to unpack nvlist\n");
2353		exit(1);
2354	}
2355
2356	free(buf);
2357
2358	dump_nvlist(config, 0);
2359
2360	nvlist_free(config);
2361}
2362
2363#define	ZDB_MAX_UB_HEADER_SIZE 32
2364
2365static void
2366dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
2367{
2368	vdev_t vd;
2369	vdev_t *vdp = &vd;
2370	char header[ZDB_MAX_UB_HEADER_SIZE];
2371
2372	vd.vdev_ashift = ashift;
2373	vdp->vdev_top = vdp;
2374
2375	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
2376		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
2377		uberblock_t *ub = (void *)((char *)lbl + uoff);
2378
2379		if (uberblock_verify(ub))
2380			continue;
2381		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
2382		    "Uberblock[%d]\n", i);
2383		dump_uberblock(ub, header, "");
2384	}
2385}
2386
2387static char curpath[PATH_MAX];
2388
2389/*
2390 * Iterate through the path components, recursively passing
2391 * current one's obj and remaining path until we find the obj
2392 * for the last one.
2393 */
2394static int
2395dump_path_impl(objset_t *os, uint64_t obj, char *name)
2396{
2397	int err;
2398	int header = 1;
2399	uint64_t child_obj;
2400	char *s;
2401	dmu_buf_t *db;
2402	dmu_object_info_t doi;
2403
2404	if ((s = strchr(name, '/')) != NULL)
2405		*s = '\0';
2406	err = zap_lookup(os, obj, name, 8, 1, &child_obj);
2407
2408	(void) strlcat(curpath, name, sizeof (curpath));
2409
2410	if (err != 0) {
2411		(void) fprintf(stderr, "failed to lookup %s: %s\n",
2412		    curpath, strerror(err));
2413		return (err);
2414	}
2415
2416	child_obj = ZFS_DIRENT_OBJ(child_obj);
2417	err = sa_buf_hold(os, child_obj, FTAG, &db);
2418	if (err != 0) {
2419		(void) fprintf(stderr,
2420		    "failed to get SA dbuf for obj %llu: %s\n",
2421		    (u_longlong_t)child_obj, strerror(err));
2422		return (EINVAL);
2423	}
2424	dmu_object_info_from_db(db, &doi);
2425	sa_buf_rele(db, FTAG);
2426
2427	if (doi.doi_bonus_type != DMU_OT_SA &&
2428	    doi.doi_bonus_type != DMU_OT_ZNODE) {
2429		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
2430		    doi.doi_bonus_type, (u_longlong_t)child_obj);
2431		return (EINVAL);
2432	}
2433
2434	if (dump_opt['v'] > 6) {
2435		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
2436		    (u_longlong_t)child_obj, curpath, doi.doi_type,
2437		    doi.doi_bonus_type);
2438	}
2439
2440	(void) strlcat(curpath, "/", sizeof (curpath));
2441
2442	switch (doi.doi_type) {
2443	case DMU_OT_DIRECTORY_CONTENTS:
2444		if (s != NULL && *(s + 1) != '\0')
2445			return (dump_path_impl(os, child_obj, s + 1));
2446		/*FALLTHROUGH*/
2447	case DMU_OT_PLAIN_FILE_CONTENTS:
2448		dump_object(os, child_obj, dump_opt['v'], &header);
2449		return (0);
2450	default:
2451		(void) fprintf(stderr, "object %llu has non-file/directory "
2452		    "type %d\n", (u_longlong_t)obj, doi.doi_type);
2453		break;
2454	}
2455
2456	return (EINVAL);
2457}
2458
2459/*
2460 * Dump the blocks for the object specified by path inside the dataset.
2461 */
2462static int
2463dump_path(char *ds, char *path)
2464{
2465	int err;
2466	objset_t *os;
2467	uint64_t root_obj;
2468
2469	err = open_objset(ds, DMU_OST_ZFS, FTAG, &os);
2470	if (err != 0)
2471		return (err);
2472
2473	err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
2474	if (err != 0) {
2475		(void) fprintf(stderr, "can't lookup root znode: %s\n",
2476		    strerror(err));
2477		dmu_objset_disown(os, FTAG);
2478		return (EINVAL);
2479	}
2480
2481	(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
2482
2483	err = dump_path_impl(os, root_obj, path);
2484
2485	close_objset(os, FTAG);
2486	return (err);
2487}
2488
2489static int
2490dump_label(const char *dev)
2491{
2492	int fd;
2493	vdev_label_t label;
2494	char path[MAXPATHLEN];
2495	char *buf = label.vl_vdev_phys.vp_nvlist;
2496	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
2497	struct stat64 statbuf;
2498	uint64_t psize, ashift;
2499	boolean_t label_found = B_FALSE;
2500
2501	(void) strlcpy(path, dev, sizeof (path));
2502	if (dev[0] == '/') {
2503		if (strncmp(dev, ZFS_DISK_ROOTD,
2504		    strlen(ZFS_DISK_ROOTD)) == 0) {
2505			(void) snprintf(path, sizeof (path), "%s%s",
2506			    ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD));
2507		}
2508	} else if (stat64(path, &statbuf) != 0) {
2509		char *s;
2510
2511		(void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD,
2512		    dev);
2513		if (((s = strrchr(dev, 's')) == NULL &&
2514		    (s = strchr(dev, 'p')) == NULL) ||
2515		    !isdigit(*(s + 1)))
2516			(void) strlcat(path, "s0", sizeof (path));
2517	}
2518
2519	if ((fd = open64(path, O_RDONLY)) < 0) {
2520		(void) fprintf(stderr, "cannot open '%s': %s\n", path,
2521		    strerror(errno));
2522		exit(1);
2523	}
2524
2525	if (fstat64(fd, &statbuf) != 0) {
2526		(void) fprintf(stderr, "failed to stat '%s': %s\n", path,
2527		    strerror(errno));
2528		(void) close(fd);
2529		exit(1);
2530	}
2531
2532	if (S_ISBLK(statbuf.st_mode)) {
2533		(void) fprintf(stderr,
2534		    "cannot use '%s': character device required\n", path);
2535		(void) close(fd);
2536		exit(1);
2537	}
2538
2539	psize = statbuf.st_size;
2540	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
2541
2542	for (int l = 0; l < VDEV_LABELS; l++) {
2543		nvlist_t *config = NULL;
2544
2545		if (!dump_opt['q']) {
2546			(void) printf("------------------------------------\n");
2547			(void) printf("LABEL %d\n", l);
2548			(void) printf("------------------------------------\n");
2549		}
2550
2551		if (pread64(fd, &label, sizeof (label),
2552		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
2553			if (!dump_opt['q'])
2554				(void) printf("failed to read label %d\n", l);
2555			continue;
2556		}
2557
2558		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
2559			if (!dump_opt['q'])
2560				(void) printf("failed to unpack label %d\n", l);
2561			ashift = SPA_MINBLOCKSHIFT;
2562		} else {
2563			nvlist_t *vdev_tree = NULL;
2564
2565			if (!dump_opt['q'])
2566				dump_nvlist(config, 4);
2567			if ((nvlist_lookup_nvlist(config,
2568			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
2569			    (nvlist_lookup_uint64(vdev_tree,
2570			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
2571				ashift = SPA_MINBLOCKSHIFT;
2572			nvlist_free(config);
2573			label_found = B_TRUE;
2574		}
2575		if (dump_opt['u'])
2576			dump_label_uberblocks(&label, ashift);
2577	}
2578
2579	(void) close(fd);
2580
2581	return (label_found ? 0 : 2);
2582}
2583
2584static uint64_t dataset_feature_count[SPA_FEATURES];
2585static uint64_t remap_deadlist_count = 0;
2586
2587/*ARGSUSED*/
2588static int
2589dump_one_dir(const char *dsname, void *arg)
2590{
2591	int error;
2592	objset_t *os;
2593
2594	error = open_objset(dsname, DMU_OST_ANY, FTAG, &os);
2595	if (error != 0)
2596		return (0);
2597
2598	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
2599		if (!dmu_objset_ds(os)->ds_feature_inuse[f])
2600			continue;
2601		ASSERT(spa_feature_table[f].fi_flags &
2602		    ZFEATURE_FLAG_PER_DATASET);
2603		dataset_feature_count[f]++;
2604	}
2605
2606	if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
2607		remap_deadlist_count++;
2608	}
2609
2610	dump_dir(os);
2611	close_objset(os, FTAG);
2612	fuid_table_destroy();
2613	return (0);
2614}
2615
2616/*
2617 * Block statistics.
2618 */
2619#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
2620typedef struct zdb_blkstats {
2621	uint64_t zb_asize;
2622	uint64_t zb_lsize;
2623	uint64_t zb_psize;
2624	uint64_t zb_count;
2625	uint64_t zb_gangs;
2626	uint64_t zb_ditto_samevdev;
2627	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
2628} zdb_blkstats_t;
2629
2630/*
2631 * Extended object types to report deferred frees and dedup auto-ditto blocks.
2632 */
2633#define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
2634#define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
2635#define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
2636#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
2637
2638static const char *zdb_ot_extname[] = {
2639	"deferred free",
2640	"dedup ditto",
2641	"other",
2642	"Total",
2643};
2644
2645#define	ZB_TOTAL	DN_MAX_LEVELS
2646
2647typedef struct zdb_cb {
2648	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
2649	uint64_t	zcb_removing_size;
2650	uint64_t	zcb_checkpoint_size;
2651	uint64_t	zcb_dedup_asize;
2652	uint64_t	zcb_dedup_blocks;
2653	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
2654	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
2655	    [BPE_PAYLOAD_SIZE];
2656	uint64_t	zcb_start;
2657	hrtime_t	zcb_lastprint;
2658	uint64_t	zcb_totalasize;
2659	uint64_t	zcb_errors[256];
2660	int		zcb_readfails;
2661	int		zcb_haderrors;
2662	spa_t		*zcb_spa;
2663	uint32_t	**zcb_vd_obsolete_counts;
2664} zdb_cb_t;
2665
2666static void
2667zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
2668    dmu_object_type_t type)
2669{
2670	uint64_t refcnt = 0;
2671
2672	ASSERT(type < ZDB_OT_TOTAL);
2673
2674	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
2675		return;
2676
2677	for (int i = 0; i < 4; i++) {
2678		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
2679		int t = (i & 1) ? type : ZDB_OT_TOTAL;
2680		int equal;
2681		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
2682
2683		zb->zb_asize += BP_GET_ASIZE(bp);
2684		zb->zb_lsize += BP_GET_LSIZE(bp);
2685		zb->zb_psize += BP_GET_PSIZE(bp);
2686		zb->zb_count++;
2687
2688		/*
2689		 * The histogram is only big enough to record blocks up to
2690		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
2691		 * "other", bucket.
2692		 */
2693		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
2694		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
2695		zb->zb_psize_histogram[idx]++;
2696
2697		zb->zb_gangs += BP_COUNT_GANG(bp);
2698
2699		switch (BP_GET_NDVAS(bp)) {
2700		case 2:
2701			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2702			    DVA_GET_VDEV(&bp->blk_dva[1]))
2703				zb->zb_ditto_samevdev++;
2704			break;
2705		case 3:
2706			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2707			    DVA_GET_VDEV(&bp->blk_dva[1])) +
2708			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2709			    DVA_GET_VDEV(&bp->blk_dva[2])) +
2710			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
2711			    DVA_GET_VDEV(&bp->blk_dva[2]));
2712			if (equal != 0)
2713				zb->zb_ditto_samevdev++;
2714			break;
2715		}
2716
2717	}
2718
2719	if (BP_IS_EMBEDDED(bp)) {
2720		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
2721		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
2722		    [BPE_GET_PSIZE(bp)]++;
2723		return;
2724	}
2725
2726	if (dump_opt['L'])
2727		return;
2728
2729	if (BP_GET_DEDUP(bp)) {
2730		ddt_t *ddt;
2731		ddt_entry_t *dde;
2732
2733		ddt = ddt_select(zcb->zcb_spa, bp);
2734		ddt_enter(ddt);
2735		dde = ddt_lookup(ddt, bp, B_FALSE);
2736
2737		if (dde == NULL) {
2738			refcnt = 0;
2739		} else {
2740			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
2741			ddt_phys_decref(ddp);
2742			refcnt = ddp->ddp_refcnt;
2743			if (ddt_phys_total_refcnt(dde) == 0)
2744				ddt_remove(ddt, dde);
2745		}
2746		ddt_exit(ddt);
2747	}
2748
2749	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
2750	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
2751	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
2752}
2753
2754static void
2755zdb_blkptr_done(zio_t *zio)
2756{
2757	spa_t *spa = zio->io_spa;
2758	blkptr_t *bp = zio->io_bp;
2759	int ioerr = zio->io_error;
2760	zdb_cb_t *zcb = zio->io_private;
2761	zbookmark_phys_t *zb = &zio->io_bookmark;
2762
2763	abd_free(zio->io_abd);
2764
2765	mutex_enter(&spa->spa_scrub_lock);
2766	spa->spa_scrub_inflight--;
2767	cv_broadcast(&spa->spa_scrub_io_cv);
2768
2769	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2770		char blkbuf[BP_SPRINTF_LEN];
2771
2772		zcb->zcb_haderrors = 1;
2773		zcb->zcb_errors[ioerr]++;
2774
2775		if (dump_opt['b'] >= 2)
2776			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2777		else
2778			blkbuf[0] = '\0';
2779
2780		(void) printf("zdb_blkptr_cb: "
2781		    "Got error %d reading "
2782		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
2783		    ioerr,
2784		    (u_longlong_t)zb->zb_objset,
2785		    (u_longlong_t)zb->zb_object,
2786		    (u_longlong_t)zb->zb_level,
2787		    (u_longlong_t)zb->zb_blkid,
2788		    blkbuf);
2789	}
2790	mutex_exit(&spa->spa_scrub_lock);
2791}
2792
2793static int
2794zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
2795    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
2796{
2797	zdb_cb_t *zcb = arg;
2798	dmu_object_type_t type;
2799	boolean_t is_metadata;
2800
2801	if (bp == NULL)
2802		return (0);
2803
2804	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
2805		char blkbuf[BP_SPRINTF_LEN];
2806		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2807		(void) printf("objset %llu object %llu "
2808		    "level %lld offset 0x%llx %s\n",
2809		    (u_longlong_t)zb->zb_objset,
2810		    (u_longlong_t)zb->zb_object,
2811		    (longlong_t)zb->zb_level,
2812		    (u_longlong_t)blkid2offset(dnp, bp, zb),
2813		    blkbuf);
2814	}
2815
2816	if (BP_IS_HOLE(bp))
2817		return (0);
2818
2819	type = BP_GET_TYPE(bp);
2820
2821	zdb_count_block(zcb, zilog, bp,
2822	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
2823
2824	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
2825
2826	if (!BP_IS_EMBEDDED(bp) &&
2827	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
2828		size_t size = BP_GET_PSIZE(bp);
2829		abd_t *abd = abd_alloc(size, B_FALSE);
2830		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
2831
2832		/* If it's an intent log block, failure is expected. */
2833		if (zb->zb_level == ZB_ZIL_LEVEL)
2834			flags |= ZIO_FLAG_SPECULATIVE;
2835
2836		mutex_enter(&spa->spa_scrub_lock);
2837		while (spa->spa_scrub_inflight > max_inflight)
2838			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2839		spa->spa_scrub_inflight++;
2840		mutex_exit(&spa->spa_scrub_lock);
2841
2842		zio_nowait(zio_read(NULL, spa, bp, abd, size,
2843		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
2844	}
2845
2846	zcb->zcb_readfails = 0;
2847
2848	/* only call gethrtime() every 100 blocks */
2849	static int iters;
2850	if (++iters > 100)
2851		iters = 0;
2852	else
2853		return (0);
2854
2855	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
2856		uint64_t now = gethrtime();
2857		char buf[10];
2858		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
2859		int kb_per_sec =
2860		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
2861		int sec_remaining =
2862		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
2863
2864		/* make sure nicenum has enough space */
2865		CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
2866
2867		zfs_nicenum(bytes, buf, sizeof (buf));
2868		(void) fprintf(stderr,
2869		    "\r%5s completed (%4dMB/s) "
2870		    "estimated time remaining: %uhr %02umin %02usec        ",
2871		    buf, kb_per_sec / 1024,
2872		    sec_remaining / 60 / 60,
2873		    sec_remaining / 60 % 60,
2874		    sec_remaining % 60);
2875
2876		zcb->zcb_lastprint = now;
2877	}
2878
2879	return (0);
2880}
2881
2882static void
2883zdb_leak(void *arg, uint64_t start, uint64_t size)
2884{
2885	vdev_t *vd = arg;
2886
2887	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
2888	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
2889}
2890
2891static metaslab_ops_t zdb_metaslab_ops = {
2892	NULL	/* alloc */
2893};
2894
2895static void
2896zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
2897{
2898	ddt_bookmark_t ddb;
2899	ddt_entry_t dde;
2900	int error;
2901
2902	bzero(&ddb, sizeof (ddb));
2903	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
2904		blkptr_t blk;
2905		ddt_phys_t *ddp = dde.dde_phys;
2906
2907		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
2908			return;
2909
2910		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
2911
2912		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2913			if (ddp->ddp_phys_birth == 0)
2914				continue;
2915			ddt_bp_create(ddb.ddb_checksum,
2916			    &dde.dde_key, ddp, &blk);
2917			if (p == DDT_PHYS_DITTO) {
2918				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
2919			} else {
2920				zcb->zcb_dedup_asize +=
2921				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
2922				zcb->zcb_dedup_blocks++;
2923			}
2924		}
2925		if (!dump_opt['L']) {
2926			ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
2927			ddt_enter(ddt);
2928			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
2929			ddt_exit(ddt);
2930		}
2931	}
2932
2933	ASSERT(error == ENOENT);
2934}
2935
2936/* ARGSUSED */
2937static void
2938claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
2939    uint64_t size, void *arg)
2940{
2941	/*
2942	 * This callback was called through a remap from
2943	 * a device being removed. Therefore, the vdev that
2944	 * this callback is applied to is a concrete
2945	 * vdev.
2946	 */
2947	ASSERT(vdev_is_concrete(vd));
2948
2949	VERIFY0(metaslab_claim_impl(vd, offset, size,
2950	    spa_min_claim_txg(vd->vdev_spa)));
2951}
2952
2953static void
2954claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
2955{
2956	vdev_t *vd = arg;
2957
2958	vdev_indirect_ops.vdev_op_remap(vd, offset, size,
2959	    claim_segment_impl_cb, NULL);
2960}
2961
2962/*
2963 * After accounting for all allocated blocks that are directly referenced,
2964 * we might have missed a reference to a block from a partially complete
2965 * (and thus unused) indirect mapping object. We perform a secondary pass
2966 * through the metaslabs we have already mapped and claim the destination
2967 * blocks.
2968 */
2969static void
2970zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
2971{
2972	if (spa->spa_vdev_removal == NULL)
2973		return;
2974
2975	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2976
2977	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
2978	vdev_t *vd = svr->svr_vdev;
2979	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
2980
2981	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
2982		metaslab_t *msp = vd->vdev_ms[msi];
2983
2984		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
2985			break;
2986
2987		ASSERT0(range_tree_space(svr->svr_allocd_segs));
2988
2989		if (msp->ms_sm != NULL) {
2990			VERIFY0(space_map_load(msp->ms_sm,
2991			    svr->svr_allocd_segs, SM_ALLOC));
2992
2993			/*
2994			 * Clear everything past what has been synced,
2995			 * because we have not allocated mappings for it yet.
2996			 */
2997			range_tree_clear(svr->svr_allocd_segs,
2998			    vdev_indirect_mapping_max_offset(vim),
2999			    msp->ms_sm->sm_start + msp->ms_sm->sm_size -
3000			    vdev_indirect_mapping_max_offset(vim));
3001		}
3002
3003		zcb->zcb_removing_size +=
3004		    range_tree_space(svr->svr_allocd_segs);
3005		range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
3006	}
3007
3008	spa_config_exit(spa, SCL_CONFIG, FTAG);
3009}
3010
3011/* ARGSUSED */
3012static int
3013increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
3014{
3015	zdb_cb_t *zcb = arg;
3016	spa_t *spa = zcb->zcb_spa;
3017	vdev_t *vd;
3018	const dva_t *dva = &bp->blk_dva[0];
3019
3020	ASSERT(!dump_opt['L']);
3021	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
3022
3023	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3024	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
3025	ASSERT3P(vd, !=, NULL);
3026	spa_config_exit(spa, SCL_VDEV, FTAG);
3027
3028	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
3029	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
3030
3031	vdev_indirect_mapping_increment_obsolete_count(
3032	    vd->vdev_indirect_mapping,
3033	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
3034	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
3035
3036	return (0);
3037}
3038
3039static uint32_t *
3040zdb_load_obsolete_counts(vdev_t *vd)
3041{
3042	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3043	spa_t *spa = vd->vdev_spa;
3044	spa_condensing_indirect_phys_t *scip =
3045	    &spa->spa_condensing_indirect_phys;
3046	uint32_t *counts;
3047
3048	EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL);
3049	counts = vdev_indirect_mapping_load_obsolete_counts(vim);
3050	if (vd->vdev_obsolete_sm != NULL) {
3051		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3052		    vd->vdev_obsolete_sm);
3053	}
3054	if (scip->scip_vdev == vd->vdev_id &&
3055	    scip->scip_prev_obsolete_sm_object != 0) {
3056		space_map_t *prev_obsolete_sm = NULL;
3057		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
3058		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
3059		space_map_update(prev_obsolete_sm);
3060		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3061		    prev_obsolete_sm);
3062		space_map_close(prev_obsolete_sm);
3063	}
3064	return (counts);
3065}
3066
3067typedef struct checkpoint_sm_exclude_entry_arg {
3068	vdev_t *cseea_vd;
3069	uint64_t cseea_checkpoint_size;
3070} checkpoint_sm_exclude_entry_arg_t;
3071
3072static int
3073checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
3074    void *arg)
3075{
3076	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
3077	vdev_t *vd = cseea->cseea_vd;
3078	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3079	uint64_t end = offset + size;
3080
3081	ASSERT(type == SM_FREE);
3082
3083	/*
3084	 * Since the vdev_checkpoint_sm exists in the vdev level
3085	 * and the ms_sm space maps exist in the metaslab level,
3086	 * an entry in the checkpoint space map could theoretically
3087	 * cross the boundaries of the metaslab that it belongs.
3088	 *
3089	 * In reality, because of the way that we populate and
3090	 * manipulate the checkpoint's space maps currently,
3091	 * there shouldn't be any entries that cross metaslabs.
3092	 * Hence the assertion below.
3093	 *
3094	 * That said, there is no fundamental requirement that
3095	 * the checkpoint's space map entries should not cross
3096	 * metaslab boundaries. So if needed we could add code
3097	 * that handles metaslab-crossing segments in the future.
3098	 */
3099	VERIFY3U(offset, >=, ms->ms_start);
3100	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
3101
3102	/*
3103	 * By removing the entry from the allocated segments we
3104	 * also verify that the entry is there to begin with.
3105	 */
3106	mutex_enter(&ms->ms_lock);
3107	range_tree_remove(ms->ms_allocatable, offset, size);
3108	mutex_exit(&ms->ms_lock);
3109
3110	cseea->cseea_checkpoint_size += size;
3111	return (0);
3112}
3113
3114static void
3115zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
3116{
3117	spa_t *spa = vd->vdev_spa;
3118	space_map_t *checkpoint_sm = NULL;
3119	uint64_t checkpoint_sm_obj;
3120
3121	/*
3122	 * If there is no vdev_top_zap, we are in a pool whose
3123	 * version predates the pool checkpoint feature.
3124	 */
3125	if (vd->vdev_top_zap == 0)
3126		return;
3127
3128	/*
3129	 * If there is no reference of the vdev_checkpoint_sm in
3130	 * the vdev_top_zap, then one of the following scenarios
3131	 * is true:
3132	 *
3133	 * 1] There is no checkpoint
3134	 * 2] There is a checkpoint, but no checkpointed blocks
3135	 *    have been freed yet
3136	 * 3] The current vdev is indirect
3137	 *
3138	 * In these cases we return immediately.
3139	 */
3140	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
3141	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
3142		return;
3143
3144	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
3145	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
3146	    &checkpoint_sm_obj));
3147
3148	checkpoint_sm_exclude_entry_arg_t cseea;
3149	cseea.cseea_vd = vd;
3150	cseea.cseea_checkpoint_size = 0;
3151
3152	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
3153	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
3154	space_map_update(checkpoint_sm);
3155
3156	VERIFY0(space_map_iterate(checkpoint_sm,
3157	    checkpoint_sm_exclude_entry_cb, &cseea));
3158	space_map_close(checkpoint_sm);
3159
3160	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
3161}
3162
3163static void
3164zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
3165{
3166	vdev_t *rvd = spa->spa_root_vdev;
3167	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3168		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
3169		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
3170	}
3171}
3172
3173static void
3174load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
3175{
3176	vdev_t *rvd = spa->spa_root_vdev;
3177	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
3178		vdev_t *vd = rvd->vdev_child[i];
3179
3180		ASSERT3U(i, ==, vd->vdev_id);
3181
3182		if (vd->vdev_ops == &vdev_indirect_ops)
3183			continue;
3184
3185		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3186			metaslab_t *msp = vd->vdev_ms[m];
3187
3188			(void) fprintf(stderr,
3189			    "\rloading concrete vdev %llu, "
3190			    "metaslab %llu of %llu ...",
3191			    (longlong_t)vd->vdev_id,
3192			    (longlong_t)msp->ms_id,
3193			    (longlong_t)vd->vdev_ms_count);
3194
3195			mutex_enter(&msp->ms_lock);
3196			metaslab_unload(msp);
3197
3198			/*
3199			 * We don't want to spend the CPU manipulating the
3200			 * size-ordered tree, so clear the range_tree ops.
3201			 */
3202			msp->ms_allocatable->rt_ops = NULL;
3203
3204			if (msp->ms_sm != NULL) {
3205				VERIFY0(space_map_load(msp->ms_sm,
3206				    msp->ms_allocatable, maptype));
3207			}
3208			if (!msp->ms_loaded)
3209				msp->ms_loaded = B_TRUE;
3210			mutex_exit(&msp->ms_lock);
3211		}
3212	}
3213}
3214
3215/*
3216 * vm_idxp is an in-out parameter which (for indirect vdevs) is the
3217 * index in vim_entries that has the first entry in this metaslab.
3218 * On return, it will be set to the first entry after this metaslab.
3219 */
3220static void
3221load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
3222    uint64_t *vim_idxp)
3223{
3224	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3225
3226	mutex_enter(&msp->ms_lock);
3227	metaslab_unload(msp);
3228
3229	/*
3230	 * We don't want to spend the CPU manipulating the
3231	 * size-ordered tree, so clear the range_tree ops.
3232	 */
3233	msp->ms_allocatable->rt_ops = NULL;
3234
3235	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
3236	    (*vim_idxp)++) {
3237		vdev_indirect_mapping_entry_phys_t *vimep =
3238		    &vim->vim_entries[*vim_idxp];
3239		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
3240		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
3241		ASSERT3U(ent_offset, >=, msp->ms_start);
3242		if (ent_offset >= msp->ms_start + msp->ms_size)
3243			break;
3244
3245		/*
3246		 * Mappings do not cross metaslab boundaries,
3247		 * because we create them by walking the metaslabs.
3248		 */
3249		ASSERT3U(ent_offset + ent_len, <=,
3250		    msp->ms_start + msp->ms_size);
3251		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
3252	}
3253
3254	if (!msp->ms_loaded)
3255		msp->ms_loaded = B_TRUE;
3256	mutex_exit(&msp->ms_lock);
3257}
3258
3259static void
3260zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
3261{
3262	vdev_t *rvd = spa->spa_root_vdev;
3263	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3264		vdev_t *vd = rvd->vdev_child[c];
3265
3266		ASSERT3U(c, ==, vd->vdev_id);
3267
3268		if (vd->vdev_ops != &vdev_indirect_ops)
3269			continue;
3270
3271		/*
3272		 * Note: we don't check for mapping leaks on
3273		 * removing vdevs because their ms_allocatable's
3274		 * are used to look for leaks in allocated space.
3275		 */
3276		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
3277
3278		/*
3279		 * Normally, indirect vdevs don't have any
3280		 * metaslabs.  We want to set them up for
3281		 * zio_claim().
3282		 */
3283		VERIFY0(vdev_metaslab_init(vd, 0));
3284
3285		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3286		uint64_t vim_idx = 0;
3287		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3288
3289			(void) fprintf(stderr,
3290			    "\rloading indirect vdev %llu, "
3291			    "metaslab %llu of %llu ...",
3292			    (longlong_t)vd->vdev_id,
3293			    (longlong_t)vd->vdev_ms[m]->ms_id,
3294			    (longlong_t)vd->vdev_ms_count);
3295
3296			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
3297			    &vim_idx);
3298		}
3299		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
3300	}
3301}
3302
3303static void
3304zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
3305{
3306	zcb->zcb_spa = spa;
3307
3308	if (!dump_opt['L']) {
3309		dsl_pool_t *dp = spa->spa_dsl_pool;
3310		vdev_t *rvd = spa->spa_root_vdev;
3311
3312		/*
3313		 * We are going to be changing the meaning of the metaslab's
3314		 * ms_allocatable.  Ensure that the allocator doesn't try to
3315		 * use the tree.
3316		 */
3317		spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
3318		spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
3319
3320		zcb->zcb_vd_obsolete_counts =
3321		    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
3322		    UMEM_NOFAIL);
3323
3324		/*
3325		 * For leak detection, we overload the ms_allocatable trees
3326		 * to contain allocated segments instead of free segments.
3327		 * As a result, we can't use the normal metaslab_load/unload
3328		 * interfaces.
3329		 */
3330		zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
3331		load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
3332
3333		/*
3334		 * On load_concrete_ms_allocatable_trees() we loaded all the
3335		 * allocated entries from the ms_sm to the ms_allocatable for
3336		 * each metaslab. If the pool has a checkpoint or is in the
3337		 * middle of discarding a checkpoint, some of these blocks
3338		 * may have been freed but their ms_sm may not have been
3339		 * updated because they are referenced by the checkpoint. In
3340		 * order to avoid false-positives during leak-detection, we
3341		 * go through the vdev's checkpoint space map and exclude all
3342		 * its entries from their relevant ms_allocatable.
3343		 *
3344		 * We also aggregate the space held by the checkpoint and add
3345		 * it to zcb_checkpoint_size.
3346		 *
3347		 * Note that at this point we are also verifying that all the
3348		 * entries on the checkpoint_sm are marked as allocated in
3349		 * the ms_sm of their relevant metaslab.
3350		 * [see comment in checkpoint_sm_exclude_entry_cb()]
3351		 */
3352		zdb_leak_init_exclude_checkpoint(spa, zcb);
3353
3354		/* for cleaner progress output */
3355		(void) fprintf(stderr, "\n");
3356
3357		if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
3358			ASSERT(spa_feature_is_enabled(spa,
3359			    SPA_FEATURE_DEVICE_REMOVAL));
3360			(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
3361			    increment_indirect_mapping_cb, zcb, NULL);
3362		}
3363	} else {
3364		/*
3365		 * If leak tracing is disabled, we still need to consider
3366		 * any checkpointed space in our space verification.
3367		 */
3368		zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
3369	}
3370
3371	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3372	zdb_ddt_leak_init(spa, zcb);
3373	spa_config_exit(spa, SCL_CONFIG, FTAG);
3374}
3375
3376static boolean_t
3377zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
3378{
3379	boolean_t leaks = B_FALSE;
3380	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3381	uint64_t total_leaked = 0;
3382
3383	ASSERT(vim != NULL);
3384
3385	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
3386		vdev_indirect_mapping_entry_phys_t *vimep =
3387		    &vim->vim_entries[i];
3388		uint64_t obsolete_bytes = 0;
3389		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
3390		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3391
3392		/*
3393		 * This is not very efficient but it's easy to
3394		 * verify correctness.
3395		 */
3396		for (uint64_t inner_offset = 0;
3397		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
3398		    inner_offset += 1 << vd->vdev_ashift) {
3399			if (range_tree_contains(msp->ms_allocatable,
3400			    offset + inner_offset, 1 << vd->vdev_ashift)) {
3401				obsolete_bytes += 1 << vd->vdev_ashift;
3402			}
3403		}
3404
3405		int64_t bytes_leaked = obsolete_bytes -
3406		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
3407		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
3408		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
3409		if (bytes_leaked != 0 &&
3410		    (vdev_obsolete_counts_are_precise(vd) ||
3411		    dump_opt['d'] >= 5)) {
3412			(void) printf("obsolete indirect mapping count "
3413			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
3414			    (u_longlong_t)vd->vdev_id,
3415			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
3416			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
3417			    (u_longlong_t)bytes_leaked);
3418		}
3419		total_leaked += ABS(bytes_leaked);
3420	}
3421
3422	if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) {
3423		int pct_leaked = total_leaked * 100 /
3424		    vdev_indirect_mapping_bytes_mapped(vim);
3425		(void) printf("cannot verify obsolete indirect mapping "
3426		    "counts of vdev %llu because precise feature was not "
3427		    "enabled when it was removed: %d%% (%llx bytes) of mapping"
3428		    "unreferenced\n",
3429		    (u_longlong_t)vd->vdev_id, pct_leaked,
3430		    (u_longlong_t)total_leaked);
3431	} else if (total_leaked > 0) {
3432		(void) printf("obsolete indirect mapping count mismatch "
3433		    "for vdev %llu -- %llx total bytes mismatched\n",
3434		    (u_longlong_t)vd->vdev_id,
3435		    (u_longlong_t)total_leaked);
3436		leaks |= B_TRUE;
3437	}
3438
3439	vdev_indirect_mapping_free_obsolete_counts(vim,
3440	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
3441	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
3442
3443	return (leaks);
3444}
3445
3446static boolean_t
3447zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
3448{
3449	boolean_t leaks = B_FALSE;
3450	if (!dump_opt['L']) {
3451		vdev_t *rvd = spa->spa_root_vdev;
3452		for (unsigned c = 0; c < rvd->vdev_children; c++) {
3453			vdev_t *vd = rvd->vdev_child[c];
3454			metaslab_group_t *mg = vd->vdev_mg;
3455
3456			if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
3457				leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
3458			}
3459
3460			for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3461				metaslab_t *msp = vd->vdev_ms[m];
3462				ASSERT3P(mg, ==, msp->ms_group);
3463
3464				/*
3465				 * ms_allocatable has been overloaded
3466				 * to contain allocated segments. Now that
3467				 * we finished traversing all blocks, any
3468				 * block that remains in the ms_allocatable
3469				 * represents an allocated block that we
3470				 * did not claim during the traversal.
3471				 * Claimed blocks would have been removed
3472				 * from the ms_allocatable.  For indirect
3473				 * vdevs, space remaining in the tree
3474				 * represents parts of the mapping that are
3475				 * not referenced, which is not a bug.
3476				 */
3477				if (vd->vdev_ops == &vdev_indirect_ops) {
3478					range_tree_vacate(msp->ms_allocatable,
3479					    NULL, NULL);
3480				} else {
3481					range_tree_vacate(msp->ms_allocatable,
3482					    zdb_leak, vd);
3483				}
3484
3485				if (msp->ms_loaded) {
3486					msp->ms_loaded = B_FALSE;
3487				}
3488			}
3489		}
3490
3491		umem_free(zcb->zcb_vd_obsolete_counts,
3492		    rvd->vdev_children * sizeof (uint32_t *));
3493		zcb->zcb_vd_obsolete_counts = NULL;
3494	}
3495	return (leaks);
3496}
3497
3498/* ARGSUSED */
3499static int
3500count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
3501{
3502	zdb_cb_t *zcb = arg;
3503
3504	if (dump_opt['b'] >= 5) {
3505		char blkbuf[BP_SPRINTF_LEN];
3506		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3507		(void) printf("[%s] %s\n",
3508		    "deferred free", blkbuf);
3509	}
3510	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
3511	return (0);
3512}
3513
3514static int
3515dump_block_stats(spa_t *spa)
3516{
3517	zdb_cb_t zcb;
3518	zdb_blkstats_t *zb, *tzb;
3519	uint64_t norm_alloc, norm_space, total_alloc, total_found;
3520	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
3521	boolean_t leaks = B_FALSE;
3522
3523	bzero(&zcb, sizeof (zcb));
3524	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
3525	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
3526	    (dump_opt['c'] == 1) ? "metadata " : "",
3527	    dump_opt['c'] ? "checksums " : "",
3528	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
3529	    !dump_opt['L'] ? "nothing leaked " : "");
3530
3531	/*
3532	 * Load all space maps as SM_ALLOC maps, then traverse the pool
3533	 * claiming each block we discover.  If the pool is perfectly
3534	 * consistent, the space maps will be empty when we're done.
3535	 * Anything left over is a leak; any block we can't claim (because
3536	 * it's not part of any space map) is a double allocation,
3537	 * reference to a freed block, or an unclaimed log block.
3538	 */
3539	zdb_leak_init(spa, &zcb);
3540
3541	/*
3542	 * If there's a deferred-free bplist, process that first.
3543	 */
3544	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
3545	    count_block_cb, &zcb, NULL);
3546
3547	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
3548		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
3549		    count_block_cb, &zcb, NULL);
3550	}
3551
3552	zdb_claim_removing(spa, &zcb);
3553
3554	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
3555		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
3556		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
3557		    &zcb, NULL));
3558	}
3559
3560	if (dump_opt['c'] > 1)
3561		flags |= TRAVERSE_PREFETCH_DATA;
3562
3563	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
3564	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
3565	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
3566
3567	/*
3568	 * If we've traversed the data blocks then we need to wait for those
3569	 * I/Os to complete. We leverage "The Godfather" zio to wait on
3570	 * all async I/Os to complete.
3571	 */
3572	if (dump_opt['c']) {
3573		for (int i = 0; i < max_ncpus; i++) {
3574			(void) zio_wait(spa->spa_async_zio_root[i]);
3575			spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
3576			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
3577			    ZIO_FLAG_GODFATHER);
3578		}
3579	}
3580
3581	if (zcb.zcb_haderrors) {
3582		(void) printf("\nError counts:\n\n");
3583		(void) printf("\t%5s  %s\n", "errno", "count");
3584		for (int e = 0; e < 256; e++) {
3585			if (zcb.zcb_errors[e] != 0) {
3586				(void) printf("\t%5d  %llu\n",
3587				    e, (u_longlong_t)zcb.zcb_errors[e]);
3588			}
3589		}
3590	}
3591
3592	/*
3593	 * Report any leaked segments.
3594	 */
3595	leaks |= zdb_leak_fini(spa, &zcb);
3596
3597	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
3598
3599	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
3600	norm_space = metaslab_class_get_space(spa_normal_class(spa));
3601
3602	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
3603	total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
3604	    zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
3605
3606	if (total_found == total_alloc) {
3607		if (!dump_opt['L'])
3608			(void) printf("\n\tNo leaks (block sum matches space"
3609			    " maps exactly)\n");
3610	} else {
3611		(void) printf("block traversal size %llu != alloc %llu "
3612		    "(%s %lld)\n",
3613		    (u_longlong_t)total_found,
3614		    (u_longlong_t)total_alloc,
3615		    (dump_opt['L']) ? "unreachable" : "leaked",
3616		    (longlong_t)(total_alloc - total_found));
3617		leaks = B_TRUE;
3618	}
3619
3620	if (tzb->zb_count == 0)
3621		return (2);
3622
3623	(void) printf("\n");
3624	(void) printf("\tbp count:      %10llu\n",
3625	    (u_longlong_t)tzb->zb_count);
3626	(void) printf("\tganged count:  %10llu\n",
3627	    (longlong_t)tzb->zb_gangs);
3628	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
3629	    (u_longlong_t)tzb->zb_lsize,
3630	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
3631	(void) printf("\tbp physical:   %10llu      avg:"
3632	    " %6llu     compression: %6.2f\n",
3633	    (u_longlong_t)tzb->zb_psize,
3634	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
3635	    (double)tzb->zb_lsize / tzb->zb_psize);
3636	(void) printf("\tbp allocated:  %10llu      avg:"
3637	    " %6llu     compression: %6.2f\n",
3638	    (u_longlong_t)tzb->zb_asize,
3639	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
3640	    (double)tzb->zb_lsize / tzb->zb_asize);
3641	(void) printf("\tbp deduped:    %10llu    ref>1:"
3642	    " %6llu   deduplication: %6.2f\n",
3643	    (u_longlong_t)zcb.zcb_dedup_asize,
3644	    (u_longlong_t)zcb.zcb_dedup_blocks,
3645	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
3646	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
3647	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
3648
3649	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
3650		if (zcb.zcb_embedded_blocks[i] == 0)
3651			continue;
3652		(void) printf("\n");
3653		(void) printf("\tadditional, non-pointer bps of type %u: "
3654		    "%10llu\n",
3655		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
3656
3657		if (dump_opt['b'] >= 3) {
3658			(void) printf("\t number of (compressed) bytes:  "
3659			    "number of bps\n");
3660			dump_histogram(zcb.zcb_embedded_histogram[i],
3661			    sizeof (zcb.zcb_embedded_histogram[i]) /
3662			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
3663		}
3664	}
3665
3666	if (tzb->zb_ditto_samevdev != 0) {
3667		(void) printf("\tDittoed blocks on same vdev: %llu\n",
3668		    (longlong_t)tzb->zb_ditto_samevdev);
3669	}
3670
3671	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
3672		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
3673		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3674
3675		if (vim == NULL) {
3676			continue;
3677		}
3678
3679		char mem[32];
3680		zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
3681		    mem, vdev_indirect_mapping_size(vim));
3682
3683		(void) printf("\tindirect vdev id %llu has %llu segments "
3684		    "(%s in memory)\n",
3685		    (longlong_t)vd->vdev_id,
3686		    (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
3687	}
3688
3689	if (dump_opt['b'] >= 2) {
3690		int l, t, level;
3691		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
3692		    "\t  avg\t comp\t%%Total\tType\n");
3693
3694		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
3695			char csize[32], lsize[32], psize[32], asize[32];
3696			char avg[32], gang[32];
3697			const char *typename;
3698
3699			/* make sure nicenum has enough space */
3700			CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
3701			CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
3702			CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
3703			CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
3704			CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
3705			CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
3706
3707			if (t < DMU_OT_NUMTYPES)
3708				typename = dmu_ot[t].ot_name;
3709			else
3710				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
3711
3712			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
3713				(void) printf("%6s\t%5s\t%5s\t%5s"
3714				    "\t%5s\t%5s\t%6s\t%s\n",
3715				    "-",
3716				    "-",
3717				    "-",
3718				    "-",
3719				    "-",
3720				    "-",
3721				    "-",
3722				    typename);
3723				continue;
3724			}
3725
3726			for (l = ZB_TOTAL - 1; l >= -1; l--) {
3727				level = (l == -1 ? ZB_TOTAL : l);
3728				zb = &zcb.zcb_type[level][t];
3729
3730				if (zb->zb_asize == 0)
3731					continue;
3732
3733				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
3734					continue;
3735
3736				if (level == 0 && zb->zb_asize ==
3737				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
3738					continue;
3739
3740				zdb_nicenum(zb->zb_count, csize,
3741				    sizeof (csize));
3742				zdb_nicenum(zb->zb_lsize, lsize,
3743				    sizeof (lsize));
3744				zdb_nicenum(zb->zb_psize, psize,
3745				    sizeof (psize));
3746				zdb_nicenum(zb->zb_asize, asize,
3747				    sizeof (asize));
3748				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
3749				    sizeof (avg));
3750				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
3751
3752				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
3753				    "\t%5.2f\t%6.2f\t",
3754				    csize, lsize, psize, asize, avg,
3755				    (double)zb->zb_lsize / zb->zb_psize,
3756				    100.0 * zb->zb_asize / tzb->zb_asize);
3757
3758				if (level == ZB_TOTAL)
3759					(void) printf("%s\n", typename);
3760				else
3761					(void) printf("    L%d %s\n",
3762					    level, typename);
3763
3764				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
3765					(void) printf("\t number of ganged "
3766					    "blocks: %s\n", gang);
3767				}
3768
3769				if (dump_opt['b'] >= 4) {
3770					(void) printf("psize "
3771					    "(in 512-byte sectors): "
3772					    "number of blocks\n");
3773					dump_histogram(zb->zb_psize_histogram,
3774					    PSIZE_HISTO_SIZE, 0);
3775				}
3776			}
3777		}
3778	}
3779
3780	(void) printf("\n");
3781
3782	if (leaks)
3783		return (2);
3784
3785	if (zcb.zcb_haderrors)
3786		return (3);
3787
3788	return (0);
3789}
3790
3791typedef struct zdb_ddt_entry {
3792	ddt_key_t	zdde_key;
3793	uint64_t	zdde_ref_blocks;
3794	uint64_t	zdde_ref_lsize;
3795	uint64_t	zdde_ref_psize;
3796	uint64_t	zdde_ref_dsize;
3797	avl_node_t	zdde_node;
3798} zdb_ddt_entry_t;
3799
3800/* ARGSUSED */
3801static int
3802zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
3803    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
3804{
3805	avl_tree_t *t = arg;
3806	avl_index_t where;
3807	zdb_ddt_entry_t *zdde, zdde_search;
3808
3809	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
3810		return (0);
3811
3812	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
3813		(void) printf("traversing objset %llu, %llu objects, "
3814		    "%lu blocks so far\n",
3815		    (u_longlong_t)zb->zb_objset,
3816		    (u_longlong_t)BP_GET_FILL(bp),
3817		    avl_numnodes(t));
3818	}
3819
3820	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
3821	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
3822		return (0);
3823
3824	ddt_key_fill(&zdde_search.zdde_key, bp);
3825
3826	zdde = avl_find(t, &zdde_search, &where);
3827
3828	if (zdde == NULL) {
3829		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
3830		zdde->zdde_key = zdde_search.zdde_key;
3831		avl_insert(t, zdde, where);
3832	}
3833
3834	zdde->zdde_ref_blocks += 1;
3835	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
3836	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
3837	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
3838
3839	return (0);
3840}
3841
3842static void
3843dump_simulated_ddt(spa_t *spa)
3844{
3845	avl_tree_t t;
3846	void *cookie = NULL;
3847	zdb_ddt_entry_t *zdde;
3848	ddt_histogram_t ddh_total;
3849	ddt_stat_t dds_total;
3850
3851	bzero(&ddh_total, sizeof (ddh_total));
3852	bzero(&dds_total, sizeof (dds_total));
3853	avl_create(&t, ddt_entry_compare,
3854	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
3855
3856	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3857
3858	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
3859	    zdb_ddt_add_cb, &t);
3860
3861	spa_config_exit(spa, SCL_CONFIG, FTAG);
3862
3863	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
3864		ddt_stat_t dds;
3865		uint64_t refcnt = zdde->zdde_ref_blocks;
3866		ASSERT(refcnt != 0);
3867
3868		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
3869		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
3870		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
3871		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
3872
3873		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
3874		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
3875		dds.dds_ref_psize = zdde->zdde_ref_psize;
3876		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
3877
3878		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
3879		    &dds, 0);
3880
3881		umem_free(zdde, sizeof (*zdde));
3882	}
3883
3884	avl_destroy(&t);
3885
3886	ddt_histogram_stat(&dds_total, &ddh_total);
3887
3888	(void) printf("Simulated DDT histogram:\n");
3889
3890	zpool_dump_ddt(&dds_total, &ddh_total);
3891
3892	dump_dedup_ratio(&dds_total);
3893}
3894
3895static int
3896verify_device_removal_feature_counts(spa_t *spa)
3897{
3898	uint64_t dr_feature_refcount = 0;
3899	uint64_t oc_feature_refcount = 0;
3900	uint64_t indirect_vdev_count = 0;
3901	uint64_t precise_vdev_count = 0;
3902	uint64_t obsolete_counts_object_count = 0;
3903	uint64_t obsolete_sm_count = 0;
3904	uint64_t obsolete_counts_count = 0;
3905	uint64_t scip_count = 0;
3906	uint64_t obsolete_bpobj_count = 0;
3907	int ret = 0;
3908
3909	spa_condensing_indirect_phys_t *scip =
3910	    &spa->spa_condensing_indirect_phys;
3911	if (scip->scip_next_mapping_object != 0) {
3912		vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
3913		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
3914		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
3915
3916		(void) printf("Condensing indirect vdev %llu: new mapping "
3917		    "object %llu, prev obsolete sm %llu\n",
3918		    (u_longlong_t)scip->scip_vdev,
3919		    (u_longlong_t)scip->scip_next_mapping_object,
3920		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
3921		if (scip->scip_prev_obsolete_sm_object != 0) {
3922			space_map_t *prev_obsolete_sm = NULL;
3923			VERIFY0(space_map_open(&prev_obsolete_sm,
3924			    spa->spa_meta_objset,
3925			    scip->scip_prev_obsolete_sm_object,
3926			    0, vd->vdev_asize, 0));
3927			space_map_update(prev_obsolete_sm);
3928			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
3929			(void) printf("\n");
3930			space_map_close(prev_obsolete_sm);
3931		}
3932
3933		scip_count += 2;
3934	}
3935
3936	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
3937		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
3938		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
3939
3940		if (vic->vic_mapping_object != 0) {
3941			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
3942			    vd->vdev_removing);
3943			indirect_vdev_count++;
3944
3945			if (vd->vdev_indirect_mapping->vim_havecounts) {
3946				obsolete_counts_count++;
3947			}
3948		}
3949		if (vdev_obsolete_counts_are_precise(vd)) {
3950			ASSERT(vic->vic_mapping_object != 0);
3951			precise_vdev_count++;
3952		}
3953		if (vdev_obsolete_sm_object(vd) != 0) {
3954			ASSERT(vic->vic_mapping_object != 0);
3955			obsolete_sm_count++;
3956		}
3957	}
3958
3959	(void) feature_get_refcount(spa,
3960	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
3961	    &dr_feature_refcount);
3962	(void) feature_get_refcount(spa,
3963	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
3964	    &oc_feature_refcount);
3965
3966	if (dr_feature_refcount != indirect_vdev_count) {
3967		ret = 1;
3968		(void) printf("Number of indirect vdevs (%llu) " \
3969		    "does not match feature count (%llu)\n",
3970		    (u_longlong_t)indirect_vdev_count,
3971		    (u_longlong_t)dr_feature_refcount);
3972	} else {
3973		(void) printf("Verified device_removal feature refcount " \
3974		    "of %llu is correct\n",
3975		    (u_longlong_t)dr_feature_refcount);
3976	}
3977
3978	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
3979	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
3980		obsolete_bpobj_count++;
3981	}
3982
3983
3984	obsolete_counts_object_count = precise_vdev_count;
3985	obsolete_counts_object_count += obsolete_sm_count;
3986	obsolete_counts_object_count += obsolete_counts_count;
3987	obsolete_counts_object_count += scip_count;
3988	obsolete_counts_object_count += obsolete_bpobj_count;
3989	obsolete_counts_object_count += remap_deadlist_count;
3990
3991	if (oc_feature_refcount != obsolete_counts_object_count) {
3992		ret = 1;
3993		(void) printf("Number of obsolete counts objects (%llu) " \
3994		    "does not match feature count (%llu)\n",
3995		    (u_longlong_t)obsolete_counts_object_count,
3996		    (u_longlong_t)oc_feature_refcount);
3997		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
3998		    "ob:%llu rd:%llu\n",
3999		    (u_longlong_t)precise_vdev_count,
4000		    (u_longlong_t)obsolete_sm_count,
4001		    (u_longlong_t)obsolete_counts_count,
4002		    (u_longlong_t)scip_count,
4003		    (u_longlong_t)obsolete_bpobj_count,
4004		    (u_longlong_t)remap_deadlist_count);
4005	} else {
4006		(void) printf("Verified indirect_refcount feature refcount " \
4007		    "of %llu is correct\n",
4008		    (u_longlong_t)oc_feature_refcount);
4009	}
4010	return (ret);
4011}
4012
4013#define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
4014/*
4015 * Import the checkpointed state of the pool specified by the target
4016 * parameter as readonly. The function also accepts a pool config
4017 * as an optional parameter, else it attempts to infer the config by
4018 * the name of the target pool.
4019 *
4020 * Note that the checkpointed state's pool name will be the name of
4021 * the original pool with the above suffix appened to it. In addition,
4022 * if the target is not a pool name (e.g. a path to a dataset) then
4023 * the new_path parameter is populated with the updated path to
4024 * reflect the fact that we are looking into the checkpointed state.
4025 *
4026 * The function returns a newly-allocated copy of the name of the
4027 * pool containing the checkpointed state. When this copy is no
4028 * longer needed it should be freed with free(3C). Same thing
4029 * applies to the new_path parameter if allocated.
4030 */
4031static char *
4032import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
4033{
4034	int error = 0;
4035	char *poolname, *bogus_name;
4036
4037	/* If the target is not a pool, the extract the pool name */
4038	char *path_start = strchr(target, '/');
4039	if (path_start != NULL) {
4040		size_t poolname_len = path_start - target;
4041		poolname = strndup(target, poolname_len);
4042	} else {
4043		poolname = target;
4044	}
4045
4046	if (cfg == NULL) {
4047		error = spa_get_stats(poolname, &cfg, NULL, 0);
4048		if (error != 0) {
4049			fatal("Tried to read config of pool \"%s\" but "
4050			    "spa_get_stats() failed with error %d\n",
4051			    poolname, error);
4052		}
4053	}
4054
4055	(void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX);
4056	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
4057
4058	error = spa_import(bogus_name, cfg, NULL,
4059	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT);
4060	if (error != 0) {
4061		fatal("Tried to import pool \"%s\" but spa_import() failed "
4062		    "with error %d\n", bogus_name, error);
4063	}
4064
4065	if (new_path != NULL && path_start != NULL)
4066		(void) asprintf(new_path, "%s%s", bogus_name, path_start);
4067
4068	if (target != poolname)
4069		free(poolname);
4070
4071	return (bogus_name);
4072}
4073
/*
 * Private argument handed to verify_checkpoint_sm_entry_cb() while a
 * vdev's checkpoint space map is being iterated.
 */
typedef struct verify_checkpoint_sm_entry_cb_arg {
	/* vdev whose metaslabs the space map entries are checked against */
	vdev_t *vcsec_vd;

	/* the following fields are only used for printing progress */
	uint64_t vcsec_entryid;
	uint64_t vcsec_num_entries;
} verify_checkpoint_sm_entry_cb_arg_t;
4081
4082#define	ENTRIES_PER_PROGRESS_UPDATE 10000
4083
4084static int
4085verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
4086    void *arg)
4087{
4088	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
4089	vdev_t *vd = vcsec->vcsec_vd;
4090	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4091	uint64_t end = offset + size;
4092
4093	ASSERT(type == SM_FREE);
4094
4095	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
4096		(void) fprintf(stderr,
4097		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
4098		    (longlong_t)vd->vdev_id,
4099		    (longlong_t)vcsec->vcsec_entryid,
4100		    (longlong_t)vcsec->vcsec_num_entries);
4101	}
4102	vcsec->vcsec_entryid++;
4103
4104	/*
4105	 * See comment in checkpoint_sm_exclude_entry_cb()
4106	 */
4107	VERIFY3U(offset, >=, ms->ms_start);
4108	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
4109
4110	/*
4111	 * The entries in the vdev_checkpoint_sm should be marked as
4112	 * allocated in the checkpointed state of the pool, therefore
4113	 * their respective ms_allocateable trees should not contain them.
4114	 */
4115	mutex_enter(&ms->ms_lock);
4116	range_tree_verify(ms->ms_allocatable, offset, size);
4117	mutex_exit(&ms->ms_lock);
4118
4119	return (0);
4120}
4121
4122/*
4123 * Verify that all segments in the vdev_checkpoint_sm are allocated
4124 * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
4125 * ms_allocatable).
4126 *
4127 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
4128 * each vdev in the current state of the pool to the metaslab space maps
4129 * (ms_sm) of the checkpointed state of the pool.
4130 *
4131 * Note that the function changes the state of the ms_allocatable
4132 * trees of the current spa_t. The entries of these ms_allocatable
4133 * trees are cleared out and then repopulated from with the free
4134 * entries of their respective ms_sm space maps.
4135 */
4136static void
4137verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
4138{
4139	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
4140	vdev_t *current_rvd = current->spa_root_vdev;
4141
4142	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
4143
4144	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
4145		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
4146		vdev_t *current_vd = current_rvd->vdev_child[c];
4147
4148		space_map_t *checkpoint_sm = NULL;
4149		uint64_t checkpoint_sm_obj;
4150
4151		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
4152			/*
4153			 * Since we don't allow device removal in a pool
4154			 * that has a checkpoint, we expect that all removed
4155			 * vdevs were removed from the pool before the
4156			 * checkpoint.
4157			 */
4158			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
4159			continue;
4160		}
4161
4162		/*
4163		 * If the checkpoint space map doesn't exist, then nothing
4164		 * here is checkpointed so there's nothing to verify.
4165		 */
4166		if (current_vd->vdev_top_zap == 0 ||
4167		    zap_contains(spa_meta_objset(current),
4168		    current_vd->vdev_top_zap,
4169		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4170			continue;
4171
4172		VERIFY0(zap_lookup(spa_meta_objset(current),
4173		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4174		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
4175
4176		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
4177		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
4178		    current_vd->vdev_ashift));
4179		space_map_update(checkpoint_sm);
4180
4181		verify_checkpoint_sm_entry_cb_arg_t vcsec;
4182		vcsec.vcsec_vd = ckpoint_vd;
4183		vcsec.vcsec_entryid = 0;
4184		vcsec.vcsec_num_entries =
4185		    space_map_length(checkpoint_sm) / sizeof (uint64_t);
4186		VERIFY0(space_map_iterate(checkpoint_sm,
4187		    verify_checkpoint_sm_entry_cb, &vcsec));
4188		dump_spacemap(current->spa_meta_objset, checkpoint_sm);
4189		space_map_close(checkpoint_sm);
4190	}
4191
4192	/*
4193	 * If we've added vdevs since we took the checkpoint, ensure
4194	 * that their checkpoint space maps are empty.
4195	 */
4196	if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
4197		for (uint64_t c = ckpoint_rvd->vdev_children;
4198		    c < current_rvd->vdev_children; c++) {
4199			vdev_t *current_vd = current_rvd->vdev_child[c];
4200			ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
4201		}
4202	}
4203
4204	/* for cleaner progress output */
4205	(void) fprintf(stderr, "\n");
4206}
4207
4208/*
4209 * Verifies that all space that's allocated in the checkpoint is
4210 * still allocated in the current version, by checking that everything
4211 * in checkpoint's ms_allocatable (which is actually allocated, not
4212 * allocatable/free) is not present in current's ms_allocatable.
4213 *
4214 * Note that the function changes the state of the ms_allocatable
4215 * trees of both spas when called. The entries of all ms_allocatable
4216 * trees are cleared out and then repopulated from their respective
4217 * ms_sm space maps. In the checkpointed state we load the allocated
4218 * entries, and in the current state we load the free entries.
4219 */
4220static void
4221verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
4222{
4223	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
4224	vdev_t *current_rvd = current->spa_root_vdev;
4225
4226	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
4227	load_concrete_ms_allocatable_trees(current, SM_FREE);
4228
4229	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
4230		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
4231		vdev_t *current_vd = current_rvd->vdev_child[i];
4232
4233		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
4234			/*
4235			 * See comment in verify_checkpoint_vdev_spacemaps()
4236			 */
4237			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
4238			continue;
4239		}
4240
4241		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
4242			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
4243			metaslab_t *current_msp = current_vd->vdev_ms[m];
4244
4245			(void) fprintf(stderr,
4246			    "\rverifying vdev %llu of %llu, "
4247			    "metaslab %llu of %llu ...",
4248			    (longlong_t)current_vd->vdev_id,
4249			    (longlong_t)current_rvd->vdev_children,
4250			    (longlong_t)current_vd->vdev_ms[m]->ms_id,
4251			    (longlong_t)current_vd->vdev_ms_count);
4252
4253			/*
4254			 * We walk through the ms_allocatable trees that
4255			 * are loaded with the allocated blocks from the
4256			 * ms_sm spacemaps of the checkpoint. For each
4257			 * one of these ranges we ensure that none of them
4258			 * exists in the ms_allocatable trees of the
4259			 * current state which are loaded with the ranges
4260			 * that are currently free.
4261			 *
4262			 * This way we ensure that none of the blocks that
4263			 * are part of the checkpoint were freed by mistake.
4264			 */
4265			range_tree_walk(ckpoint_msp->ms_allocatable,
4266			    (range_tree_func_t *)range_tree_verify,
4267			    current_msp->ms_allocatable);
4268		}
4269	}
4270
4271	/* for cleaner progress output */
4272	(void) fprintf(stderr, "\n");
4273}
4274
4275static void
4276verify_checkpoint_blocks(spa_t *spa)
4277{
4278	spa_t *checkpoint_spa;
4279	char *checkpoint_pool;
4280	nvlist_t *config = NULL;
4281	int error = 0;
4282
4283	/*
4284	 * We import the checkpointed state of the pool (under a different
4285	 * name) so we can do verification on it against the current state
4286	 * of the pool.
4287	 */
4288	checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
4289	    NULL);
4290	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
4291
4292	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
4293	if (error != 0) {
4294		fatal("Tried to open pool \"%s\" but spa_open() failed with "
4295		    "error %d\n", checkpoint_pool, error);
4296	}
4297
4298	/*
4299	 * Ensure that ranges in the checkpoint space maps of each vdev
4300	 * are allocated according to the checkpointed state's metaslab
4301	 * space maps.
4302	 */
4303	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
4304
4305	/*
4306	 * Ensure that allocated ranges in the checkpoint's metaslab
4307	 * space maps remain allocated in the metaslab space maps of
4308	 * the current state.
4309	 */
4310	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
4311
4312	/*
4313	 * Once we are done, we get rid of the checkpointed state.
4314	 */
4315	spa_close(checkpoint_spa, FTAG);
4316	free(checkpoint_pool);
4317}
4318
4319static void
4320dump_leftover_checkpoint_blocks(spa_t *spa)
4321{
4322	vdev_t *rvd = spa->spa_root_vdev;
4323
4324	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
4325		vdev_t *vd = rvd->vdev_child[i];
4326
4327		space_map_t *checkpoint_sm = NULL;
4328		uint64_t checkpoint_sm_obj;
4329
4330		if (vd->vdev_top_zap == 0)
4331			continue;
4332
4333		if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
4334		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4335			continue;
4336
4337		VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
4338		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4339		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
4340
4341		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
4342		    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
4343		space_map_update(checkpoint_sm);
4344		dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
4345		space_map_close(checkpoint_sm);
4346	}
4347}
4348
4349static int
4350verify_checkpoint(spa_t *spa)
4351{
4352	uberblock_t checkpoint;
4353	int error;
4354
4355	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
4356		return (0);
4357
4358	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4359	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
4360	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
4361
4362	if (error == ENOENT) {
4363		/*
4364		 * If the feature is active but the uberblock is missing
4365		 * then we must be in the middle of discarding the
4366		 * checkpoint.
4367		 */
4368		(void) printf("\nPartially discarded checkpoint "
4369		    "state found:\n");
4370		dump_leftover_checkpoint_blocks(spa);
4371		return (0);
4372	} else if (error != 0) {
4373		(void) printf("lookup error %d when looking for "
4374		    "checkpointed uberblock in MOS\n", error);
4375		return (error);
4376	}
4377	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
4378
4379	if (checkpoint.ub_checkpoint_txg == 0) {
4380		(void) printf("\nub_checkpoint_txg not set in checkpointed "
4381		    "uberblock\n");
4382		error = 3;
4383	}
4384
4385	if (error == 0)
4386		verify_checkpoint_blocks(spa);
4387
4388	return (error);
4389}
4390
4391static void
4392dump_zpool(spa_t *spa)
4393{
4394	dsl_pool_t *dp = spa_get_dsl(spa);
4395	int rc = 0;
4396
4397	if (dump_opt['S']) {
4398		dump_simulated_ddt(spa);
4399		return;
4400	}
4401
4402	if (!dump_opt['e'] && dump_opt['C'] > 1) {
4403		(void) printf("\nCached configuration:\n");
4404		dump_nvlist(spa->spa_config, 8);
4405	}
4406
4407	if (dump_opt['C'])
4408		dump_config(spa);
4409
4410	if (dump_opt['u'])
4411		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
4412
4413	if (dump_opt['D'])
4414		dump_all_ddts(spa);
4415
4416	if (dump_opt['d'] > 2 || dump_opt['m'])
4417		dump_metaslabs(spa);
4418	if (dump_opt['M'])
4419		dump_metaslab_groups(spa);
4420
4421	if (dump_opt['d'] || dump_opt['i']) {
4422		dump_dir(dp->dp_meta_objset);
4423		if (dump_opt['d'] >= 3) {
4424			dsl_pool_t *dp = spa->spa_dsl_pool;
4425			dump_full_bpobj(&spa->spa_deferred_bpobj,
4426			    "Deferred frees", 0);
4427			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
4428				dump_full_bpobj(&dp->dp_free_bpobj,
4429				    "Pool snapshot frees", 0);
4430			}
4431			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
4432				ASSERT(spa_feature_is_enabled(spa,
4433				    SPA_FEATURE_DEVICE_REMOVAL));
4434				dump_full_bpobj(&dp->dp_obsolete_bpobj,
4435				    "Pool obsolete blocks", 0);
4436			}
4437
4438			if (spa_feature_is_active(spa,
4439			    SPA_FEATURE_ASYNC_DESTROY)) {
4440				dump_bptree(spa->spa_meta_objset,
4441				    dp->dp_bptree_obj,
4442				    "Pool dataset frees");
4443			}
4444			dump_dtl(spa->spa_root_vdev, 0);
4445		}
4446		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
4447		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
4448
4449		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
4450			uint64_t refcount;
4451
4452			if (!(spa_feature_table[f].fi_flags &
4453			    ZFEATURE_FLAG_PER_DATASET) ||
4454			    !spa_feature_is_enabled(spa, f)) {
4455				ASSERT0(dataset_feature_count[f]);
4456				continue;
4457			}
4458			(void) feature_get_refcount(spa,
4459			    &spa_feature_table[f], &refcount);
4460			if (dataset_feature_count[f] != refcount) {
4461				(void) printf("%s feature refcount mismatch: "
4462				    "%lld datasets != %lld refcount\n",
4463				    spa_feature_table[f].fi_uname,
4464				    (longlong_t)dataset_feature_count[f],
4465				    (longlong_t)refcount);
4466				rc = 2;
4467			} else {
4468				(void) printf("Verified %s feature refcount "
4469				    "of %llu is correct\n",
4470				    spa_feature_table[f].fi_uname,
4471				    (longlong_t)refcount);
4472			}
4473		}
4474
4475		if (rc == 0) {
4476			rc = verify_device_removal_feature_counts(spa);
4477		}
4478	}
4479	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
4480		rc = dump_block_stats(spa);
4481
4482	if (rc == 0)
4483		rc = verify_spacemap_refcounts(spa);
4484
4485	if (dump_opt['s'])
4486		show_pool_stats(spa);
4487
4488	if (dump_opt['h'])
4489		dump_history(spa);
4490
4491	if (rc == 0 && !dump_opt['L'])
4492		rc = verify_checkpoint(spa);
4493
4494	if (rc != 0) {
4495		dump_debug_buffer();
4496		exit(rc);
4497	}
4498}
4499
/*
 * Flag bits that zdb_read_block() accumulates from the flag characters of
 * a block descriptor.  Uses visible in this part of the file:
 *
 *	CHECKSUM	parsed as a flag that may be followed by an argument
 *	DECOMPRESS	try to decompress the data that was read
 *	BSWAP		byteswap data before printing it
 *	GBH		set the gang bit in the DVA; dump a gang block header
 *	INDIRECT	dump the data as an array of block pointers
 *	RAW		write the data to stdout unformatted
 *	PRINT_BLKPTR	takes a hex offset; print the blkptr found there
 *
 * ZDB_FLAG_PHYS is not referenced in this part of the file.
 */
#define	ZDB_FLAG_CHECKSUM	0x0001
#define	ZDB_FLAG_DECOMPRESS	0x0002
#define	ZDB_FLAG_BSWAP		0x0004
#define	ZDB_FLAG_GBH		0x0008
#define	ZDB_FLAG_INDIRECT	0x0010
#define	ZDB_FLAG_PHYS		0x0020
#define	ZDB_FLAG_RAW		0x0040
#define	ZDB_FLAG_PRINT_BLKPTR	0x0080

/*
 * Indexed by flag character (as an unsigned char) to get the matching
 * ZDB_FLAG_* bit; zdb_read_block() treats a zero entry as an invalid flag.
 * NOTE(review): the table is filled in elsewhere in the file -- confirm
 * which character maps to which bit there.
 */
static int flagbits[256];
4510
4511static void
4512zdb_print_blkptr(blkptr_t *bp, int flags)
4513{
4514	char blkbuf[BP_SPRINTF_LEN];
4515
4516	if (flags & ZDB_FLAG_BSWAP)
4517		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
4518
4519	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
4520	(void) printf("%s\n", blkbuf);
4521}
4522
4523static void
4524zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
4525{
4526	int i;
4527
4528	for (i = 0; i < nbps; i++)
4529		zdb_print_blkptr(&bp[i], flags);
4530}
4531
4532static void
4533zdb_dump_gbh(void *buf, int flags)
4534{
4535	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
4536}
4537
4538static void
4539zdb_dump_block_raw(void *buf, uint64_t size, int flags)
4540{
4541	if (flags & ZDB_FLAG_BSWAP)
4542		byteswap_uint64_array(buf, size);
4543	(void) write(1, buf, size);
4544}
4545
4546static void
4547zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
4548{
4549	uint64_t *d = (uint64_t *)buf;
4550	unsigned nwords = size / sizeof (uint64_t);
4551	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
4552	unsigned i, j;
4553	const char *hdr;
4554	char *c;
4555
4556
4557	if (do_bswap)
4558		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
4559	else
4560		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
4561
4562	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
4563
4564	for (i = 0; i < nwords; i += 2) {
4565		(void) printf("%06llx:  %016llx  %016llx  ",
4566		    (u_longlong_t)(i * sizeof (uint64_t)),
4567		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
4568		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
4569
4570		c = (char *)&d[i];
4571		for (j = 0; j < 2 * sizeof (uint64_t); j++)
4572			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
4573		(void) printf("\n");
4574	}
4575}
4576
4577/*
4578 * There are two acceptable formats:
4579 *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
4580 *	child[.child]*    - For example: 0.1.1
4581 *
4582 * The second form can be used to specify arbitrary vdevs anywhere
4583 * in the heirarchy.  For example, in a pool with a mirror of
4584 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
4585 */
4586static vdev_t *
4587zdb_vdev_lookup(vdev_t *vdev, const char *path)
4588{
4589	char *s, *p, *q;
4590	unsigned i;
4591
4592	if (vdev == NULL)
4593		return (NULL);
4594
4595	/* First, assume the x.x.x.x format */
4596	i = strtoul(path, &s, 10);
4597	if (s == path || (s && *s != '.' && *s != '\0'))
4598		goto name;
4599	if (i >= vdev->vdev_children)
4600		return (NULL);
4601
4602	vdev = vdev->vdev_child[i];
4603	if (*s == '\0')
4604		return (vdev);
4605	return (zdb_vdev_lookup(vdev, s+1));
4606
4607name:
4608	for (i = 0; i < vdev->vdev_children; i++) {
4609		vdev_t *vc = vdev->vdev_child[i];
4610
4611		if (vc->vdev_path == NULL) {
4612			vc = zdb_vdev_lookup(vc, path);
4613			if (vc == NULL)
4614				continue;
4615			else
4616				return (vc);
4617		}
4618
4619		p = strrchr(vc->vdev_path, '/');
4620		p = p ? p + 1 : vc->vdev_path;
4621		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
4622
4623		if (strcmp(vc->vdev_path, path) == 0)
4624			return (vc);
4625		if (strcmp(p, path) == 0)
4626			return (vc);
4627		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
4628			return (vc);
4629	}
4630
4631	return (NULL);
4632}
4633
/*
 * Adapter around random_get_pseudo_bytes() for callers that need a
 * callback with a trailing private argument; zdb_read_block() passes it
 * to abd_iterate_func().  The third argument is ignored and the result of
 * random_get_pseudo_bytes() is returned unchanged.
 */
/* ARGSUSED */
static int
random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
{
	return (random_get_pseudo_bytes(buf, len));
}
4640
4641/*
4642 * Read a block from a pool and print it out.  The syntax of the
4643 * block descriptor is:
4644 *
4645 *	pool:vdev_specifier:offset:size[:flags]
4646 *
4647 *	pool           - The name of the pool you wish to read from
4648 *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
4649 *	offset         - offset, in hex, in bytes
4650 *	size           - Amount of data to read, in hex, in bytes
4651 *	flags          - A string of characters specifying options
4652 *		 b: Decode a blkptr at given offset within block
4653 *		*c: Calculate and display checksums
4654 *		 d: Decompress data before dumping
4655 *		 e: Byteswap data before dumping
4656 *		 g: Display data as a gang block header
4657 *		 i: Display as an indirect block
4658 *		 p: Do I/O to physical offset
4659 *		 r: Dump raw data to stdout
4660 *
4661 *              * = not yet implemented
4662 */
4663static void
4664zdb_read_block(char *thing, spa_t *spa)
4665{
4666	blkptr_t blk, *bp = &blk;
4667	dva_t *dva = bp->blk_dva;
4668	int flags = 0;
4669	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
4670	zio_t *zio;
4671	vdev_t *vd;
4672	abd_t *pabd;
4673	void *lbuf, *buf;
4674	const char *s, *vdev;
4675	char *p, *dup, *flagstr;
4676	int i, error;
4677
4678	dup = strdup(thing);
4679	s = strtok(dup, ":");
4680	vdev = s ? s : "";
4681	s = strtok(NULL, ":");
4682	offset = strtoull(s ? s : "", NULL, 16);
4683	s = strtok(NULL, ":");
4684	size = strtoull(s ? s : "", NULL, 16);
4685	s = strtok(NULL, ":");
4686	if (s)
4687		flagstr = strdup(s);
4688	else
4689		flagstr = strdup("");
4690
4691	s = NULL;
4692	if (size == 0)
4693		s = "size must not be zero";
4694	if (!IS_P2ALIGNED(size, DEV_BSIZE))
4695		s = "size must be a multiple of sector size";
4696	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
4697		s = "offset must be a multiple of sector size";
4698	if (s) {
4699		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
4700		free(dup);
4701		return;
4702	}
4703
4704	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
4705		for (i = 0; flagstr[i]; i++) {
4706			int bit = flagbits[(uchar_t)flagstr[i]];
4707
4708			if (bit == 0) {
4709				(void) printf("***Invalid flag: %c\n",
4710				    flagstr[i]);
4711				continue;
4712			}
4713			flags |= bit;
4714
4715			/* If it's not something with an argument, keep going */
4716			if ((bit & (ZDB_FLAG_CHECKSUM |
4717			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
4718				continue;
4719
4720			p = &flagstr[i + 1];
4721			if (bit == ZDB_FLAG_PRINT_BLKPTR)
4722				blkptr_offset = strtoull(p, &p, 16);
4723			if (*p != ':' && *p != '\0') {
4724				(void) printf("***Invalid flag arg: '%s'\n", s);
4725				free(dup);
4726				return;
4727			}
4728		}
4729	}
4730	free(flagstr);
4731
4732	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
4733	if (vd == NULL) {
4734		(void) printf("***Invalid vdev: %s\n", vdev);
4735		free(dup);
4736		return;
4737	} else {
4738		if (vd->vdev_path)
4739			(void) fprintf(stderr, "Found vdev: %s\n",
4740			    vd->vdev_path);
4741		else
4742			(void) fprintf(stderr, "Found vdev type: %s\n",
4743			    vd->vdev_ops->vdev_op_type);
4744	}
4745
4746	psize = size;
4747	lsize = size;
4748
4749	pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
4750	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
4751
4752	BP_ZERO(bp);
4753
4754	DVA_SET_VDEV(&dva[0], vd->vdev_id);
4755	DVA_SET_OFFSET(&dva[0], offset);
4756	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
4757	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
4758
4759	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
4760
4761	BP_SET_LSIZE(bp, lsize);
4762	BP_SET_PSIZE(bp, psize);
4763	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
4764	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
4765	BP_SET_TYPE(bp, DMU_OT_NONE);
4766	BP_SET_LEVEL(bp, 0);
4767	BP_SET_DEDUP(bp, 0);
4768	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
4769
4770	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4771	zio = zio_root(spa, NULL, NULL, 0);
4772
4773	if (vd == vd->vdev_top) {
4774		/*
4775		 * Treat this as a normal block read.
4776		 */
4777		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
4778		    ZIO_PRIORITY_SYNC_READ,
4779		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
4780	} else {
4781		/*
4782		 * Treat this as a vdev child I/O.
4783		 */
4784		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
4785		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
4786		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
4787		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
4788		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
4789		    NULL, NULL));
4790	}
4791
4792	error = zio_wait(zio);
4793	spa_config_exit(spa, SCL_STATE, FTAG);
4794
4795	if (error) {
4796		(void) printf("Read of %s failed, error: %d\n", thing, error);
4797		goto out;
4798	}
4799
4800	if (flags & ZDB_FLAG_DECOMPRESS) {
4801		/*
4802		 * We don't know how the data was compressed, so just try
4803		 * every decompress function at every inflated blocksize.
4804		 */
4805		enum zio_compress c;
4806		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
4807		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
4808
4809		abd_copy_to_buf(pbuf2, pabd, psize);
4810
4811		VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
4812		    random_get_pseudo_bytes_cb, NULL));
4813
4814		VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
4815		    SPA_MAXBLOCKSIZE - psize));
4816
4817		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
4818		    lsize -= SPA_MINBLOCKSIZE) {
4819			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
4820				if (zio_decompress_data(c, pabd,
4821				    lbuf, psize, lsize) == 0 &&
4822				    zio_decompress_data_buf(c, pbuf2,
4823				    lbuf2, psize, lsize) == 0 &&
4824				    bcmp(lbuf, lbuf2, lsize) == 0)
4825					break;
4826			}
4827			if (c != ZIO_COMPRESS_FUNCTIONS)
4828				break;
4829			lsize -= SPA_MINBLOCKSIZE;
4830		}
4831
4832		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
4833		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
4834
4835		if (lsize <= psize) {
4836			(void) printf("Decompress of %s failed\n", thing);
4837			goto out;
4838		}
4839		buf = lbuf;
4840		size = lsize;
4841	} else {
4842		buf = abd_to_buf(pabd);
4843		size = psize;
4844	}
4845
4846	if (flags & ZDB_FLAG_PRINT_BLKPTR)
4847		zdb_print_blkptr((blkptr_t *)(void *)
4848		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
4849	else if (flags & ZDB_FLAG_RAW)
4850		zdb_dump_block_raw(buf, size, flags);
4851	else if (flags & ZDB_FLAG_INDIRECT)
4852		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
4853		    flags);
4854	else if (flags & ZDB_FLAG_GBH)
4855		zdb_dump_gbh(buf, flags);
4856	else
4857		zdb_dump_block(thing, buf, size, flags);
4858
4859out:
4860	abd_free(pabd);
4861	umem_free(lbuf, SPA_MAXBLOCKSIZE);
4862	free(dup);
4863}
4864
4865static void
4866zdb_embedded_block(char *thing)
4867{
4868	blkptr_t bp;
4869	unsigned long long *words = (void *)&bp;
4870	char buf[SPA_MAXBLOCKSIZE];
4871	int err;
4872
4873	bzero(&bp, sizeof (bp));
4874	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
4875	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
4876	    words + 0, words + 1, words + 2, words + 3,
4877	    words + 4, words + 5, words + 6, words + 7,
4878	    words + 8, words + 9, words + 10, words + 11,
4879	    words + 12, words + 13, words + 14, words + 15);
4880	if (err != 16) {
4881		(void) printf("invalid input format\n");
4882		exit(1);
4883	}
4884	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
4885	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
4886	if (err != 0) {
4887		(void) printf("decode failed: %u\n", err);
4888		exit(1);
4889	}
4890	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
4891}
4892
4893static boolean_t
4894pool_match(nvlist_t *cfg, char *tgt)
4895{
4896	uint64_t v, guid = strtoull(tgt, NULL, 0);
4897	char *s;
4898
4899	if (guid != 0) {
4900		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
4901			return (v == guid);
4902	} else {
4903		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
4904			return (strcmp(s, tgt) == 0);
4905	}
4906	return (B_FALSE);
4907}
4908
4909static char *
4910find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
4911{
4912	nvlist_t *pools;
4913	nvlist_t *match = NULL;
4914	char *name = NULL;
4915	char *sepp = NULL;
4916	char sep = '\0';
4917	int count = 0;
4918	importargs_t args;
4919
4920	bzero(&args, sizeof (args));
4921	args.paths = dirc;
4922	args.path = dirv;
4923	args.can_be_active = B_TRUE;
4924
4925	if ((sepp = strpbrk(*target, "/@")) != NULL) {
4926		sep = *sepp;
4927		*sepp = '\0';
4928	}
4929
4930	pools = zpool_search_import(g_zfs, &args);
4931
4932	if (pools != NULL) {
4933		nvpair_t *elem = NULL;
4934		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
4935			verify(nvpair_value_nvlist(elem, configp) == 0);
4936			if (pool_match(*configp, *target)) {
4937				count++;
4938				if (match != NULL) {
4939					/* print previously found config */
4940					if (name != NULL) {
4941						(void) printf("%s\n", name);
4942						dump_nvlist(match, 8);
4943						name = NULL;
4944					}
4945					(void) printf("%s\n",
4946					    nvpair_name(elem));
4947					dump_nvlist(*configp, 8);
4948				} else {
4949					match = *configp;
4950					name = nvpair_name(elem);
4951				}
4952			}
4953		}
4954	}
4955	if (count > 1)
4956		(void) fatal("\tMatched %d pools - use pool GUID "
4957		    "instead of pool name or \n"
4958		    "\tpool name part of a dataset name to select pool", count);
4959
4960	if (sepp)
4961		*sepp = sep;
4962	/*
4963	 * If pool GUID was specified for pool id, replace it with pool name
4964	 */
4965	if (name && (strstr(*target, name) != *target)) {
4966		int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
4967
4968		*target = umem_alloc(sz, UMEM_NOFAIL);
4969		(void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
4970	}
4971
4972	*configp = name ? match : NULL;
4973
4974	return (name);
4975}
4976
4977int
4978main(int argc, char **argv)
4979{
4980	int c;
4981	struct rlimit rl = { 1024, 1024 };
4982	spa_t *spa = NULL;
4983	objset_t *os = NULL;
4984	int dump_all = 1;
4985	int verbose = 0;
4986	int error = 0;
4987	char **searchdirs = NULL;
4988	int nsearch = 0;
4989	char *target;
4990	nvlist_t *policy = NULL;
4991	uint64_t max_txg = UINT64_MAX;
4992	int flags = ZFS_IMPORT_MISSING_LOG;
4993	int rewind = ZPOOL_NEVER_REWIND;
4994	char *spa_config_path_env;
4995	boolean_t target_is_spa = B_TRUE;
4996	nvlist_t *cfg = NULL;
4997
4998	(void) setrlimit(RLIMIT_NOFILE, &rl);
4999	(void) enable_extended_FILE_stdio(-1, -1);
5000
5001	dprintf_setup(&argc, argv);
5002
5003	/*
5004	 * If there is an environment variable SPA_CONFIG_PATH it overrides
5005	 * default spa_config_path setting. If -U flag is specified it will
5006	 * override this environment variable settings once again.
5007	 */
5008	spa_config_path_env = getenv("SPA_CONFIG_PATH");
5009	if (spa_config_path_env != NULL)
5010		spa_config_path = spa_config_path_env;
5011
5012	while ((c = getopt(argc, argv,
5013	    "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
5014		switch (c) {
5015		case 'b':
5016		case 'c':
5017		case 'C':
5018		case 'd':
5019		case 'D':
5020		case 'E':
5021		case 'G':
5022		case 'h':
5023		case 'i':
5024		case 'l':
5025		case 'm':
5026		case 'M':
5027		case 'O':
5028		case 'R':
5029		case 's':
5030		case 'S':
5031		case 'u':
5032			dump_opt[c]++;
5033			dump_all = 0;
5034			break;
5035		case 'A':
5036		case 'e':
5037		case 'F':
5038		case 'k':
5039		case 'L':
5040		case 'P':
5041		case 'q':
5042		case 'X':
5043			dump_opt[c]++;
5044			break;
5045		/* NB: Sort single match options below. */
5046		case 'I':
5047			max_inflight = strtoull(optarg, NULL, 0);
5048			if (max_inflight == 0) {
5049				(void) fprintf(stderr, "maximum number "
5050				    "of inflight I/Os must be greater "
5051				    "than 0\n");
5052				usage();
5053			}
5054			break;
5055		case 'o':
5056			error = set_global_var(optarg);
5057			if (error != 0)
5058				usage();
5059			break;
5060		case 'p':
5061			if (searchdirs == NULL) {
5062				searchdirs = umem_alloc(sizeof (char *),
5063				    UMEM_NOFAIL);
5064			} else {
5065				char **tmp = umem_alloc((nsearch + 1) *
5066				    sizeof (char *), UMEM_NOFAIL);
5067				bcopy(searchdirs, tmp, nsearch *
5068				    sizeof (char *));
5069				umem_free(searchdirs,
5070				    nsearch * sizeof (char *));
5071				searchdirs = tmp;
5072			}
5073			searchdirs[nsearch++] = optarg;
5074			break;
5075		case 't':
5076			max_txg = strtoull(optarg, NULL, 0);
5077			if (max_txg < TXG_INITIAL) {
5078				(void) fprintf(stderr, "incorrect txg "
5079				    "specified: %s\n", optarg);
5080				usage();
5081			}
5082			break;
5083		case 'U':
5084			spa_config_path = optarg;
5085			if (spa_config_path[0] != '/') {
5086				(void) fprintf(stderr,
5087				    "cachefile must be an absolute path "
5088				    "(i.e. start with a slash)\n");
5089				usage();
5090			}
5091			break;
5092		case 'v':
5093			verbose++;
5094			break;
5095		case 'V':
5096			flags = ZFS_IMPORT_VERBATIM;
5097			break;
5098		case 'x':
5099			vn_dumpdir = optarg;
5100			break;
5101		default:
5102			usage();
5103			break;
5104		}
5105	}
5106
5107	if (!dump_opt['e'] && searchdirs != NULL) {
5108		(void) fprintf(stderr, "-p option requires use of -e\n");
5109		usage();
5110	}
5111
5112	/*
5113	 * ZDB does not typically re-read blocks; therefore limit the ARC
5114	 * to 256 MB, which can be used entirely for metadata.
5115	 */
5116	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
5117
5118	/*
5119	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
5120	 * "zdb -b" uses traversal prefetch which uses async reads.
5121	 * For good performance, let several of them be active at once.
5122	 */
5123	zfs_vdev_async_read_max_active = 10;
5124
5125	/*
5126	 * Disable reference tracking for better performance.
5127	 */
5128	reference_tracking_enable = B_FALSE;
5129
5130	/*
5131	 * Do not fail spa_load when spa_load_verify fails. This is needed
5132	 * to load non-idle pools.
5133	 */
5134	spa_load_verify_dryrun = B_TRUE;
5135
5136	kernel_init(FREAD);
5137	g_zfs = libzfs_init();
5138	ASSERT(g_zfs != NULL);
5139
5140	if (dump_all)
5141		verbose = MAX(verbose, 1);
5142
5143	for (c = 0; c < 256; c++) {
5144		if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
5145			dump_opt[c] = 1;
5146		if (dump_opt[c])
5147			dump_opt[c] += verbose;
5148	}
5149
5150	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
5151	zfs_recover = (dump_opt['A'] > 1);
5152
5153	argc -= optind;
5154	argv += optind;
5155
5156	if (argc < 2 && dump_opt['R'])
5157		usage();
5158
5159	if (dump_opt['E']) {
5160		if (argc != 1)
5161			usage();
5162		zdb_embedded_block(argv[0]);
5163		return (0);
5164	}
5165
5166	if (argc < 1) {
5167		if (!dump_opt['e'] && dump_opt['C']) {
5168			dump_cachefile(spa_config_path);
5169			return (0);
5170		}
5171		usage();
5172	}
5173
5174	if (dump_opt['l'])
5175		return (dump_label(argv[0]));
5176
5177	if (dump_opt['O']) {
5178		if (argc != 2)
5179			usage();
5180		dump_opt['v'] = verbose + 3;
5181		return (dump_path(argv[0], argv[1]));
5182	}
5183
5184	if (dump_opt['X'] || dump_opt['F'])
5185		rewind = ZPOOL_DO_REWIND |
5186		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
5187
5188	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
5189	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
5190	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
5191		fatal("internal error: %s", strerror(ENOMEM));
5192
5193	error = 0;
5194	target = argv[0];
5195
5196	if (dump_opt['e']) {
5197		char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
5198
5199		error = ENOENT;
5200		if (name) {
5201			if (dump_opt['C'] > 1) {
5202				(void) printf("\nConfiguration for import:\n");
5203				dump_nvlist(cfg, 8);
5204			}
5205
5206			if (nvlist_add_nvlist(cfg,
5207			    ZPOOL_LOAD_POLICY, policy) != 0) {
5208				fatal("can't open '%s': %s",
5209				    target, strerror(ENOMEM));
5210			}
5211			error = spa_import(name, cfg, NULL, flags);
5212		}
5213	}
5214
5215	char *checkpoint_pool = NULL;
5216	char *checkpoint_target = NULL;
5217	if (dump_opt['k']) {
5218		checkpoint_pool = import_checkpointed_state(target, cfg,
5219		    &checkpoint_target);
5220
5221		if (checkpoint_target != NULL)
5222			target = checkpoint_target;
5223
5224	}
5225
5226	if (strpbrk(target, "/@") != NULL) {
5227		size_t targetlen;
5228
5229		target_is_spa = B_FALSE;
5230		/*
5231		 * Remove any trailing slash.  Later code would get confused
5232		 * by it, but we want to allow it so that "pool/" can
5233		 * indicate that we want to dump the topmost filesystem,
5234		 * rather than the whole pool.
5235		 */
5236		targetlen = strlen(target);
5237		if (targetlen != 0 && target[targetlen - 1] == '/')
5238			target[targetlen - 1] = '\0';
5239	}
5240
5241	if (error == 0) {
5242		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
5243			ASSERT(checkpoint_pool != NULL);
5244			ASSERT(checkpoint_target == NULL);
5245
5246			error = spa_open(checkpoint_pool, &spa, FTAG);
5247			if (error != 0) {
5248				fatal("Tried to open pool \"%s\" but "
5249				    "spa_open() failed with error %d\n",
5250				    checkpoint_pool, error);
5251			}
5252
5253		} else if (target_is_spa || dump_opt['R']) {
5254			error = spa_open_rewind(target, &spa, FTAG, policy,
5255			    NULL);
5256			if (error) {
5257				/*
5258				 * If we're missing the log device then
5259				 * try opening the pool after clearing the
5260				 * log state.
5261				 */
5262				mutex_enter(&spa_namespace_lock);
5263				if ((spa = spa_lookup(target)) != NULL &&
5264				    spa->spa_log_state == SPA_LOG_MISSING) {
5265					spa->spa_log_state = SPA_LOG_CLEAR;
5266					error = 0;
5267				}
5268				mutex_exit(&spa_namespace_lock);
5269
5270				if (!error) {
5271					error = spa_open_rewind(target, &spa,
5272					    FTAG, policy, NULL);
5273				}
5274			}
5275		} else {
5276			error = open_objset(target, DMU_OST_ANY, FTAG, &os);
5277		}
5278	}
5279	nvlist_free(policy);
5280
5281	if (error)
5282		fatal("can't open '%s': %s", target, strerror(error));
5283
5284	argv++;
5285	argc--;
5286	if (!dump_opt['R']) {
5287		if (argc > 0) {
5288			zopt_objects = argc;
5289			zopt_object = calloc(zopt_objects, sizeof (uint64_t));
5290			for (unsigned i = 0; i < zopt_objects; i++) {
5291				errno = 0;
5292				zopt_object[i] = strtoull(argv[i], NULL, 0);
5293				if (zopt_object[i] == 0 && errno != 0)
5294					fatal("bad number %s: %s",
5295					    argv[i], strerror(errno));
5296			}
5297		}
5298		if (os != NULL) {
5299			dump_dir(os);
5300		} else if (zopt_objects > 0 && !dump_opt['m']) {
5301			dump_dir(spa->spa_meta_objset);
5302		} else {
5303			dump_zpool(spa);
5304		}
5305	} else {
5306		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
5307		flagbits['c'] = ZDB_FLAG_CHECKSUM;
5308		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
5309		flagbits['e'] = ZDB_FLAG_BSWAP;
5310		flagbits['g'] = ZDB_FLAG_GBH;
5311		flagbits['i'] = ZDB_FLAG_INDIRECT;
5312		flagbits['p'] = ZDB_FLAG_PHYS;
5313		flagbits['r'] = ZDB_FLAG_RAW;
5314
5315		for (int i = 0; i < argc; i++)
5316			zdb_read_block(argv[i], spa);
5317	}
5318
5319	if (dump_opt['k']) {
5320		free(checkpoint_pool);
5321		if (!target_is_spa)
5322			free(checkpoint_target);
5323	}
5324
5325	if (os != NULL)
5326		close_objset(os, FTAG);
5327	else
5328		spa_close(spa, FTAG);
5329
5330	fuid_table_destroy();
5331
5332	dump_debug_buffer();
5333
5334	libzfs_fini(g_zfs);
5335	kernel_fini();
5336
5337	return (0);
5338}
5339