1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc.
27 * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
28 * Copyright 2017 RackTop Systems.
29 */
30
31#include <stdio.h>
32#include <unistd.h>
33#include <stdio_ext.h>
34#include <stdlib.h>
35#include <ctype.h>
36#include <sys/zfs_context.h>
37#include <sys/spa.h>
38#include <sys/spa_impl.h>
39#include <sys/dmu.h>
40#include <sys/zap.h>
41#include <sys/fs/zfs.h>
42#include <sys/zfs_znode.h>
43#include <sys/zfs_sa.h>
44#include <sys/sa.h>
45#include <sys/sa_impl.h>
46#include <sys/vdev.h>
47#include <sys/vdev_impl.h>
48#include <sys/metaslab_impl.h>
49#include <sys/dmu_objset.h>
50#include <sys/dsl_dir.h>
51#include <sys/dsl_dataset.h>
52#include <sys/dsl_pool.h>
53#include <sys/dbuf.h>
54#include <sys/zil.h>
55#include <sys/zil_impl.h>
56#include <sys/stat.h>
57#include <sys/resource.h>
58#include <sys/dmu_traverse.h>
59#include <sys/zio_checksum.h>
60#include <sys/zio_compress.h>
61#include <sys/zfs_fuid.h>
62#include <sys/arc.h>
63#include <sys/ddt.h>
64#include <sys/zfeature.h>
65#include <sys/abd.h>
66#include <sys/blkptr.h>
67#include <sys/dsl_scan.h>
68#include <sys/dsl_crypt.h>
69#include <zfs_comutil.h>
70#include <libcmdutils.h>
71#undef verify
72#include <libzfs.h>
73
74#include <libnvpair.h>
75#include <libzutil.h>
76
77#include "zdb.h"
78
79#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
80	zio_compress_table[(idx)].ci_name : "UNKNOWN")
81#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
82	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
83#define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ?	\
84	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ?	\
85	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
86#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
87	(idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ?	\
88	DMU_OT_ZAP_OTHER : \
89	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
90	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)
91
92extern int reference_tracking_enable;
93extern boolean_t zfs_recover;
94extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
95extern int zfs_vdev_async_read_max_active;
96extern int aok;
97extern boolean_t spa_load_verify_dryrun;
98extern int zfs_btree_verify_intensity;
99
100static const char cmdname[] = "zdb";
101uint8_t dump_opt[256];
102
103typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
104
105uint64_t *zopt_object = NULL;
106static unsigned zopt_objects = 0;
107uint64_t max_inflight = 1000;
108static int leaked_objects = 0;
109
110static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
111static void mos_obj_refd(uint64_t);
112
113/*
114 * These libumem hooks provide a reasonable set of defaults for the allocator's
115 * debugging facilities.
116 */
117const char *
118_umem_debug_init()
119{
120	return ("default,verbose"); /* $UMEM_DEBUG setting */
121}
122
123const char *
124_umem_logging_init(void)
125{
126	return ("fail,contents"); /* $UMEM_LOGGING setting */
127}
128
129static void
130usage(void)
131{
132	(void) fprintf(stderr,
133	    "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
134	    "[-I <inflight I/Os>]\n"
135	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
136	    "\t\t[<poolname> [<object> ...]]\n"
137	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset> "
138	    "[<object> ...]\n"
139	    "\t%s -C [-A] [-U <cache>]\n"
140	    "\t%s -l [-Aqu] <device>\n"
141	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
142	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
143	    "\t%s -O <dataset> <path>\n"
144	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
145	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
146	    "\t%s -E [-A] word0:word1:...:word15\n"
147	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
148	    "<poolname>\n\n",
149	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
150	    cmdname, cmdname);
151
152	(void) fprintf(stderr, "    Dataset name must include at least one "
153	    "separator character '/' or '@'\n");
154	(void) fprintf(stderr, "    If dataset name is specified, only that "
155	    "dataset is dumped\n");
156	(void) fprintf(stderr, "    If object numbers are specified, only "
157	    "those objects are dumped\n\n");
158	(void) fprintf(stderr, "    Options to control amount of output:\n");
159	(void) fprintf(stderr, "        -b block statistics\n");
160	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
161	    "all data) blocks\n");
162	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
163	(void) fprintf(stderr, "        -d dataset(s)\n");
164	(void) fprintf(stderr, "        -D dedup statistics\n");
165	(void) fprintf(stderr, "        -E decode and display block from an "
166	    "embedded block pointer\n");
167	(void) fprintf(stderr, "        -h pool history\n");
168	(void) fprintf(stderr, "        -i intent logs\n");
169	(void) fprintf(stderr, "        -l read label contents\n");
170	(void) fprintf(stderr, "        -k examine the checkpointed state "
171	    "of the pool\n");
172	(void) fprintf(stderr, "        -L disable leak tracking (do not "
173	    "load spacemaps)\n");
174	(void) fprintf(stderr, "        -m metaslabs\n");
175	(void) fprintf(stderr, "        -M metaslab groups\n");
176	(void) fprintf(stderr, "        -O perform object lookups by path\n");
177	(void) fprintf(stderr, "        -R read and display block from a "
178	    "device\n");
179	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
180	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
181	(void) fprintf(stderr, "        -v verbose (applies to all "
182	    "others)\n\n");
183	(void) fprintf(stderr, "    Below options are intended for use "
184	    "with other options:\n");
185	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
186	    "panic recovery (-AA) or both (-AAA)\n");
187	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
188	    "has altroot/not in a cachefile\n");
189	(void) fprintf(stderr, "        -F attempt automatic rewind within "
190	    "safe range of transaction groups\n");
191	(void) fprintf(stderr, "        -G dump zfs_dbgmsg buffer before "
192	    "exiting\n");
193	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
194	    "specify the maximum number of "
195	    "checksumming I/Os [default is 200]\n");
196	(void) fprintf(stderr, "        -o <variable>=<value> set global "
197	    "variable to an unsigned 32-bit integer value\n");
198	(void) fprintf(stderr, "        -p <path> -- use one or more with "
199	    "-e to specify path to vdev dir\n");
200	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
201	(void) fprintf(stderr, "        -q don't print label contents\n");
202	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
203	    "searching for uberblocks\n");
204	(void) fprintf(stderr, "        -u uberblock\n");
205	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
206	    "cachefile\n");
207	(void) fprintf(stderr, "        -V do verbatim import\n");
208	(void) fprintf(stderr, "        -x <dumpdir> -- "
209	    "dump all read blocks into specified directory\n");
210	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
211	    "work with dataset)\n\n");
212	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
213	    "to make only that option verbose\n");
214	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
215	exit(1);
216}
217
218static void
219dump_debug_buffer()
220{
221	if (dump_opt['G']) {
222		(void) printf("\n");
223		zfs_dbgmsg_print("zdb");
224	}
225}
226
227/*
228 * Called for usage errors that are discovered after a call to spa_open(),
229 * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
230 */
231
232static void
233fatal(const char *fmt, ...)
234{
235	va_list ap;
236
237	va_start(ap, fmt);
238	(void) fprintf(stderr, "%s: ", cmdname);
239	(void) vfprintf(stderr, fmt, ap);
240	va_end(ap);
241	(void) fprintf(stderr, "\n");
242
243	dump_debug_buffer();
244
245	exit(1);
246}
247
248/* ARGSUSED */
249static void
250dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
251{
252	nvlist_t *nv;
253	size_t nvsize = *(uint64_t *)data;
254	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
255
256	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
257
258	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
259
260	umem_free(packed, nvsize);
261
262	dump_nvlist(nv, 8);
263
264	nvlist_free(nv);
265}
266
267/* ARGSUSED */
268static void
269dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
270{
271	spa_history_phys_t *shp = data;
272
273	if (shp == NULL)
274		return;
275
276	(void) printf("\t\tpool_create_len = %llu\n",
277	    (u_longlong_t)shp->sh_pool_create_len);
278	(void) printf("\t\tphys_max_off = %llu\n",
279	    (u_longlong_t)shp->sh_phys_max_off);
280	(void) printf("\t\tbof = %llu\n",
281	    (u_longlong_t)shp->sh_bof);
282	(void) printf("\t\teof = %llu\n",
283	    (u_longlong_t)shp->sh_eof);
284	(void) printf("\t\trecords_lost = %llu\n",
285	    (u_longlong_t)shp->sh_records_lost);
286}
287
288static void
289zdb_nicenum(uint64_t num, char *buf, size_t buflen)
290{
291	if (dump_opt['P'])
292		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
293	else
294		nicenum(num, buf, sizeof (buf));
295}
296
297static const char histo_stars[] = "****************************************";
298static const uint64_t histo_width = sizeof (histo_stars) - 1;
299
300static void
301dump_histogram(const uint64_t *histo, int size, int offset)
302{
303	int i;
304	int minidx = size - 1;
305	int maxidx = 0;
306	uint64_t max = 0;
307
308	for (i = 0; i < size; i++) {
309		if (histo[i] > max)
310			max = histo[i];
311		if (histo[i] > 0 && i > maxidx)
312			maxidx = i;
313		if (histo[i] > 0 && i < minidx)
314			minidx = i;
315	}
316
317	if (max < histo_width)
318		max = histo_width;
319
320	for (i = minidx; i <= maxidx; i++) {
321		(void) printf("\t\t\t%3u: %6llu %s\n",
322		    i + offset, (u_longlong_t)histo[i],
323		    &histo_stars[(max - histo[i]) * histo_width / max]);
324	}
325}
326
327static void
328dump_zap_stats(objset_t *os, uint64_t object)
329{
330	int error;
331	zap_stats_t zs;
332
333	error = zap_get_stats(os, object, &zs);
334	if (error)
335		return;
336
337	if (zs.zs_ptrtbl_len == 0) {
338		ASSERT(zs.zs_num_blocks == 1);
339		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
340		    (u_longlong_t)zs.zs_blocksize,
341		    (u_longlong_t)zs.zs_num_entries);
342		return;
343	}
344
345	(void) printf("\tFat ZAP stats:\n");
346
347	(void) printf("\t\tPointer table:\n");
348	(void) printf("\t\t\t%llu elements\n",
349	    (u_longlong_t)zs.zs_ptrtbl_len);
350	(void) printf("\t\t\tzt_blk: %llu\n",
351	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
352	(void) printf("\t\t\tzt_numblks: %llu\n",
353	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
354	(void) printf("\t\t\tzt_shift: %llu\n",
355	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
356	(void) printf("\t\t\tzt_blks_copied: %llu\n",
357	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
358	(void) printf("\t\t\tzt_nextblk: %llu\n",
359	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
360
361	(void) printf("\t\tZAP entries: %llu\n",
362	    (u_longlong_t)zs.zs_num_entries);
363	(void) printf("\t\tLeaf blocks: %llu\n",
364	    (u_longlong_t)zs.zs_num_leafs);
365	(void) printf("\t\tTotal blocks: %llu\n",
366	    (u_longlong_t)zs.zs_num_blocks);
367	(void) printf("\t\tzap_block_type: 0x%llx\n",
368	    (u_longlong_t)zs.zs_block_type);
369	(void) printf("\t\tzap_magic: 0x%llx\n",
370	    (u_longlong_t)zs.zs_magic);
371	(void) printf("\t\tzap_salt: 0x%llx\n",
372	    (u_longlong_t)zs.zs_salt);
373
374	(void) printf("\t\tLeafs with 2^n pointers:\n");
375	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
376
377	(void) printf("\t\tBlocks with n*5 entries:\n");
378	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
379
380	(void) printf("\t\tBlocks n/10 full:\n");
381	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
382
383	(void) printf("\t\tEntries with n chunks:\n");
384	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
385
386	(void) printf("\t\tBuckets with n entries:\n");
387	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
388}
389
390/*ARGSUSED*/
391static void
392dump_none(objset_t *os, uint64_t object, void *data, size_t size)
393{
394}
395
396/*ARGSUSED*/
397static void
398dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
399{
400	(void) printf("\tUNKNOWN OBJECT TYPE\n");
401}
402
403/*ARGSUSED*/
404static void
405dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
406{
407}
408
409/*ARGSUSED*/
410static void
411dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
412{
413}
414
415/*ARGSUSED*/
416static void
417dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
418{
419	zap_cursor_t zc;
420	zap_attribute_t attr;
421	void *prop;
422	unsigned i;
423
424	dump_zap_stats(os, object);
425	(void) printf("\n");
426
427	for (zap_cursor_init(&zc, os, object);
428	    zap_cursor_retrieve(&zc, &attr) == 0;
429	    zap_cursor_advance(&zc)) {
430		(void) printf("\t\t%s = ", attr.za_name);
431		if (attr.za_num_integers == 0) {
432			(void) printf("\n");
433			continue;
434		}
435		prop = umem_zalloc(attr.za_num_integers *
436		    attr.za_integer_length, UMEM_NOFAIL);
437		(void) zap_lookup(os, object, attr.za_name,
438		    attr.za_integer_length, attr.za_num_integers, prop);
439		if (attr.za_integer_length == 1) {
440			if (strcmp(attr.za_name,
441			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
442			    strcmp(attr.za_name,
443			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
444			    strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 ||
445			    strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
446			    strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) {
447				uint8_t *u8 = prop;
448
449				for (i = 0; i < attr.za_num_integers; i++) {
450					(void) printf("%02x", u8[i]);
451				}
452			} else {
453				(void) printf("%s", (char *)prop);
454			}
455		} else {
456			for (i = 0; i < attr.za_num_integers; i++) {
457				switch (attr.za_integer_length) {
458				case 2:
459					(void) printf("%u ",
460					    ((uint16_t *)prop)[i]);
461					break;
462				case 4:
463					(void) printf("%u ",
464					    ((uint32_t *)prop)[i]);
465					break;
466				case 8:
467					(void) printf("%lld ",
468					    (u_longlong_t)((int64_t *)prop)[i]);
469					break;
470				}
471			}
472		}
473		(void) printf("\n");
474		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
475	}
476	zap_cursor_fini(&zc);
477}
478
479static void
480dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
481{
482	bpobj_phys_t *bpop = data;
483	char bytes[32], comp[32], uncomp[32];
484
485	/* make sure the output won't get truncated */
486	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
487	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
488	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
489
490	if (bpop == NULL)
491		return;
492
493	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
494	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
495	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
496
497	(void) printf("\t\tnum_blkptrs = %llu\n",
498	    (u_longlong_t)bpop->bpo_num_blkptrs);
499	(void) printf("\t\tbytes = %s\n", bytes);
500	if (size >= BPOBJ_SIZE_V1) {
501		(void) printf("\t\tcomp = %s\n", comp);
502		(void) printf("\t\tuncomp = %s\n", uncomp);
503	}
504	if (size >= sizeof (*bpop)) {
505		(void) printf("\t\tsubobjs = %llu\n",
506		    (u_longlong_t)bpop->bpo_subobjs);
507		(void) printf("\t\tnum_subobjs = %llu\n",
508		    (u_longlong_t)bpop->bpo_num_subobjs);
509	}
510
511	if (dump_opt['d'] < 5)
512		return;
513
514	for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) {
515		char blkbuf[BP_SPRINTF_LEN];
516		blkptr_t bp;
517
518		int err = dmu_read(os, object,
519		    i * sizeof (bp), sizeof (bp), &bp, 0);
520		if (err != 0) {
521			(void) printf("got error %u from dmu_read\n", err);
522			break;
523		}
524		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
525		(void) printf("\t%s\n", blkbuf);
526	}
527}
528
529/* ARGSUSED */
530static void
531dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
532{
533	dmu_object_info_t doi;
534
535	VERIFY0(dmu_object_info(os, object, &doi));
536	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
537
538	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
539	if (err != 0) {
540		(void) printf("got error %u from dmu_read\n", err);
541		kmem_free(subobjs, doi.doi_max_offset);
542		return;
543	}
544
545	int64_t last_nonzero = -1;
546	for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) {
547		if (subobjs[i] != 0)
548			last_nonzero = i;
549	}
550
551	for (int64_t i = 0; i <= last_nonzero; i++) {
552		(void) printf("\t%llu\n", (longlong_t)subobjs[i]);
553	}
554	kmem_free(subobjs, doi.doi_max_offset);
555}
556
557/*ARGSUSED*/
558static void
559dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
560{
561	dump_zap_stats(os, object);
562	/* contents are printed elsewhere, properly decoded */
563}
564
565/*ARGSUSED*/
566static void
567dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
568{
569	zap_cursor_t zc;
570	zap_attribute_t attr;
571
572	dump_zap_stats(os, object);
573	(void) printf("\n");
574
575	for (zap_cursor_init(&zc, os, object);
576	    zap_cursor_retrieve(&zc, &attr) == 0;
577	    zap_cursor_advance(&zc)) {
578		(void) printf("\t\t%s = ", attr.za_name);
579		if (attr.za_num_integers == 0) {
580			(void) printf("\n");
581			continue;
582		}
583		(void) printf(" %llx : [%d:%d:%d]\n",
584		    (u_longlong_t)attr.za_first_integer,
585		    (int)ATTR_LENGTH(attr.za_first_integer),
586		    (int)ATTR_BSWAP(attr.za_first_integer),
587		    (int)ATTR_NUM(attr.za_first_integer));
588	}
589	zap_cursor_fini(&zc);
590}
591
592/*ARGSUSED*/
593static void
594dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
595{
596	zap_cursor_t zc;
597	zap_attribute_t attr;
598	uint16_t *layout_attrs;
599	unsigned i;
600
601	dump_zap_stats(os, object);
602	(void) printf("\n");
603
604	for (zap_cursor_init(&zc, os, object);
605	    zap_cursor_retrieve(&zc, &attr) == 0;
606	    zap_cursor_advance(&zc)) {
607		(void) printf("\t\t%s = [", attr.za_name);
608		if (attr.za_num_integers == 0) {
609			(void) printf("\n");
610			continue;
611		}
612
613		VERIFY(attr.za_integer_length == 2);
614		layout_attrs = umem_zalloc(attr.za_num_integers *
615		    attr.za_integer_length, UMEM_NOFAIL);
616
617		VERIFY(zap_lookup(os, object, attr.za_name,
618		    attr.za_integer_length,
619		    attr.za_num_integers, layout_attrs) == 0);
620
621		for (i = 0; i != attr.za_num_integers; i++)
622			(void) printf(" %d ", (int)layout_attrs[i]);
623		(void) printf("]\n");
624		umem_free(layout_attrs,
625		    attr.za_num_integers * attr.za_integer_length);
626	}
627	zap_cursor_fini(&zc);
628}
629
630/*ARGSUSED*/
631static void
632dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
633{
634	zap_cursor_t zc;
635	zap_attribute_t attr;
636	const char *typenames[] = {
637		/* 0 */ "not specified",
638		/* 1 */ "FIFO",
639		/* 2 */ "Character Device",
640		/* 3 */ "3 (invalid)",
641		/* 4 */ "Directory",
642		/* 5 */ "5 (invalid)",
643		/* 6 */ "Block Device",
644		/* 7 */ "7 (invalid)",
645		/* 8 */ "Regular File",
646		/* 9 */ "9 (invalid)",
647		/* 10 */ "Symbolic Link",
648		/* 11 */ "11 (invalid)",
649		/* 12 */ "Socket",
650		/* 13 */ "Door",
651		/* 14 */ "Event Port",
652		/* 15 */ "15 (invalid)",
653	};
654
655	dump_zap_stats(os, object);
656	(void) printf("\n");
657
658	for (zap_cursor_init(&zc, os, object);
659	    zap_cursor_retrieve(&zc, &attr) == 0;
660	    zap_cursor_advance(&zc)) {
661		(void) printf("\t\t%s = %lld (type: %s)\n",
662		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
663		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
664	}
665	zap_cursor_fini(&zc);
666}
667
668static int
669get_dtl_refcount(vdev_t *vd)
670{
671	int refcount = 0;
672
673	if (vd->vdev_ops->vdev_op_leaf) {
674		space_map_t *sm = vd->vdev_dtl_sm;
675
676		if (sm != NULL &&
677		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
678			return (1);
679		return (0);
680	}
681
682	for (unsigned c = 0; c < vd->vdev_children; c++)
683		refcount += get_dtl_refcount(vd->vdev_child[c]);
684	return (refcount);
685}
686
687static int
688get_metaslab_refcount(vdev_t *vd)
689{
690	int refcount = 0;
691
692	if (vd->vdev_top == vd) {
693		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
694			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
695
696			if (sm != NULL &&
697			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
698				refcount++;
699		}
700	}
701	for (unsigned c = 0; c < vd->vdev_children; c++)
702		refcount += get_metaslab_refcount(vd->vdev_child[c]);
703
704	return (refcount);
705}
706
707static int
708get_obsolete_refcount(vdev_t *vd)
709{
710	int refcount = 0;
711
712	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
713	if (vd->vdev_top == vd && obsolete_sm_obj != 0) {
714		dmu_object_info_t doi;
715		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
716		    obsolete_sm_obj, &doi));
717		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
718			refcount++;
719		}
720	} else {
721		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
722		ASSERT3U(obsolete_sm_obj, ==, 0);
723	}
724	for (unsigned c = 0; c < vd->vdev_children; c++) {
725		refcount += get_obsolete_refcount(vd->vdev_child[c]);
726	}
727
728	return (refcount);
729}
730
731static int
732get_prev_obsolete_spacemap_refcount(spa_t *spa)
733{
734	uint64_t prev_obj =
735	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
736	if (prev_obj != 0) {
737		dmu_object_info_t doi;
738		VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
739		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
740			return (1);
741		}
742	}
743	return (0);
744}
745
746static int
747get_checkpoint_refcount(vdev_t *vd)
748{
749	int refcount = 0;
750
751	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
752	    zap_contains(spa_meta_objset(vd->vdev_spa),
753	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
754		refcount++;
755
756	for (uint64_t c = 0; c < vd->vdev_children; c++)
757		refcount += get_checkpoint_refcount(vd->vdev_child[c]);
758
759	return (refcount);
760}
761
762static int
763get_log_spacemap_refcount(spa_t *spa)
764{
765	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
766}
767
768static int
769verify_spacemap_refcounts(spa_t *spa)
770{
771	uint64_t expected_refcount = 0;
772	uint64_t actual_refcount;
773
774	(void) feature_get_refcount(spa,
775	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
776	    &expected_refcount);
777	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
778	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
779	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
780	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
781	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
782	actual_refcount += get_log_spacemap_refcount(spa);
783
784	if (expected_refcount != actual_refcount) {
785		(void) printf("space map refcount mismatch: expected %lld != "
786		    "actual %lld\n",
787		    (longlong_t)expected_refcount,
788		    (longlong_t)actual_refcount);
789		return (2);
790	}
791	return (0);
792}
793
794static void
795dump_spacemap(objset_t *os, space_map_t *sm)
796{
797	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
798	    "INVALID", "INVALID", "INVALID", "INVALID" };
799
800	if (sm == NULL)
801		return;
802
803	(void) printf("space map object %llu:\n",
804	    (longlong_t)sm->sm_object);
805	(void) printf("  smp_length = 0x%llx\n",
806	    (longlong_t)sm->sm_phys->smp_length);
807	(void) printf("  smp_alloc = 0x%llx\n",
808	    (longlong_t)sm->sm_phys->smp_alloc);
809
810	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
811		return;
812
813	/*
814	 * Print out the freelist entries in both encoded and decoded form.
815	 */
816	uint8_t mapshift = sm->sm_shift;
817	int64_t alloc = 0;
818	uint64_t word, entry_id = 0;
819	for (uint64_t offset = 0; offset < space_map_length(sm);
820	    offset += sizeof (word)) {
821
822		VERIFY0(dmu_read(os, space_map_object(sm), offset,
823		    sizeof (word), &word, DMU_READ_PREFETCH));
824
825		if (sm_entry_is_debug(word)) {
826			(void) printf("\t    [%6llu] %s: txg %llu pass %llu\n",
827			    (u_longlong_t)entry_id,
828			    ddata[SM_DEBUG_ACTION_DECODE(word)],
829			    (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
830			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
831			entry_id++;
832			continue;
833		}
834
835		uint8_t words;
836		char entry_type;
837		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
838
839		if (sm_entry_is_single_word(word)) {
840			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
841			    'A' : 'F';
842			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
843			    sm->sm_start;
844			entry_run = SM_RUN_DECODE(word) << mapshift;
845			words = 1;
846		} else {
847			/* it is a two-word entry so we read another word */
848			ASSERT(sm_entry_is_double_word(word));
849
850			uint64_t extra_word;
851			offset += sizeof (extra_word);
852			VERIFY0(dmu_read(os, space_map_object(sm), offset,
853			    sizeof (extra_word), &extra_word,
854			    DMU_READ_PREFETCH));
855
856			ASSERT3U(offset, <=, space_map_length(sm));
857
858			entry_run = SM2_RUN_DECODE(word) << mapshift;
859			entry_vdev = SM2_VDEV_DECODE(word);
860			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
861			    'A' : 'F';
862			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
863			    mapshift) + sm->sm_start;
864			words = 2;
865		}
866
867		(void) printf("\t    [%6llu]    %c  range:"
868		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
869		    (u_longlong_t)entry_id,
870		    entry_type, (u_longlong_t)entry_off,
871		    (u_longlong_t)(entry_off + entry_run),
872		    (u_longlong_t)entry_run,
873		    (u_longlong_t)entry_vdev, words);
874
875		if (entry_type == 'A')
876			alloc += entry_run;
877		else
878			alloc -= entry_run;
879		entry_id++;
880	}
881	if (alloc != space_map_allocated(sm)) {
882		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
883		    "with space map summary (%lld)\n",
884		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
885	}
886}
887
888static void
889dump_metaslab_stats(metaslab_t *msp)
890{
891	char maxbuf[32];
892	range_tree_t *rt = msp->ms_allocatable;
893	zfs_btree_t *t = &msp->ms_allocatable_by_size;
894	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
895
896	/* max sure nicenum has enough space */
897	CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
898
899	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
900
901	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
902	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
903	    "freepct", free_pct);
904	(void) printf("\tIn-memory histogram:\n");
905	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
906}
907
908static void
909dump_metaslab(metaslab_t *msp)
910{
911	vdev_t *vd = msp->ms_group->mg_vd;
912	spa_t *spa = vd->vdev_spa;
913	space_map_t *sm = msp->ms_sm;
914	char freebuf[32];
915
916	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
917	    sizeof (freebuf));
918
919	(void) printf(
920	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
921	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
922	    (u_longlong_t)space_map_object(sm), freebuf);
923
924	if (dump_opt['m'] > 2 && !dump_opt['L']) {
925		mutex_enter(&msp->ms_lock);
926		VERIFY0(metaslab_load(msp));
927		range_tree_stat_verify(msp->ms_allocatable);
928		dump_metaslab_stats(msp);
929		metaslab_unload(msp);
930		mutex_exit(&msp->ms_lock);
931	}
932
933	if (dump_opt['m'] > 1 && sm != NULL &&
934	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
935		/*
936		 * The space map histogram represents free space in chunks
937		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
938		 */
939		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
940		    (u_longlong_t)msp->ms_fragmentation);
941		dump_histogram(sm->sm_phys->smp_histogram,
942		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
943	}
944
945	ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
946	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
947
948	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
949		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
950		    (u_longlong_t)metaslab_unflushed_txg(msp));
951	}
952}
953
954static void
955print_vdev_metaslab_header(vdev_t *vd)
956{
957	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
958	const char *bias_str = "";
959
960	if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
961		bias_str = VDEV_ALLOC_BIAS_LOG;
962	} else if (alloc_bias == VDEV_BIAS_SPECIAL) {
963		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
964	} else if (alloc_bias == VDEV_BIAS_DEDUP) {
965		bias_str = VDEV_ALLOC_BIAS_DEDUP;
966	}
967
968	uint64_t ms_flush_data_obj = 0;
969	if (vd->vdev_top_zap != 0) {
970		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
971		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
972		    sizeof (uint64_t), 1, &ms_flush_data_obj);
973		if (error != ENOENT) {
974			ASSERT0(error);
975		}
976	}
977
978	(void) printf("\tvdev %10llu   %s",
979	    (u_longlong_t)vd->vdev_id, bias_str);
980
981	if (ms_flush_data_obj != 0) {
982		(void) printf("   ms_unflushed_phys object %llu",
983		    (u_longlong_t)ms_flush_data_obj);
984	}
985
986	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
987	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
988	    "offset", "spacemap", "free");
989	(void) printf("\t%15s   %19s   %15s   %12s\n",
990	    "---------------", "-------------------",
991	    "---------------", "------------");
992}
993
994static void
995dump_metaslab_groups(spa_t *spa)
996{
997	vdev_t *rvd = spa->spa_root_vdev;
998	metaslab_class_t *mc = spa_normal_class(spa);
999	uint64_t fragmentation;
1000
1001	metaslab_class_histogram_verify(mc);
1002
1003	for (unsigned c = 0; c < rvd->vdev_children; c++) {
1004		vdev_t *tvd = rvd->vdev_child[c];
1005		metaslab_group_t *mg = tvd->vdev_mg;
1006
1007		if (mg == NULL || mg->mg_class != mc)
1008			continue;
1009
1010		metaslab_group_histogram_verify(mg);
1011		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
1012
1013		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
1014		    "fragmentation",
1015		    (u_longlong_t)tvd->vdev_id,
1016		    (u_longlong_t)tvd->vdev_ms_count);
1017		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
1018			(void) printf("%3s\n", "-");
1019		} else {
1020			(void) printf("%3llu%%\n",
1021			    (u_longlong_t)mg->mg_fragmentation);
1022		}
1023		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
1024	}
1025
1026	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
1027	fragmentation = metaslab_class_fragmentation(mc);
1028	if (fragmentation == ZFS_FRAG_INVALID)
1029		(void) printf("\t%3s\n", "-");
1030	else
1031		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
1032	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
1033}
1034
1035static void
1036print_vdev_indirect(vdev_t *vd)
1037{
1038	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
1039	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
1040	vdev_indirect_births_t *vib = vd->vdev_indirect_births;
1041
1042	if (vim == NULL) {
1043		ASSERT3P(vib, ==, NULL);
1044		return;
1045	}
1046
1047	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
1048	    vic->vic_mapping_object);
1049	ASSERT3U(vdev_indirect_births_object(vib), ==,
1050	    vic->vic_births_object);
1051
1052	(void) printf("indirect births obj %llu:\n",
1053	    (longlong_t)vic->vic_births_object);
1054	(void) printf("    vib_count = %llu\n",
1055	    (longlong_t)vdev_indirect_births_count(vib));
1056	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
1057		vdev_indirect_birth_entry_phys_t *cur_vibe =
1058		    &vib->vib_entries[i];
1059		(void) printf("\toffset %llx -> txg %llu\n",
1060		    (longlong_t)cur_vibe->vibe_offset,
1061		    (longlong_t)cur_vibe->vibe_phys_birth_txg);
1062	}
1063	(void) printf("\n");
1064
1065	(void) printf("indirect mapping obj %llu:\n",
1066	    (longlong_t)vic->vic_mapping_object);
1067	(void) printf("    vim_max_offset = 0x%llx\n",
1068	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
1069	(void) printf("    vim_bytes_mapped = 0x%llx\n",
1070	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
1071	(void) printf("    vim_count = %llu\n",
1072	    (longlong_t)vdev_indirect_mapping_num_entries(vim));
1073
1074	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
1075		return;
1076
1077	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
1078
1079	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
1080		vdev_indirect_mapping_entry_phys_t *vimep =
1081		    &vim->vim_entries[i];
1082		(void) printf("\t<%llx:%llx:%llx> -> "
1083		    "<%llx:%llx:%llx> (%x obsolete)\n",
1084		    (longlong_t)vd->vdev_id,
1085		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
1086		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
1087		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
1088		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
1089		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
1090		    counts[i]);
1091	}
1092	(void) printf("\n");
1093
1094	uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
1095	if (obsolete_sm_object != 0) {
1096		objset_t *mos = vd->vdev_spa->spa_meta_objset;
1097		(void) printf("obsolete space map object %llu:\n",
1098		    (u_longlong_t)obsolete_sm_object);
1099		ASSERT(vd->vdev_obsolete_sm != NULL);
1100		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
1101		    obsolete_sm_object);
1102		dump_spacemap(mos, vd->vdev_obsolete_sm);
1103		(void) printf("\n");
1104	}
1105}
1106
1107static void
1108dump_metaslabs(spa_t *spa)
1109{
1110	vdev_t *vd, *rvd = spa->spa_root_vdev;
1111	uint64_t m, c = 0, children = rvd->vdev_children;
1112
1113	(void) printf("\nMetaslabs:\n");
1114
1115	if (!dump_opt['d'] && zopt_objects > 0) {
1116		c = zopt_object[0];
1117
1118		if (c >= children)
1119			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
1120
1121		if (zopt_objects > 1) {
1122			vd = rvd->vdev_child[c];
1123			print_vdev_metaslab_header(vd);
1124
1125			for (m = 1; m < zopt_objects; m++) {
1126				if (zopt_object[m] < vd->vdev_ms_count)
1127					dump_metaslab(
1128					    vd->vdev_ms[zopt_object[m]]);
1129				else
1130					(void) fprintf(stderr, "bad metaslab "
1131					    "number %llu\n",
1132					    (u_longlong_t)zopt_object[m]);
1133			}
1134			(void) printf("\n");
1135			return;
1136		}
1137		children = c + 1;
1138	}
1139	for (; c < children; c++) {
1140		vd = rvd->vdev_child[c];
1141		print_vdev_metaslab_header(vd);
1142
1143		print_vdev_indirect(vd);
1144
1145		for (m = 0; m < vd->vdev_ms_count; m++)
1146			dump_metaslab(vd->vdev_ms[m]);
1147		(void) printf("\n");
1148	}
1149}
1150
1151static void
1152dump_log_spacemaps(spa_t *spa)
1153{
1154	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
1155		return;
1156
1157	(void) printf("\nLog Space Maps in Pool:\n");
1158	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1159	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1160		space_map_t *sm = NULL;
1161		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
1162		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
1163
1164		(void) printf("Log Spacemap object %llu txg %llu\n",
1165		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
1166		dump_spacemap(spa->spa_meta_objset, sm);
1167		space_map_close(sm);
1168	}
1169	(void) printf("\n");
1170}
1171
1172static void
1173dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
1174{
1175	const ddt_phys_t *ddp = dde->dde_phys;
1176	const ddt_key_t *ddk = &dde->dde_key;
1177	const char *types[4] = { "ditto", "single", "double", "triple" };
1178	char blkbuf[BP_SPRINTF_LEN];
1179	blkptr_t blk;
1180
1181	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1182		if (ddp->ddp_phys_birth == 0)
1183			continue;
1184		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
1185		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
1186		(void) printf("index %llx refcnt %llu %s %s\n",
1187		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
1188		    types[p], blkbuf);
1189	}
1190}
1191
1192static void
1193dump_dedup_ratio(const ddt_stat_t *dds)
1194{
1195	double rL, rP, rD, D, dedup, compress, copies;
1196
1197	if (dds->dds_blocks == 0)
1198		return;
1199
1200	rL = (double)dds->dds_ref_lsize;
1201	rP = (double)dds->dds_ref_psize;
1202	rD = (double)dds->dds_ref_dsize;
1203	D = (double)dds->dds_dsize;
1204
1205	dedup = rD / D;
1206	compress = rL / rP;
1207	copies = rD / rP;
1208
1209	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
1210	    "dedup * compress / copies = %.2f\n\n",
1211	    dedup, compress, copies, dedup * compress / copies);
1212}
1213
1214static void
1215dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
1216{
1217	char name[DDT_NAMELEN];
1218	ddt_entry_t dde;
1219	uint64_t walk = 0;
1220	dmu_object_info_t doi;
1221	uint64_t count, dspace, mspace;
1222	int error;
1223
1224	error = ddt_object_info(ddt, type, class, &doi);
1225
1226	if (error == ENOENT)
1227		return;
1228	ASSERT(error == 0);
1229
1230	if ((count = ddt_object_count(ddt, type, class)) == 0)
1231		return;
1232
1233	dspace = doi.doi_physical_blocks_512 << 9;
1234	mspace = doi.doi_fill_count * doi.doi_data_block_size;
1235
1236	ddt_object_name(ddt, type, class, name);
1237
1238	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
1239	    name,
1240	    (u_longlong_t)count,
1241	    (u_longlong_t)(dspace / count),
1242	    (u_longlong_t)(mspace / count));
1243
1244	if (dump_opt['D'] < 3)
1245		return;
1246
1247	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
1248
1249	if (dump_opt['D'] < 4)
1250		return;
1251
1252	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
1253		return;
1254
1255	(void) printf("%s contents:\n\n", name);
1256
1257	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
1258		dump_dde(ddt, &dde, walk);
1259
1260	ASSERT3U(error, ==, ENOENT);
1261
1262	(void) printf("\n");
1263}
1264
1265static void
1266dump_all_ddts(spa_t *spa)
1267{
1268	ddt_histogram_t ddh_total;
1269	ddt_stat_t dds_total;
1270
1271	bzero(&ddh_total, sizeof (ddh_total));
1272	bzero(&dds_total, sizeof (dds_total));
1273
1274	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1275		ddt_t *ddt = spa->spa_ddt[c];
1276		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1277			for (enum ddt_class class = 0; class < DDT_CLASSES;
1278			    class++) {
1279				dump_ddt(ddt, type, class);
1280			}
1281		}
1282	}
1283
1284	ddt_get_dedup_stats(spa, &dds_total);
1285
1286	if (dds_total.dds_blocks == 0) {
1287		(void) printf("All DDTs are empty\n");
1288		return;
1289	}
1290
1291	(void) printf("\n");
1292
1293	if (dump_opt['D'] > 1) {
1294		(void) printf("DDT histogram (aggregated over all DDTs):\n");
1295		ddt_get_dedup_histogram(spa, &ddh_total);
1296		zpool_dump_ddt(&dds_total, &ddh_total);
1297	}
1298
1299	dump_dedup_ratio(&dds_total);
1300}
1301
1302static void
1303dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
1304{
1305	char *prefix = arg;
1306
1307	(void) printf("%s [%llu,%llu) length %llu\n",
1308	    prefix,
1309	    (u_longlong_t)start,
1310	    (u_longlong_t)(start + size),
1311	    (u_longlong_t)(size));
1312}
1313
1314static void
1315dump_dtl(vdev_t *vd, int indent)
1316{
1317	spa_t *spa = vd->vdev_spa;
1318	boolean_t required;
1319	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
1320		"outage" };
1321	char prefix[256];
1322
1323	spa_vdev_state_enter(spa, SCL_NONE);
1324	required = vdev_dtl_required(vd);
1325	(void) spa_vdev_state_exit(spa, NULL, 0);
1326
1327	if (indent == 0)
1328		(void) printf("\nDirty time logs:\n\n");
1329
1330	(void) printf("\t%*s%s [%s]\n", indent, "",
1331	    vd->vdev_path ? vd->vdev_path :
1332	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
1333	    required ? "DTL-required" : "DTL-expendable");
1334
1335	for (int t = 0; t < DTL_TYPES; t++) {
1336		range_tree_t *rt = vd->vdev_dtl[t];
1337		if (range_tree_space(rt) == 0)
1338			continue;
1339		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
1340		    indent + 2, "", name[t]);
1341		range_tree_walk(rt, dump_dtl_seg, prefix);
1342		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
1343			dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
1344	}
1345
1346	for (unsigned c = 0; c < vd->vdev_children; c++)
1347		dump_dtl(vd->vdev_child[c], indent + 4);
1348}
1349
1350static void
1351dump_history(spa_t *spa)
1352{
1353	nvlist_t **events = NULL;
1354	uint64_t resid, len, off = 0;
1355	uint_t num = 0;
1356	int error;
1357	time_t tsec;
1358	struct tm t;
1359	char tbuf[30];
1360	char internalstr[MAXPATHLEN];
1361
1362	char *buf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
1363	do {
1364		len = SPA_MAXBLOCKSIZE;
1365
1366		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
1367			(void) fprintf(stderr, "Unable to read history: "
1368			    "error %d\n", error);
1369			umem_free(buf, SPA_MAXBLOCKSIZE);
1370			return;
1371		}
1372
1373		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
1374			break;
1375
1376		off -= resid;
1377	} while (len != 0);
1378	umem_free(buf, SPA_MAXBLOCKSIZE);
1379
1380	(void) printf("\nHistory:\n");
1381	for (unsigned i = 0; i < num; i++) {
1382		uint64_t time, txg, ievent;
1383		char *cmd, *intstr;
1384		boolean_t printed = B_FALSE;
1385
1386		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
1387		    &time) != 0)
1388			goto next;
1389		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
1390		    &cmd) != 0) {
1391			if (nvlist_lookup_uint64(events[i],
1392			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
1393				goto next;
1394			verify(nvlist_lookup_uint64(events[i],
1395			    ZPOOL_HIST_TXG, &txg) == 0);
1396			verify(nvlist_lookup_string(events[i],
1397			    ZPOOL_HIST_INT_STR, &intstr) == 0);
1398			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
1399				goto next;
1400
1401			(void) snprintf(internalstr,
1402			    sizeof (internalstr),
1403			    "[internal %s txg:%ju] %s",
1404			    zfs_history_event_names[ievent], (uintmax_t)txg,
1405			    intstr);
1406			cmd = internalstr;
1407		}
1408		tsec = time;
1409		(void) localtime_r(&tsec, &t);
1410		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
1411		(void) printf("%s %s\n", tbuf, cmd);
1412		printed = B_TRUE;
1413
1414next:
1415		if (dump_opt['h'] > 1) {
1416			if (!printed)
1417				(void) printf("unrecognized record:\n");
1418			dump_nvlist(events[i], 2);
1419		}
1420	}
1421}
1422
1423/*ARGSUSED*/
1424static void
1425dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
1426{
1427}
1428
1429static uint64_t
1430blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
1431    const zbookmark_phys_t *zb)
1432{
1433	if (dnp == NULL) {
1434		ASSERT(zb->zb_level < 0);
1435		if (zb->zb_object == 0)
1436			return (zb->zb_blkid);
1437		return (zb->zb_blkid * BP_GET_LSIZE(bp));
1438	}
1439
1440	ASSERT(zb->zb_level >= 0);
1441
1442	return ((zb->zb_blkid <<
1443	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
1444	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
1445}
1446
1447static void
1448snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
1449{
1450	const dva_t *dva = bp->blk_dva;
1451	unsigned int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
1452
1453	if (dump_opt['b'] >= 6) {
1454		snprintf_blkptr(blkbuf, buflen, bp);
1455		return;
1456	}
1457
1458	if (BP_IS_EMBEDDED(bp)) {
1459		(void) sprintf(blkbuf,
1460		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
1461		    (int)BPE_GET_ETYPE(bp),
1462		    (u_longlong_t)BPE_GET_LSIZE(bp),
1463		    (u_longlong_t)BPE_GET_PSIZE(bp),
1464		    (u_longlong_t)bp->blk_birth);
1465		return;
1466	}
1467
1468	blkbuf[0] = '\0';
1469	for (unsigned int i = 0; i < ndvas; i++)
1470		(void) snprintf(blkbuf + strlen(blkbuf),
1471		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
1472		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
1473		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
1474		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
1475
1476	if (BP_IS_HOLE(bp)) {
1477		(void) snprintf(blkbuf + strlen(blkbuf),
1478		    buflen - strlen(blkbuf),
1479		    "%llxL B=%llu",
1480		    (u_longlong_t)BP_GET_LSIZE(bp),
1481		    (u_longlong_t)bp->blk_birth);
1482	} else {
1483		(void) snprintf(blkbuf + strlen(blkbuf),
1484		    buflen - strlen(blkbuf),
1485		    "%llxL/%llxP F=%llu B=%llu/%llu",
1486		    (u_longlong_t)BP_GET_LSIZE(bp),
1487		    (u_longlong_t)BP_GET_PSIZE(bp),
1488		    (u_longlong_t)BP_GET_FILL(bp),
1489		    (u_longlong_t)bp->blk_birth,
1490		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
1491	}
1492}
1493
1494static void
1495print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
1496    const dnode_phys_t *dnp)
1497{
1498	char blkbuf[BP_SPRINTF_LEN];
1499	int l;
1500
1501	if (!BP_IS_EMBEDDED(bp)) {
1502		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
1503		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
1504	}
1505
1506	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
1507
1508	ASSERT(zb->zb_level >= 0);
1509
1510	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
1511		if (l == zb->zb_level) {
1512			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
1513		} else {
1514			(void) printf(" ");
1515		}
1516	}
1517
1518	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
1519	(void) printf("%s\n", blkbuf);
1520}
1521
1522static int
1523visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
1524    blkptr_t *bp, const zbookmark_phys_t *zb)
1525{
1526	int err = 0;
1527
1528	if (bp->blk_birth == 0)
1529		return (0);
1530
1531	print_indirect(bp, zb, dnp);
1532
1533	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
1534		arc_flags_t flags = ARC_FLAG_WAIT;
1535		int i;
1536		blkptr_t *cbp;
1537		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
1538		arc_buf_t *buf;
1539		uint64_t fill = 0;
1540
1541		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
1542		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
1543		if (err)
1544			return (err);
1545		ASSERT(buf->b_data);
1546
1547		/* recursively visit blocks below this */
1548		cbp = buf->b_data;
1549		for (i = 0; i < epb; i++, cbp++) {
1550			zbookmark_phys_t czb;
1551
1552			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
1553			    zb->zb_level - 1,
1554			    zb->zb_blkid * epb + i);
1555			err = visit_indirect(spa, dnp, cbp, &czb);
1556			if (err)
1557				break;
1558			fill += BP_GET_FILL(cbp);
1559		}
1560		if (!err)
1561			ASSERT3U(fill, ==, BP_GET_FILL(bp));
1562		arc_buf_destroy(buf, &buf);
1563	}
1564
1565	return (err);
1566}
1567
1568/*ARGSUSED*/
1569static void
1570dump_indirect(dnode_t *dn)
1571{
1572	dnode_phys_t *dnp = dn->dn_phys;
1573	int j;
1574	zbookmark_phys_t czb;
1575
1576	(void) printf("Indirect blocks:\n");
1577
1578	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
1579	    dn->dn_object, dnp->dn_nlevels - 1, 0);
1580	for (j = 0; j < dnp->dn_nblkptr; j++) {
1581		czb.zb_blkid = j;
1582		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
1583		    &dnp->dn_blkptr[j], &czb);
1584	}
1585
1586	(void) printf("\n");
1587}
1588
1589/*ARGSUSED*/
1590static void
1591dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
1592{
1593	dsl_dir_phys_t *dd = data;
1594	time_t crtime;
1595	char nice[32];
1596
1597	/* make sure nicenum has enough space */
1598	CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);
1599
1600	if (dd == NULL)
1601		return;
1602
1603	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
1604
1605	crtime = dd->dd_creation_time;
1606	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
1607	(void) printf("\t\thead_dataset_obj = %llu\n",
1608	    (u_longlong_t)dd->dd_head_dataset_obj);
1609	(void) printf("\t\tparent_dir_obj = %llu\n",
1610	    (u_longlong_t)dd->dd_parent_obj);
1611	(void) printf("\t\torigin_obj = %llu\n",
1612	    (u_longlong_t)dd->dd_origin_obj);
1613	(void) printf("\t\tchild_dir_zapobj = %llu\n",
1614	    (u_longlong_t)dd->dd_child_dir_zapobj);
1615	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
1616	(void) printf("\t\tused_bytes = %s\n", nice);
1617	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
1618	(void) printf("\t\tcompressed_bytes = %s\n", nice);
1619	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
1620	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
1621	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
1622	(void) printf("\t\tquota = %s\n", nice);
1623	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
1624	(void) printf("\t\treserved = %s\n", nice);
1625	(void) printf("\t\tprops_zapobj = %llu\n",
1626	    (u_longlong_t)dd->dd_props_zapobj);
1627	(void) printf("\t\tdeleg_zapobj = %llu\n",
1628	    (u_longlong_t)dd->dd_deleg_zapobj);
1629	(void) printf("\t\tflags = %llx\n",
1630	    (u_longlong_t)dd->dd_flags);
1631
1632#define	DO(which) \
1633	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
1634	    sizeof (nice)); \
1635	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
1636	DO(HEAD);
1637	DO(SNAP);
1638	DO(CHILD);
1639	DO(CHILD_RSRV);
1640	DO(REFRSRV);
1641#undef DO
1642	(void) printf("\t\tclones = %llu\n",
1643	    (u_longlong_t)dd->dd_clones);
1644}
1645
1646/*ARGSUSED*/
1647static void
1648dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
1649{
1650	dsl_dataset_phys_t *ds = data;
1651	time_t crtime;
1652	char used[32], compressed[32], uncompressed[32], unique[32];
1653	char blkbuf[BP_SPRINTF_LEN];
1654
1655	/* make sure nicenum has enough space */
1656	CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
1657	CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
1658	CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
1659	CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);
1660
1661	if (ds == NULL)
1662		return;
1663
1664	ASSERT(size == sizeof (*ds));
1665	crtime = ds->ds_creation_time;
1666	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
1667	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
1668	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
1669	    sizeof (uncompressed));
1670	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
1671	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
1672
1673	(void) printf("\t\tdir_obj = %llu\n",
1674	    (u_longlong_t)ds->ds_dir_obj);
1675	(void) printf("\t\tprev_snap_obj = %llu\n",
1676	    (u_longlong_t)ds->ds_prev_snap_obj);
1677	(void) printf("\t\tprev_snap_txg = %llu\n",
1678	    (u_longlong_t)ds->ds_prev_snap_txg);
1679	(void) printf("\t\tnext_snap_obj = %llu\n",
1680	    (u_longlong_t)ds->ds_next_snap_obj);
1681	(void) printf("\t\tsnapnames_zapobj = %llu\n",
1682	    (u_longlong_t)ds->ds_snapnames_zapobj);
1683	(void) printf("\t\tnum_children = %llu\n",
1684	    (u_longlong_t)ds->ds_num_children);
1685	(void) printf("\t\tuserrefs_obj = %llu\n",
1686	    (u_longlong_t)ds->ds_userrefs_obj);
1687	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
1688	(void) printf("\t\tcreation_txg = %llu\n",
1689	    (u_longlong_t)ds->ds_creation_txg);
1690	(void) printf("\t\tdeadlist_obj = %llu\n",
1691	    (u_longlong_t)ds->ds_deadlist_obj);
1692	(void) printf("\t\tused_bytes = %s\n", used);
1693	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
1694	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
1695	(void) printf("\t\tunique = %s\n", unique);
1696	(void) printf("\t\tfsid_guid = %llu\n",
1697	    (u_longlong_t)ds->ds_fsid_guid);
1698	(void) printf("\t\tguid = %llu\n",
1699	    (u_longlong_t)ds->ds_guid);
1700	(void) printf("\t\tflags = %llx\n",
1701	    (u_longlong_t)ds->ds_flags);
1702	(void) printf("\t\tnext_clones_obj = %llu\n",
1703	    (u_longlong_t)ds->ds_next_clones_obj);
1704	(void) printf("\t\tprops_obj = %llu\n",
1705	    (u_longlong_t)ds->ds_props_obj);
1706	(void) printf("\t\tbp = %s\n", blkbuf);
1707}
1708
1709/* ARGSUSED */
1710static int
1711dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1712{
1713	char blkbuf[BP_SPRINTF_LEN];
1714
1715	if (bp->blk_birth != 0) {
1716		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
1717		(void) printf("\t%s\n", blkbuf);
1718	}
1719	return (0);
1720}
1721
1722static void
1723dump_bptree(objset_t *os, uint64_t obj, const char *name)
1724{
1725	char bytes[32];
1726	bptree_phys_t *bt;
1727	dmu_buf_t *db;
1728
1729	/* make sure nicenum has enough space */
1730	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
1731
1732	if (dump_opt['d'] < 3)
1733		return;
1734
1735	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
1736	bt = db->db_data;
1737	zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
1738	(void) printf("\n    %s: %llu datasets, %s\n",
1739	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
1740	dmu_buf_rele(db, FTAG);
1741
1742	if (dump_opt['d'] < 5)
1743		return;
1744
1745	(void) printf("\n");
1746
1747	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
1748}
1749
1750/* ARGSUSED */
1751static int
1752dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1753{
1754	char blkbuf[BP_SPRINTF_LEN];
1755
1756	ASSERT(bp->blk_birth != 0);
1757	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
1758	(void) printf("\t%s\n", blkbuf);
1759	return (0);
1760}
1761
1762static void
1763dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
1764{
1765	char bytes[32];
1766	char comp[32];
1767	char uncomp[32];
1768
1769	/* make sure nicenum has enough space */
1770	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
1771	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
1772	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
1773
1774	if (dump_opt['d'] < 3)
1775		return;
1776
1777	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
1778	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
1779		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
1780		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
1781		(void) printf("    %*s: object %llu, %llu local blkptrs, "
1782		    "%llu subobjs in object %llu, %s (%s/%s comp)\n",
1783		    indent * 8, name,
1784		    (u_longlong_t)bpo->bpo_object,
1785		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1786		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
1787		    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
1788		    bytes, comp, uncomp);
1789
1790		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
1791			uint64_t subobj;
1792			bpobj_t subbpo;
1793			int error;
1794			VERIFY0(dmu_read(bpo->bpo_os,
1795			    bpo->bpo_phys->bpo_subobjs,
1796			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
1797			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
1798			if (error != 0) {
1799				(void) printf("ERROR %u while trying to open "
1800				    "subobj id %llu\n",
1801				    error, (u_longlong_t)subobj);
1802				continue;
1803			}
1804			dump_full_bpobj(&subbpo, "subobj", indent + 1);
1805			bpobj_close(&subbpo);
1806		}
1807	} else {
1808		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
1809		    indent * 8, name,
1810		    (u_longlong_t)bpo->bpo_object,
1811		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1812		    bytes);
1813	}
1814
1815	if (dump_opt['d'] < 5)
1816		return;
1817
1818
1819	if (indent == 0) {
1820		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
1821		(void) printf("\n");
1822	}
1823}
1824
1825static void
1826bpobj_count_refd(bpobj_t *bpo)
1827{
1828	mos_obj_refd(bpo->bpo_object);
1829
1830	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
1831		mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
1832		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
1833			uint64_t subobj;
1834			bpobj_t subbpo;
1835			int error;
1836			VERIFY0(dmu_read(bpo->bpo_os,
1837			    bpo->bpo_phys->bpo_subobjs,
1838			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
1839			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
1840			if (error != 0) {
1841				(void) printf("ERROR %u while trying to open "
1842				    "subobj id %llu\n",
1843				    error, (u_longlong_t)subobj);
1844				continue;
1845			}
1846			bpobj_count_refd(&subbpo);
1847			bpobj_close(&subbpo);
1848		}
1849	}
1850}
1851
1852static void
1853dump_deadlist(dsl_deadlist_t *dl)
1854{
1855	dsl_deadlist_entry_t *dle;
1856	uint64_t unused;
1857	char bytes[32];
1858	char comp[32];
1859	char uncomp[32];
1860	uint64_t empty_bpobj =
1861	    dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj;
1862
1863	/* force the tree to be loaded */
1864	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
1865
1866	if (dl->dl_oldfmt) {
1867		if (dl->dl_bpobj.bpo_object != empty_bpobj)
1868			bpobj_count_refd(&dl->dl_bpobj);
1869	} else {
1870		mos_obj_refd(dl->dl_object);
1871		for (dle = avl_first(&dl->dl_tree); dle;
1872		    dle = AVL_NEXT(&dl->dl_tree, dle)) {
1873			if (dle->dle_bpobj.bpo_object != empty_bpobj)
1874				bpobj_count_refd(&dle->dle_bpobj);
1875		}
1876	}
1877
1878	/* make sure nicenum has enough space */
1879	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
1880	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
1881	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
1882
1883	if (dump_opt['d'] < 3)
1884		return;
1885
1886	if (dl->dl_oldfmt) {
1887		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
1888		return;
1889	}
1890
1891	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
1892	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
1893	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
1894	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
1895	    bytes, comp, uncomp);
1896
1897	if (dump_opt['d'] < 4)
1898		return;
1899
1900	(void) printf("\n");
1901
1902	for (dle = avl_first(&dl->dl_tree); dle;
1903	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
1904		if (dump_opt['d'] >= 5) {
1905			char buf[128];
1906			(void) snprintf(buf, sizeof (buf),
1907			    "mintxg %llu -> obj %llu",
1908			    (longlong_t)dle->dle_mintxg,
1909			    (longlong_t)dle->dle_bpobj.bpo_object);
1910
1911			dump_full_bpobj(&dle->dle_bpobj, buf, 0);
1912		} else {
1913			(void) printf("mintxg %llu -> obj %llu\n",
1914			    (longlong_t)dle->dle_mintxg,
1915			    (longlong_t)dle->dle_bpobj.bpo_object);
1916		}
1917	}
1918}
1919
1920static avl_tree_t idx_tree;
1921static avl_tree_t domain_tree;
1922static boolean_t fuid_table_loaded;
1923static objset_t *sa_os = NULL;
1924static sa_attr_type_t *sa_attr_table = NULL;
1925
1926static int
1927open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp)
1928{
1929	int err;
1930	uint64_t sa_attrs = 0;
1931	uint64_t version = 0;
1932
1933	VERIFY3P(sa_os, ==, NULL);
1934	err = dmu_objset_own(path, type, B_TRUE, B_FALSE, tag, osp);
1935	if (err != 0) {
1936		(void) fprintf(stderr, "failed to own dataset '%s': %s\n", path,
1937		    strerror(err));
1938		return (err);
1939	}
1940
1941	if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) {
1942		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1943		    8, 1, &version);
1944		if (version >= ZPL_VERSION_SA) {
1945			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
1946			    8, 1, &sa_attrs);
1947		}
1948		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
1949		    &sa_attr_table);
1950		if (err != 0) {
1951			(void) fprintf(stderr, "sa_setup failed: %s\n",
1952			    strerror(err));
1953			dmu_objset_disown(*osp, B_FALSE, tag);
1954			*osp = NULL;
1955		}
1956	}
1957	sa_os = *osp;
1958
1959	return (0);
1960}
1961
1962static void
1963close_objset(objset_t *os, void *tag)
1964{
1965	VERIFY3P(os, ==, sa_os);
1966	if (os->os_sa != NULL)
1967		sa_tear_down(os);
1968	dmu_objset_disown(os, B_FALSE, tag);
1969	sa_attr_table = NULL;
1970	sa_os = NULL;
1971}
1972
1973static void
1974fuid_table_destroy()
1975{
1976	if (fuid_table_loaded) {
1977		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
1978		fuid_table_loaded = B_FALSE;
1979	}
1980}
1981
1982/*
1983 * print uid or gid information.
1984 * For normal POSIX id just the id is printed in decimal format.
1985 * For CIFS files with FUID the fuid is printed in hex followed by
1986 * the domain-rid string.
1987 */
1988static void
1989print_idstr(uint64_t id, const char *id_type)
1990{
1991	if (FUID_INDEX(id)) {
1992		char *domain;
1993
1994		domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
1995		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
1996		    (u_longlong_t)id, domain, (int)FUID_RID(id));
1997	} else {
1998		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
1999	}
2000
2001}
2002
2003static void
2004dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
2005{
2006	uint32_t uid_idx, gid_idx;
2007
2008	uid_idx = FUID_INDEX(uid);
2009	gid_idx = FUID_INDEX(gid);
2010
2011	/* Load domain table, if not already loaded */
2012	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
2013		uint64_t fuid_obj;
2014
2015		/* first find the fuid object.  It lives in the master node */
2016		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
2017		    8, 1, &fuid_obj) == 0);
2018		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
2019		(void) zfs_fuid_table_load(os, fuid_obj,
2020		    &idx_tree, &domain_tree);
2021		fuid_table_loaded = B_TRUE;
2022	}
2023
2024	print_idstr(uid, "uid");
2025	print_idstr(gid, "gid");
2026}
2027
2028/*ARGSUSED*/
2029static void
2030dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
2031{
2032	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
2033	sa_handle_t *hdl;
2034	uint64_t xattr, rdev, gen;
2035	uint64_t uid, gid, mode, fsize, parent, links;
2036	uint64_t pflags;
2037	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
2038	time_t z_crtime, z_atime, z_mtime, z_ctime;
2039	sa_bulk_attr_t bulk[12];
2040	int idx = 0;
2041	int error;
2042
2043	VERIFY3P(os, ==, sa_os);
2044	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
2045		(void) printf("Failed to get handle for SA znode\n");
2046		return;
2047	}
2048
2049	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
2050	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
2051	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
2052	    &links, 8);
2053	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
2054	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
2055	    &mode, 8);
2056	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
2057	    NULL, &parent, 8);
2058	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
2059	    &fsize, 8);
2060	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
2061	    acctm, 16);
2062	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
2063	    modtm, 16);
2064	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
2065	    crtm, 16);
2066	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
2067	    chgtm, 16);
2068	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
2069	    &pflags, 8);
2070
2071	if (sa_bulk_lookup(hdl, bulk, idx)) {
2072		(void) sa_handle_destroy(hdl);
2073		return;
2074	}
2075
2076	z_crtime = (time_t)crtm[0];
2077	z_atime = (time_t)acctm[0];
2078	z_mtime = (time_t)modtm[0];
2079	z_ctime = (time_t)chgtm[0];
2080
2081	if (dump_opt['d'] > 4) {
2082		error = zfs_obj_to_path(os, object, path, sizeof (path));
2083		if (error == ESTALE) {
2084			(void) snprintf(path, sizeof (path), "on delete queue");
2085		} else if (error != 0) {
2086			leaked_objects++;
2087			(void) snprintf(path, sizeof (path),
2088			    "path not found, possibly leaked");
2089		}
2090		(void) printf("\tpath	%s\n", path);
2091	}
2092	dump_uidgid(os, uid, gid);
2093	(void) printf("\tatime	%s", ctime(&z_atime));
2094	(void) printf("\tmtime	%s", ctime(&z_mtime));
2095	(void) printf("\tctime	%s", ctime(&z_ctime));
2096	(void) printf("\tcrtime	%s", ctime(&z_crtime));
2097	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
2098	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
2099	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
2100	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
2101	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
2102	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
2103	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
2104		uint64_t projid;
2105
2106		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
2107		    sizeof (uint64_t)) == 0)
2108			(void) printf("\tprojid	%llu\n", (u_longlong_t)projid);
2109	}
2110	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
2111	    sizeof (uint64_t)) == 0)
2112		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
2113	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
2114	    sizeof (uint64_t)) == 0)
2115		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
2116	sa_handle_destroy(hdl);
2117}
2118
2119/*ARGSUSED*/
2120static void
2121dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
2122{
2123}
2124
2125/*ARGSUSED*/
2126static void
2127dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
2128{
2129}
2130
2131
2132static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
2133	dump_none,		/* unallocated			*/
2134	dump_zap,		/* object directory		*/
2135	dump_uint64,		/* object array			*/
2136	dump_none,		/* packed nvlist		*/
2137	dump_packed_nvlist,	/* packed nvlist size		*/
2138	dump_none,		/* bpobj			*/
2139	dump_bpobj,		/* bpobj header			*/
2140	dump_none,		/* SPA space map header		*/
2141	dump_none,		/* SPA space map		*/
2142	dump_none,		/* ZIL intent log		*/
2143	dump_dnode,		/* DMU dnode			*/
2144	dump_dmu_objset,	/* DMU objset			*/
2145	dump_dsl_dir,		/* DSL directory		*/
2146	dump_zap,		/* DSL directory child map	*/
2147	dump_zap,		/* DSL dataset snap map		*/
2148	dump_zap,		/* DSL props			*/
2149	dump_dsl_dataset,	/* DSL dataset			*/
2150	dump_znode,		/* ZFS znode			*/
2151	dump_acl,		/* ZFS V0 ACL			*/
2152	dump_uint8,		/* ZFS plain file		*/
2153	dump_zpldir,		/* ZFS directory		*/
2154	dump_zap,		/* ZFS master node		*/
2155	dump_zap,		/* ZFS delete queue		*/
2156	dump_uint8,		/* zvol object			*/
2157	dump_zap,		/* zvol prop			*/
2158	dump_uint8,		/* other uint8[]		*/
2159	dump_uint64,		/* other uint64[]		*/
2160	dump_zap,		/* other ZAP			*/
2161	dump_zap,		/* persistent error log		*/
2162	dump_uint8,		/* SPA history			*/
2163	dump_history_offsets,	/* SPA history offsets		*/
2164	dump_zap,		/* Pool properties		*/
2165	dump_zap,		/* DSL permissions		*/
2166	dump_acl,		/* ZFS ACL			*/
2167	dump_uint8,		/* ZFS SYSACL			*/
2168	dump_none,		/* FUID nvlist			*/
2169	dump_packed_nvlist,	/* FUID nvlist size		*/
2170	dump_zap,		/* DSL dataset next clones	*/
2171	dump_zap,		/* DSL scrub queue		*/
2172	dump_zap,		/* ZFS user/group/project used	*/
2173	dump_zap,		/* ZFS user/group/project quota	*/
2174	dump_zap,		/* snapshot refcount tags	*/
2175	dump_ddt_zap,		/* DDT ZAP object		*/
2176	dump_zap,		/* DDT statistics		*/
2177	dump_znode,		/* SA object			*/
2178	dump_zap,		/* SA Master Node		*/
2179	dump_sa_attrs,		/* SA attribute registration	*/
2180	dump_sa_layouts,	/* SA attribute layouts		*/
2181	dump_zap,		/* DSL scrub translations	*/
2182	dump_none,		/* fake dedup BP		*/
2183	dump_zap,		/* deadlist			*/
2184	dump_none,		/* deadlist hdr			*/
2185	dump_zap,		/* dsl clones			*/
2186	dump_bpobj_subobjs,	/* bpobj subobjs		*/
2187	dump_unknown,		/* Unknown type, must be last	*/
2188};
2189
2190static void
2191dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
2192    uint64_t *dnode_slots_used)
2193{
2194	dmu_buf_t *db = NULL;
2195	dmu_object_info_t doi;
2196	dnode_t *dn;
2197	boolean_t dnode_held = B_FALSE;
2198	void *bonus = NULL;
2199	size_t bsize = 0;
2200	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
2201	char bonus_size[32];
2202	char aux[50];
2203	int error;
2204
2205	/* make sure nicenum has enough space */
2206	CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
2207	CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
2208	CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
2209	CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
2210	CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
2211
2212	if (*print_header) {
2213		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s  %5s  %6s  %s\n",
2214		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
2215		    "lsize", "%full", "type");
2216		*print_header = 0;
2217	}
2218
2219	if (object == 0) {
2220		dn = DMU_META_DNODE(os);
2221		dmu_object_info_from_dnode(dn, &doi);
2222	} else {
2223		/*
2224		 * Encrypted datasets will have sensitive bonus buffers
2225		 * encrypted. Therefore we cannot hold the bonus buffer and
2226		 * must hold the dnode itself instead.
2227		 */
2228		error = dmu_object_info(os, object, &doi);
2229		if (error)
2230			fatal("dmu_object_info() failed, errno %u", error);
2231
2232		if (os->os_encrypted &&
2233		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
2234			error = dnode_hold(os, object, FTAG, &dn);
2235			if (error)
2236				fatal("dnode_hold() failed, errno %u", error);
2237			dnode_held = B_TRUE;
2238		} else {
2239			error = dmu_bonus_hold(os, object, FTAG, &db);
2240			if (error)
2241				fatal("dmu_bonus_hold(%llu) failed, errno %u",
2242				    object, error);
2243			bonus = db->db_data;
2244			bsize = db->db_size;
2245			dn = DB_DNODE((dmu_buf_impl_t *)db);
2246		}
2247	}
2248
2249	if (dnode_slots_used != NULL)
2250		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
2251
2252	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
2253	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
2254	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
2255	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
2256	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
2257	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
2258	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
2259	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
2260	    doi.doi_max_offset);
2261
2262	aux[0] = '\0';
2263
2264	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
2265		(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
2266		    ZDB_CHECKSUM_NAME(doi.doi_checksum));
2267	}
2268
2269	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
2270		(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
2271		    ZDB_COMPRESS_NAME(doi.doi_compress));
2272	}
2273
2274	(void) printf("%10" PRIu64
2275	    "  %3u  %5s  %5s  %5s  %5s  %5s  %6s  %s%s\n",
2276	    object, doi.doi_indirection, iblk, dblk,
2277	    asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
2278
2279	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
2280		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %5s  %6s  %s\n",
2281		    "", "", "", "", "", "", bonus_size, "bonus",
2282		    ZDB_OT_NAME(doi.doi_bonus_type));
2283	}
2284
2285	if (verbosity >= 4) {
2286		(void) printf("\tdnode flags: %s%s%s%s\n",
2287		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
2288		    "USED_BYTES " : "",
2289		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
2290		    "USERUSED_ACCOUNTED " : "",
2291		    (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
2292		    "USEROBJUSED_ACCOUNTED " : "",
2293		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
2294		    "SPILL_BLKPTR" : "");
2295		(void) printf("\tdnode maxblkid: %llu\n",
2296		    (longlong_t)dn->dn_phys->dn_maxblkid);
2297
2298		if (!dnode_held) {
2299			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
2300			    object, bonus, bsize);
2301		} else {
2302			(void) printf("\t\t(bonus encrypted)\n");
2303		}
2304
2305		if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) {
2306			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
2307			    NULL, 0);
2308		} else {
2309			(void) printf("\t\t(object encrypted)\n");
2310		}
2311
2312		*print_header = 1;
2313	}
2314
2315	if (verbosity >= 5)
2316		dump_indirect(dn);
2317
2318	if (verbosity >= 5) {
2319		/*
2320		 * Report the list of segments that comprise the object.
2321		 */
2322		uint64_t start = 0;
2323		uint64_t end;
2324		uint64_t blkfill = 1;
2325		int minlvl = 1;
2326
2327		if (dn->dn_type == DMU_OT_DNODE) {
2328			minlvl = 0;
2329			blkfill = DNODES_PER_BLOCK;
2330		}
2331
2332		for (;;) {
2333			char segsize[32];
2334			/* make sure nicenum has enough space */
2335			CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
2336			error = dnode_next_offset(dn,
2337			    0, &start, minlvl, blkfill, 0);
2338			if (error)
2339				break;
2340			end = start;
2341			error = dnode_next_offset(dn,
2342			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
2343			zdb_nicenum(end - start, segsize, sizeof (segsize));
2344			(void) printf("\t\tsegment [%016llx, %016llx)"
2345			    " size %5s\n", (u_longlong_t)start,
2346			    (u_longlong_t)end, segsize);
2347			if (error)
2348				break;
2349			start = end;
2350		}
2351	}
2352
2353	if (db != NULL)
2354		dmu_buf_rele(db, FTAG);
2355	if (dnode_held)
2356		dnode_rele(dn, FTAG);
2357}
2358
2359static void
2360count_dir_mos_objects(dsl_dir_t *dd)
2361{
2362	mos_obj_refd(dd->dd_object);
2363	mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
2364	mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
2365	mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
2366	mos_obj_refd(dsl_dir_phys(dd)->dd_clones);
2367}
2368
2369static void
2370count_ds_mos_objects(dsl_dataset_t *ds)
2371{
2372	mos_obj_refd(ds->ds_object);
2373	mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
2374	mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
2375	mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
2376	mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
2377
2378	if (!dsl_dataset_is_snapshot(ds)) {
2379		count_dir_mos_objects(ds->ds_dir);
2380	}
2381}
2382
2383static const char *objset_types[DMU_OST_NUMTYPES] = {
2384	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
2385
2386static void
2387dump_dir(objset_t *os)
2388{
2389	dmu_objset_stats_t dds;
2390	uint64_t object, object_count;
2391	uint64_t refdbytes, usedobjs, scratch;
2392	char numbuf[32];
2393	char blkbuf[BP_SPRINTF_LEN + 20];
2394	char osname[ZFS_MAX_DATASET_NAME_LEN];
2395	const char *type = "UNKNOWN";
2396	int verbosity = dump_opt['d'];
2397	int print_header = 1;
2398	unsigned i;
2399	int error;
2400	uint64_t total_slots_used = 0;
2401	uint64_t max_slot_used = 0;
2402	uint64_t dnode_slots;
2403
2404	/* make sure nicenum has enough space */
2405	CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
2406
2407	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
2408	dmu_objset_fast_stat(os, &dds);
2409	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
2410
2411	if (dds.dds_type < DMU_OST_NUMTYPES)
2412		type = objset_types[dds.dds_type];
2413
2414	if (dds.dds_type == DMU_OST_META) {
2415		dds.dds_creation_txg = TXG_INITIAL;
2416		usedobjs = BP_GET_FILL(os->os_rootbp);
2417		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
2418		    dd_used_bytes;
2419	} else {
2420		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
2421	}
2422
2423	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
2424
2425	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
2426
2427	if (verbosity >= 4) {
2428		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
2429		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
2430		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
2431	} else {
2432		blkbuf[0] = '\0';
2433	}
2434
2435	dmu_objset_name(os, osname);
2436
2437	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
2438	    "%s, %llu objects%s%s\n",
2439	    osname, type, (u_longlong_t)dmu_objset_id(os),
2440	    (u_longlong_t)dds.dds_creation_txg,
2441	    numbuf, (u_longlong_t)usedobjs, blkbuf,
2442	    (dds.dds_inconsistent) ? " (inconsistent)" : "");
2443
2444	if (zopt_objects != 0) {
2445		for (i = 0; i < zopt_objects; i++)
2446			dump_object(os, zopt_object[i], verbosity,
2447			    &print_header, NULL);
2448		(void) printf("\n");
2449		return;
2450	}
2451
2452	if (dump_opt['i'] != 0 || verbosity >= 2)
2453		dump_intent_log(dmu_objset_zil(os));
2454
2455	if (dmu_objset_ds(os) != NULL) {
2456		dsl_dataset_t *ds = dmu_objset_ds(os);
2457		dump_deadlist(&ds->ds_deadlist);
2458
2459		if (dsl_dataset_remap_deadlist_exists(ds)) {
2460			(void) printf("ds_remap_deadlist:\n");
2461			dump_deadlist(&ds->ds_remap_deadlist);
2462		}
2463		count_ds_mos_objects(ds);
2464	}
2465
2466	if (verbosity < 2)
2467		return;
2468
2469	if (BP_IS_HOLE(os->os_rootbp))
2470		return;
2471
2472	dump_object(os, 0, verbosity, &print_header, NULL);
2473	object_count = 0;
2474	if (DMU_USERUSED_DNODE(os) != NULL &&
2475	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
2476		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
2477		    NULL);
2478		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
2479		    NULL);
2480	}
2481
2482	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
2483	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
2484		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
2485		    &print_header, NULL);
2486
2487	object = 0;
2488	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
2489		dump_object(os, object, verbosity, &print_header, &dnode_slots);
2490		object_count++;
2491		total_slots_used += dnode_slots;
2492		max_slot_used = object + dnode_slots - 1;
2493	}
2494
2495	(void) printf("\n");
2496
2497	(void) printf("    Dnode slots:\n");
2498	(void) printf("\tTotal used:    %10llu\n",
2499	    (u_longlong_t)total_slots_used);
2500	(void) printf("\tMax used:      %10llu\n",
2501	    (u_longlong_t)max_slot_used);
2502	(void) printf("\tPercent empty: %10lf\n",
2503	    (double)(max_slot_used - total_slots_used)*100 /
2504	    (double)max_slot_used);
2505
2506	(void) printf("\n");
2507
2508	if (error != ESRCH) {
2509		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
2510		abort();
2511	}
2512	if (leaked_objects != 0) {
2513		(void) printf("%d potentially leaked objects detected\n",
2514		    leaked_objects);
2515		leaked_objects = 0;
2516	}
2517
2518	ASSERT3U(object_count, ==, usedobjs);
2519}
2520
2521static void
2522dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
2523{
2524	time_t timestamp = ub->ub_timestamp;
2525
2526	(void) printf("%s", header ? header : "");
2527	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
2528	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
2529	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
2530	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
2531	(void) printf("\ttimestamp = %llu UTC = %s",
2532	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
2533
2534	(void) printf("\tmmp_magic = %016llx\n",
2535	    (u_longlong_t)ub->ub_mmp_magic);
2536	if (MMP_VALID(ub)) {
2537		(void) printf("\tmmp_delay = %0llu\n",
2538		    (u_longlong_t)ub->ub_mmp_delay);
2539		if (MMP_SEQ_VALID(ub))
2540			(void) printf("\tmmp_seq = %u\n",
2541			    (unsigned int) MMP_SEQ(ub));
2542		if (MMP_FAIL_INT_VALID(ub))
2543			(void) printf("\tmmp_fail = %u\n",
2544			    (unsigned int) MMP_FAIL_INT(ub));
2545		if (MMP_INTERVAL_VALID(ub))
2546			(void) printf("\tmmp_write = %u\n",
2547			    (unsigned int) MMP_INTERVAL(ub));
2548		/* After MMP_* to make summarize_uberblock_mmp cleaner */
2549		(void) printf("\tmmp_valid = %x\n",
2550		    (unsigned int) ub->ub_mmp_config & 0xFF);
2551	}
2552
2553	if (dump_opt['u'] >= 3) {
2554		char blkbuf[BP_SPRINTF_LEN];
2555		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
2556		(void) printf("\trootbp = %s\n", blkbuf);
2557	}
2558	(void) printf("\tcheckpoint_txg = %llu\n",
2559	    (u_longlong_t)ub->ub_checkpoint_txg);
2560	(void) printf("%s", footer ? footer : "");
2561}
2562
2563static void
2564dump_config(spa_t *spa)
2565{
2566	dmu_buf_t *db;
2567	size_t nvsize = 0;
2568	int error = 0;
2569
2570
2571	error = dmu_bonus_hold(spa->spa_meta_objset,
2572	    spa->spa_config_object, FTAG, &db);
2573
2574	if (error == 0) {
2575		nvsize = *(uint64_t *)db->db_data;
2576		dmu_buf_rele(db, FTAG);
2577
2578		(void) printf("\nMOS Configuration:\n");
2579		dump_packed_nvlist(spa->spa_meta_objset,
2580		    spa->spa_config_object, (void *)&nvsize, 1);
2581	} else {
2582		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
2583		    (u_longlong_t)spa->spa_config_object, error);
2584	}
2585}
2586
2587static void
2588dump_cachefile(const char *cachefile)
2589{
2590	int fd;
2591	struct stat64 statbuf;
2592	char *buf;
2593	nvlist_t *config;
2594
2595	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
2596		(void) printf("cannot open '%s': %s\n", cachefile,
2597		    strerror(errno));
2598		exit(1);
2599	}
2600
2601	if (fstat64(fd, &statbuf) != 0) {
2602		(void) printf("failed to stat '%s': %s\n", cachefile,
2603		    strerror(errno));
2604		exit(1);
2605	}
2606
2607	if ((buf = malloc(statbuf.st_size)) == NULL) {
2608		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
2609		    (u_longlong_t)statbuf.st_size);
2610		exit(1);
2611	}
2612
2613	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
2614		(void) fprintf(stderr, "failed to read %llu bytes\n",
2615		    (u_longlong_t)statbuf.st_size);
2616		exit(1);
2617	}
2618
2619	(void) close(fd);
2620
2621	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
2622		(void) fprintf(stderr, "failed to unpack nvlist\n");
2623		exit(1);
2624	}
2625
2626	free(buf);
2627
2628	dump_nvlist(config, 0);
2629
2630	nvlist_free(config);
2631}
2632
2633#define	ZDB_MAX_UB_HEADER_SIZE 32
2634
2635static void
2636dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
2637{
2638	vdev_t vd;
2639	vdev_t *vdp = &vd;
2640	char header[ZDB_MAX_UB_HEADER_SIZE];
2641
2642	vd.vdev_ashift = ashift;
2643	vdp->vdev_top = vdp;
2644
2645	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
2646		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
2647		uberblock_t *ub = (void *)((char *)lbl + uoff);
2648
2649		if (uberblock_verify(ub))
2650			continue;
2651
2652		if ((dump_opt['u'] < 4) &&
2653		    (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
2654		    (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
2655			continue;
2656
2657		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
2658		    "Uberblock[%d]\n", i);
2659		dump_uberblock(ub, header, "");
2660	}
2661}
2662
2663static char curpath[PATH_MAX];
2664
2665/*
2666 * Iterate through the path components, recursively passing
2667 * current one's obj and remaining path until we find the obj
2668 * for the last one.
2669 */
2670static int
2671dump_path_impl(objset_t *os, uint64_t obj, char *name)
2672{
2673	int err;
2674	int header = 1;
2675	uint64_t child_obj;
2676	char *s;
2677	dmu_buf_t *db;
2678	dmu_object_info_t doi;
2679
2680	if ((s = strchr(name, '/')) != NULL)
2681		*s = '\0';
2682	err = zap_lookup(os, obj, name, 8, 1, &child_obj);
2683
2684	(void) strlcat(curpath, name, sizeof (curpath));
2685
2686	if (err != 0) {
2687		(void) fprintf(stderr, "failed to lookup %s: %s\n",
2688		    curpath, strerror(err));
2689		return (err);
2690	}
2691
2692	child_obj = ZFS_DIRENT_OBJ(child_obj);
2693	err = sa_buf_hold(os, child_obj, FTAG, &db);
2694	if (err != 0) {
2695		(void) fprintf(stderr,
2696		    "failed to get SA dbuf for obj %llu: %s\n",
2697		    (u_longlong_t)child_obj, strerror(err));
2698		return (EINVAL);
2699	}
2700	dmu_object_info_from_db(db, &doi);
2701	sa_buf_rele(db, FTAG);
2702
2703	if (doi.doi_bonus_type != DMU_OT_SA &&
2704	    doi.doi_bonus_type != DMU_OT_ZNODE) {
2705		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
2706		    doi.doi_bonus_type, (u_longlong_t)child_obj);
2707		return (EINVAL);
2708	}
2709
2710	if (dump_opt['v'] > 6) {
2711		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
2712		    (u_longlong_t)child_obj, curpath, doi.doi_type,
2713		    doi.doi_bonus_type);
2714	}
2715
2716	(void) strlcat(curpath, "/", sizeof (curpath));
2717
2718	switch (doi.doi_type) {
2719	case DMU_OT_DIRECTORY_CONTENTS:
2720		if (s != NULL && *(s + 1) != '\0')
2721			return (dump_path_impl(os, child_obj, s + 1));
2722		/*FALLTHROUGH*/
2723	case DMU_OT_PLAIN_FILE_CONTENTS:
2724		dump_object(os, child_obj, dump_opt['v'], &header, NULL);
2725		return (0);
2726	default:
2727		(void) fprintf(stderr, "object %llu has non-file/directory "
2728		    "type %d\n", (u_longlong_t)obj, doi.doi_type);
2729		break;
2730	}
2731
2732	return (EINVAL);
2733}
2734
2735/*
2736 * Dump the blocks for the object specified by path inside the dataset.
2737 */
2738static int
2739dump_path(char *ds, char *path)
2740{
2741	int err;
2742	objset_t *os;
2743	uint64_t root_obj;
2744
2745	err = open_objset(ds, DMU_OST_ZFS, FTAG, &os);
2746	if (err != 0)
2747		return (err);
2748
2749	err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
2750	if (err != 0) {
2751		(void) fprintf(stderr, "can't lookup root znode: %s\n",
2752		    strerror(err));
2753		dmu_objset_disown(os, B_FALSE, FTAG);
2754		return (EINVAL);
2755	}
2756
2757	(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
2758
2759	err = dump_path_impl(os, root_obj, path);
2760
2761	close_objset(os, FTAG);
2762	return (err);
2763}
2764
2765static int
2766dump_label(const char *dev)
2767{
2768	int fd;
2769	vdev_label_t label;
2770	char path[MAXPATHLEN];
2771	char *buf = label.vl_vdev_phys.vp_nvlist;
2772	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
2773	struct stat64 statbuf;
2774	uint64_t psize, ashift;
2775	boolean_t label_found = B_FALSE;
2776
2777	(void) strlcpy(path, dev, sizeof (path));
2778	if (dev[0] == '/') {
2779		if (strncmp(dev, ZFS_DISK_ROOTD,
2780		    strlen(ZFS_DISK_ROOTD)) == 0) {
2781			(void) snprintf(path, sizeof (path), "%s%s",
2782			    ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD));
2783		}
2784	} else if (stat64(path, &statbuf) != 0) {
2785		char *s;
2786
2787		(void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD,
2788		    dev);
2789		if (((s = strrchr(dev, 's')) == NULL &&
2790		    (s = strchr(dev, 'p')) == NULL) ||
2791		    !isdigit(*(s + 1)))
2792			(void) strlcat(path, "s0", sizeof (path));
2793	}
2794
2795	if ((fd = open64(path, O_RDONLY)) < 0) {
2796		(void) fprintf(stderr, "cannot open '%s': %s\n", path,
2797		    strerror(errno));
2798		exit(1);
2799	}
2800
2801	if (fstat64(fd, &statbuf) != 0) {
2802		(void) fprintf(stderr, "failed to stat '%s': %s\n", path,
2803		    strerror(errno));
2804		(void) close(fd);
2805		exit(1);
2806	}
2807
2808	if (S_ISBLK(statbuf.st_mode)) {
2809		(void) fprintf(stderr,
2810		    "cannot use '%s': character device required\n", path);
2811		(void) close(fd);
2812		exit(1);
2813	}
2814
2815	psize = statbuf.st_size;
2816	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
2817
2818	for (int l = 0; l < VDEV_LABELS; l++) {
2819		nvlist_t *config = NULL;
2820
2821		if (!dump_opt['q']) {
2822			(void) printf("------------------------------------\n");
2823			(void) printf("LABEL %d\n", l);
2824			(void) printf("------------------------------------\n");
2825		}
2826
2827		if (pread64(fd, &label, sizeof (label),
2828		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
2829			if (!dump_opt['q'])
2830				(void) printf("failed to read label %d\n", l);
2831			continue;
2832		}
2833
2834		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
2835			if (!dump_opt['q'])
2836				(void) printf("failed to unpack label %d\n", l);
2837			ashift = SPA_MINBLOCKSHIFT;
2838		} else {
2839			nvlist_t *vdev_tree = NULL;
2840
2841			if (!dump_opt['q'])
2842				dump_nvlist(config, 4);
2843			if ((nvlist_lookup_nvlist(config,
2844			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
2845			    (nvlist_lookup_uint64(vdev_tree,
2846			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
2847				ashift = SPA_MINBLOCKSHIFT;
2848			nvlist_free(config);
2849			label_found = B_TRUE;
2850		}
2851		if (dump_opt['u'])
2852			dump_label_uberblocks(&label, ashift);
2853	}
2854
2855	(void) close(fd);
2856
2857	return (label_found ? 0 : 2);
2858}
2859
2860static uint64_t dataset_feature_count[SPA_FEATURES];
2861static uint64_t remap_deadlist_count = 0;
2862
2863/*ARGSUSED*/
2864static int
2865dump_one_dir(const char *dsname, void *arg)
2866{
2867	int error;
2868	objset_t *os;
2869
2870	error = open_objset(dsname, DMU_OST_ANY, FTAG, &os);
2871	if (error != 0)
2872		return (0);
2873
2874	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
2875		if (!dmu_objset_ds(os)->ds_feature_inuse[f])
2876			continue;
2877		ASSERT(spa_feature_table[f].fi_flags &
2878		    ZFEATURE_FLAG_PER_DATASET);
2879		dataset_feature_count[f]++;
2880	}
2881
2882	if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
2883		remap_deadlist_count++;
2884	}
2885
2886	dump_dir(os);
2887	close_objset(os, FTAG);
2888	fuid_table_destroy();
2889	return (0);
2890}
2891
2892/*
2893 * Block statistics.
2894 */
2895#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
2896typedef struct zdb_blkstats {
2897	uint64_t zb_asize;
2898	uint64_t zb_lsize;
2899	uint64_t zb_psize;
2900	uint64_t zb_count;
2901	uint64_t zb_gangs;
2902	uint64_t zb_ditto_samevdev;
2903	uint64_t zb_ditto_same_ms;
2904	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
2905} zdb_blkstats_t;
2906
2907/*
2908 * Extended object types to report deferred frees and dedup auto-ditto blocks.
2909 */
2910#define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
2911#define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
2912#define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
2913#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
2914
2915static const char *zdb_ot_extname[] = {
2916	"deferred free",
2917	"dedup ditto",
2918	"other",
2919	"Total",
2920};
2921
2922#define	ZB_TOTAL	DN_MAX_LEVELS
2923
2924typedef struct zdb_cb {
2925	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
2926	uint64_t	zcb_removing_size;
2927	uint64_t	zcb_checkpoint_size;
2928	uint64_t	zcb_dedup_asize;
2929	uint64_t	zcb_dedup_blocks;
2930	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
2931	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
2932	    [BPE_PAYLOAD_SIZE];
2933	uint64_t	zcb_start;
2934	hrtime_t	zcb_lastprint;
2935	uint64_t	zcb_totalasize;
2936	uint64_t	zcb_errors[256];
2937	int		zcb_readfails;
2938	int		zcb_haderrors;
2939	spa_t		*zcb_spa;
2940	uint32_t	**zcb_vd_obsolete_counts;
2941} zdb_cb_t;
2942
2943/* test if two DVA offsets from same vdev are within the same metaslab */
2944static boolean_t
2945same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
2946{
2947	vdev_t *vd = vdev_lookup_top(spa, vdev);
2948	uint64_t ms_shift = vd->vdev_ms_shift;
2949
2950	return ((off1 >> ms_shift) == (off2 >> ms_shift));
2951}
2952
2953static void
2954zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
2955    dmu_object_type_t type)
2956{
2957	uint64_t refcnt = 0;
2958
2959	ASSERT(type < ZDB_OT_TOTAL);
2960
2961	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
2962		return;
2963
2964	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
2965
2966	for (int i = 0; i < 4; i++) {
2967		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
2968		int t = (i & 1) ? type : ZDB_OT_TOTAL;
2969		int equal;
2970		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
2971
2972		zb->zb_asize += BP_GET_ASIZE(bp);
2973		zb->zb_lsize += BP_GET_LSIZE(bp);
2974		zb->zb_psize += BP_GET_PSIZE(bp);
2975		zb->zb_count++;
2976
2977		/*
2978		 * The histogram is only big enough to record blocks up to
2979		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
2980		 * "other", bucket.
2981		 */
2982		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
2983		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
2984		zb->zb_psize_histogram[idx]++;
2985
2986		zb->zb_gangs += BP_COUNT_GANG(bp);
2987
2988		switch (BP_GET_NDVAS(bp)) {
2989		case 2:
2990			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2991			    DVA_GET_VDEV(&bp->blk_dva[1])) {
2992				zb->zb_ditto_samevdev++;
2993
2994				if (same_metaslab(zcb->zcb_spa,
2995				    DVA_GET_VDEV(&bp->blk_dva[0]),
2996				    DVA_GET_OFFSET(&bp->blk_dva[0]),
2997				    DVA_GET_OFFSET(&bp->blk_dva[1])))
2998					zb->zb_ditto_same_ms++;
2999			}
3000			break;
3001		case 3:
3002			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3003			    DVA_GET_VDEV(&bp->blk_dva[1])) +
3004			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3005			    DVA_GET_VDEV(&bp->blk_dva[2])) +
3006			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
3007			    DVA_GET_VDEV(&bp->blk_dva[2]));
3008			if (equal != 0) {
3009				zb->zb_ditto_samevdev++;
3010
3011				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3012				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
3013				    same_metaslab(zcb->zcb_spa,
3014				    DVA_GET_VDEV(&bp->blk_dva[0]),
3015				    DVA_GET_OFFSET(&bp->blk_dva[0]),
3016				    DVA_GET_OFFSET(&bp->blk_dva[1])))
3017					zb->zb_ditto_same_ms++;
3018				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
3019				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
3020				    same_metaslab(zcb->zcb_spa,
3021				    DVA_GET_VDEV(&bp->blk_dva[0]),
3022				    DVA_GET_OFFSET(&bp->blk_dva[0]),
3023				    DVA_GET_OFFSET(&bp->blk_dva[2])))
3024					zb->zb_ditto_same_ms++;
3025				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
3026				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
3027				    same_metaslab(zcb->zcb_spa,
3028				    DVA_GET_VDEV(&bp->blk_dva[1]),
3029				    DVA_GET_OFFSET(&bp->blk_dva[1]),
3030				    DVA_GET_OFFSET(&bp->blk_dva[2])))
3031					zb->zb_ditto_same_ms++;
3032			}
3033			break;
3034		}
3035	}
3036
3037	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
3038
3039	if (BP_IS_EMBEDDED(bp)) {
3040		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
3041		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
3042		    [BPE_GET_PSIZE(bp)]++;
3043		return;
3044	}
3045
3046	if (dump_opt['L'])
3047		return;
3048
3049	if (BP_GET_DEDUP(bp)) {
3050		ddt_t *ddt;
3051		ddt_entry_t *dde;
3052
3053		ddt = ddt_select(zcb->zcb_spa, bp);
3054		ddt_enter(ddt);
3055		dde = ddt_lookup(ddt, bp, B_FALSE);
3056
3057		if (dde == NULL) {
3058			refcnt = 0;
3059		} else {
3060			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
3061			ddt_phys_decref(ddp);
3062			refcnt = ddp->ddp_refcnt;
3063			if (ddt_phys_total_refcnt(dde) == 0)
3064				ddt_remove(ddt, dde);
3065		}
3066		ddt_exit(ddt);
3067	}
3068
3069	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
3070	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
3071	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
3072}
3073
3074static void
3075zdb_blkptr_done(zio_t *zio)
3076{
3077	spa_t *spa = zio->io_spa;
3078	blkptr_t *bp = zio->io_bp;
3079	int ioerr = zio->io_error;
3080	zdb_cb_t *zcb = zio->io_private;
3081	zbookmark_phys_t *zb = &zio->io_bookmark;
3082
3083	abd_free(zio->io_abd);
3084
3085	mutex_enter(&spa->spa_scrub_lock);
3086	spa->spa_load_verify_ios--;
3087	cv_broadcast(&spa->spa_scrub_io_cv);
3088
3089	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3090		char blkbuf[BP_SPRINTF_LEN];
3091
3092		zcb->zcb_haderrors = 1;
3093		zcb->zcb_errors[ioerr]++;
3094
3095		if (dump_opt['b'] >= 2)
3096			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3097		else
3098			blkbuf[0] = '\0';
3099
3100		(void) printf("zdb_blkptr_cb: "
3101		    "Got error %d reading "
3102		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
3103		    ioerr,
3104		    (u_longlong_t)zb->zb_objset,
3105		    (u_longlong_t)zb->zb_object,
3106		    (u_longlong_t)zb->zb_level,
3107		    (u_longlong_t)zb->zb_blkid,
3108		    blkbuf);
3109	}
3110	mutex_exit(&spa->spa_scrub_lock);
3111}
3112
3113static int
3114zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
3115    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
3116{
3117	zdb_cb_t *zcb = arg;
3118	dmu_object_type_t type;
3119	boolean_t is_metadata;
3120
3121	if (bp == NULL)
3122		return (0);
3123
3124	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
3125		char blkbuf[BP_SPRINTF_LEN];
3126		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3127		(void) printf("objset %llu object %llu "
3128		    "level %lld offset 0x%llx %s\n",
3129		    (u_longlong_t)zb->zb_objset,
3130		    (u_longlong_t)zb->zb_object,
3131		    (longlong_t)zb->zb_level,
3132		    (u_longlong_t)blkid2offset(dnp, bp, zb),
3133		    blkbuf);
3134	}
3135
3136	if (BP_IS_HOLE(bp))
3137		return (0);
3138
3139	type = BP_GET_TYPE(bp);
3140
3141	zdb_count_block(zcb, zilog, bp,
3142	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
3143
3144	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
3145
3146	if (!BP_IS_EMBEDDED(bp) &&
3147	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
3148		size_t size = BP_GET_PSIZE(bp);
3149		abd_t *abd = abd_alloc(size, B_FALSE);
3150		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
3151
3152		/* If it's an intent log block, failure is expected. */
3153		if (zb->zb_level == ZB_ZIL_LEVEL)
3154			flags |= ZIO_FLAG_SPECULATIVE;
3155
3156		mutex_enter(&spa->spa_scrub_lock);
3157		while (spa->spa_load_verify_ios > max_inflight)
3158			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3159		spa->spa_load_verify_ios++;
3160		mutex_exit(&spa->spa_scrub_lock);
3161
3162		zio_nowait(zio_read(NULL, spa, bp, abd, size,
3163		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
3164	}
3165
3166	zcb->zcb_readfails = 0;
3167
3168	/* only call gethrtime() every 100 blocks */
3169	static int iters;
3170	if (++iters > 100)
3171		iters = 0;
3172	else
3173		return (0);
3174
3175	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
3176		uint64_t now = gethrtime();
3177		char buf[10];
3178		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
3179		int kb_per_sec =
3180		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
3181		int sec_remaining =
3182		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
3183
3184		/* make sure nicenum has enough space */
3185		CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
3186
3187		zfs_nicenum(bytes, buf, sizeof (buf));
3188		(void) fprintf(stderr,
3189		    "\r%5s completed (%4dMB/s) "
3190		    "estimated time remaining: %uhr %02umin %02usec        ",
3191		    buf, kb_per_sec / 1024,
3192		    sec_remaining / 60 / 60,
3193		    sec_remaining / 60 % 60,
3194		    sec_remaining % 60);
3195
3196		zcb->zcb_lastprint = now;
3197	}
3198
3199	return (0);
3200}
3201
3202static void
3203zdb_leak(void *arg, uint64_t start, uint64_t size)
3204{
3205	vdev_t *vd = arg;
3206
3207	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
3208	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
3209}
3210
3211static metaslab_ops_t zdb_metaslab_ops = {
3212	NULL	/* alloc */
3213};
3214
3215typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme,
3216    uint64_t txg, void *arg);
3217
3218typedef struct unflushed_iter_cb_arg {
3219	spa_t *uic_spa;
3220	uint64_t uic_txg;
3221	void *uic_arg;
3222	zdb_log_sm_cb_t uic_cb;
3223} unflushed_iter_cb_arg_t;
3224
3225static int
3226iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
3227{
3228	unflushed_iter_cb_arg_t *uic = arg;
3229
3230	return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
3231}
3232
3233static void
3234iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
3235{
3236	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
3237		return;
3238
3239	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3240	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
3241	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
3242		space_map_t *sm = NULL;
3243		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
3244		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
3245
3246		unflushed_iter_cb_arg_t uic = {
3247			.uic_spa = spa,
3248			.uic_txg = sls->sls_txg,
3249			.uic_arg = arg,
3250			.uic_cb = cb
3251		};
3252
3253		VERIFY0(space_map_iterate(sm, space_map_length(sm),
3254		    iterate_through_spacemap_logs_cb, &uic));
3255		space_map_close(sm);
3256	}
3257	spa_config_exit(spa, SCL_CONFIG, FTAG);
3258}
3259
3260/* ARGSUSED */
3261static int
3262load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
3263    uint64_t txg, void *arg)
3264{
3265	spa_vdev_removal_t *svr = arg;
3266
3267	uint64_t offset = sme->sme_offset;
3268	uint64_t size = sme->sme_run;
3269
3270	/* skip vdevs we don't care about */
3271	if (sme->sme_vdev != svr->svr_vdev_id)
3272		return (0);
3273
3274	vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
3275	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3276	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
3277
3278	if (txg < metaslab_unflushed_txg(ms))
3279		return (0);
3280
3281	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3282	ASSERT(vim != NULL);
3283	if (offset >= vdev_indirect_mapping_max_offset(vim))
3284		return (0);
3285
3286	if (sme->sme_type == SM_ALLOC)
3287		range_tree_add(svr->svr_allocd_segs, offset, size);
3288	else
3289		range_tree_remove(svr->svr_allocd_segs, offset, size);
3290
3291	return (0);
3292}
3293
3294static void
3295zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
3296{
3297	ddt_bookmark_t ddb;
3298	ddt_entry_t dde;
3299	int error;
3300
3301	ASSERT(!dump_opt['L']);
3302
3303	bzero(&ddb, sizeof (ddb));
3304	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
3305		blkptr_t blk;
3306		ddt_phys_t *ddp = dde.dde_phys;
3307
3308		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
3309			return;
3310
3311		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
3312
3313		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
3314			if (ddp->ddp_phys_birth == 0)
3315				continue;
3316			ddt_bp_create(ddb.ddb_checksum,
3317			    &dde.dde_key, ddp, &blk);
3318			if (p == DDT_PHYS_DITTO) {
3319				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
3320			} else {
3321				zcb->zcb_dedup_asize +=
3322				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
3323				zcb->zcb_dedup_blocks++;
3324			}
3325		}
3326		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
3327		ddt_enter(ddt);
3328		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
3329		ddt_exit(ddt);
3330	}
3331
3332	ASSERT(error == ENOENT);
3333}
3334
3335/* ARGSUSED */
3336static void
3337claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3338    uint64_t size, void *arg)
3339{
3340	/*
3341	 * This callback was called through a remap from
3342	 * a device being removed. Therefore, the vdev that
3343	 * this callback is applied to is a concrete
3344	 * vdev.
3345	 */
3346	ASSERT(vdev_is_concrete(vd));
3347
3348	VERIFY0(metaslab_claim_impl(vd, offset, size,
3349	    spa_min_claim_txg(vd->vdev_spa)));
3350}
3351
3352static void
3353claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
3354{
3355	vdev_t *vd = arg;
3356
3357	vdev_indirect_ops.vdev_op_remap(vd, offset, size,
3358	    claim_segment_impl_cb, NULL);
3359}
3360
3361/*
3362 * After accounting for all allocated blocks that are directly referenced,
3363 * we might have missed a reference to a block from a partially complete
3364 * (and thus unused) indirect mapping object. We perform a secondary pass
3365 * through the metaslabs we have already mapped and claim the destination
3366 * blocks.
3367 */
3368static void
3369zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
3370{
3371	if (dump_opt['L'])
3372		return;
3373
3374	if (spa->spa_vdev_removal == NULL)
3375		return;
3376
3377	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3378
3379	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
3380	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
3381	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3382
3383	ASSERT0(range_tree_space(svr->svr_allocd_segs));
3384
3385	range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
3386	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
3387		metaslab_t *msp = vd->vdev_ms[msi];
3388
3389		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
3390			break;
3391
3392		ASSERT0(range_tree_space(allocs));
3393		if (msp->ms_sm != NULL)
3394			VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
3395		range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
3396	}
3397	range_tree_destroy(allocs);
3398
3399	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);
3400
3401	/*
3402	 * Clear everything past what has been synced,
3403	 * because we have not allocated mappings for
3404	 * it yet.
3405	 */
3406	range_tree_clear(svr->svr_allocd_segs,
3407	    vdev_indirect_mapping_max_offset(vim),
3408	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
3409
3410	zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
3411	range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
3412
3413	spa_config_exit(spa, SCL_CONFIG, FTAG);
3414}
3415
3416/* ARGSUSED */
3417static int
3418increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
3419{
3420	zdb_cb_t *zcb = arg;
3421	spa_t *spa = zcb->zcb_spa;
3422	vdev_t *vd;
3423	const dva_t *dva = &bp->blk_dva[0];
3424
3425	ASSERT(!dump_opt['L']);
3426	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
3427
3428	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3429	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
3430	ASSERT3P(vd, !=, NULL);
3431	spa_config_exit(spa, SCL_VDEV, FTAG);
3432
3433	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
3434	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
3435
3436	vdev_indirect_mapping_increment_obsolete_count(
3437	    vd->vdev_indirect_mapping,
3438	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
3439	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
3440
3441	return (0);
3442}
3443
3444static uint32_t *
3445zdb_load_obsolete_counts(vdev_t *vd)
3446{
3447	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3448	spa_t *spa = vd->vdev_spa;
3449	spa_condensing_indirect_phys_t *scip =
3450	    &spa->spa_condensing_indirect_phys;
3451	uint32_t *counts;
3452
3453	EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL);
3454	counts = vdev_indirect_mapping_load_obsolete_counts(vim);
3455	if (vd->vdev_obsolete_sm != NULL) {
3456		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3457		    vd->vdev_obsolete_sm);
3458	}
3459	if (scip->scip_vdev == vd->vdev_id &&
3460	    scip->scip_prev_obsolete_sm_object != 0) {
3461		space_map_t *prev_obsolete_sm = NULL;
3462		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
3463		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
3464		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3465		    prev_obsolete_sm);
3466		space_map_close(prev_obsolete_sm);
3467	}
3468	return (counts);
3469}
3470
3471typedef struct checkpoint_sm_exclude_entry_arg {
3472	vdev_t *cseea_vd;
3473	uint64_t cseea_checkpoint_size;
3474} checkpoint_sm_exclude_entry_arg_t;
3475
3476static int
3477checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
3478{
3479	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
3480	vdev_t *vd = cseea->cseea_vd;
3481	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
3482	uint64_t end = sme->sme_offset + sme->sme_run;
3483
3484	ASSERT(sme->sme_type == SM_FREE);
3485
3486	/*
3487	 * Since the vdev_checkpoint_sm exists in the vdev level
3488	 * and the ms_sm space maps exist in the metaslab level,
3489	 * an entry in the checkpoint space map could theoretically
3490	 * cross the boundaries of the metaslab that it belongs.
3491	 *
3492	 * In reality, because of the way that we populate and
3493	 * manipulate the checkpoint's space maps currently,
3494	 * there shouldn't be any entries that cross metaslabs.
3495	 * Hence the assertion below.
3496	 *
3497	 * That said, there is no fundamental requirement that
3498	 * the checkpoint's space map entries should not cross
3499	 * metaslab boundaries. So if needed we could add code
3500	 * that handles metaslab-crossing segments in the future.
3501	 */
3502	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
3503	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
3504
3505	/*
3506	 * By removing the entry from the allocated segments we
3507	 * also verify that the entry is there to begin with.
3508	 */
3509	mutex_enter(&ms->ms_lock);
3510	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
3511	mutex_exit(&ms->ms_lock);
3512
3513	cseea->cseea_checkpoint_size += sme->sme_run;
3514	return (0);
3515}
3516
3517static void
3518zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
3519{
3520	spa_t *spa = vd->vdev_spa;
3521	space_map_t *checkpoint_sm = NULL;
3522	uint64_t checkpoint_sm_obj;
3523
3524	/*
3525	 * If there is no vdev_top_zap, we are in a pool whose
3526	 * version predates the pool checkpoint feature.
3527	 */
3528	if (vd->vdev_top_zap == 0)
3529		return;
3530
3531	/*
3532	 * If there is no reference of the vdev_checkpoint_sm in
3533	 * the vdev_top_zap, then one of the following scenarios
3534	 * is true:
3535	 *
3536	 * 1] There is no checkpoint
3537	 * 2] There is a checkpoint, but no checkpointed blocks
3538	 *    have been freed yet
3539	 * 3] The current vdev is indirect
3540	 *
3541	 * In these cases we return immediately.
3542	 */
3543	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
3544	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
3545		return;
3546
3547	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
3548	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
3549	    &checkpoint_sm_obj));
3550
3551	checkpoint_sm_exclude_entry_arg_t cseea;
3552	cseea.cseea_vd = vd;
3553	cseea.cseea_checkpoint_size = 0;
3554
3555	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
3556	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
3557
3558	VERIFY0(space_map_iterate(checkpoint_sm,
3559	    space_map_length(checkpoint_sm),
3560	    checkpoint_sm_exclude_entry_cb, &cseea));
3561	space_map_close(checkpoint_sm);
3562
3563	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
3564}
3565
3566static void
3567zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
3568{
3569	ASSERT(!dump_opt['L']);
3570
3571	vdev_t *rvd = spa->spa_root_vdev;
3572	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3573		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
3574		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
3575	}
3576}
3577
3578static int
3579count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
3580    uint64_t txg, void *arg)
3581{
3582	int64_t *ualloc_space = arg;
3583	uint64_t offset = sme->sme_offset;
3584	uint64_t vdev_id = sme->sme_vdev;
3585
3586	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
3587	if (!vdev_is_concrete(vd))
3588		return (0);
3589
3590	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3591	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
3592
3593	if (txg < metaslab_unflushed_txg(ms))
3594		return (0);
3595
3596	if (sme->sme_type == SM_ALLOC)
3597		*ualloc_space += sme->sme_run;
3598	else
3599		*ualloc_space -= sme->sme_run;
3600
3601	return (0);
3602}
3603
3604static int64_t
3605get_unflushed_alloc_space(spa_t *spa)
3606{
3607	if (dump_opt['L'])
3608		return (0);
3609
3610	int64_t ualloc_space = 0;
3611	iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
3612	    &ualloc_space);
3613	return (ualloc_space);
3614}
3615
3616static int
3617load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
3618{
3619	maptype_t *uic_maptype = arg;
3620	uint64_t offset = sme->sme_offset;
3621	uint64_t size = sme->sme_run;
3622	uint64_t vdev_id = sme->sme_vdev;
3623	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
3624
3625	/* skip indirect vdevs */
3626	if (!vdev_is_concrete(vd))
3627		return (0);
3628
3629	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3630
3631	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
3632	ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);
3633
3634	if (txg < metaslab_unflushed_txg(ms))
3635		return (0);
3636
3637	if (*uic_maptype == sme->sme_type)
3638		range_tree_add(ms->ms_allocatable, offset, size);
3639	else
3640		range_tree_remove(ms->ms_allocatable, offset, size);
3641
3642	return (0);
3643}
3644
3645static void
3646load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
3647{
3648	iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
3649}
3650
3651static void
3652load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
3653{
3654	vdev_t *rvd = spa->spa_root_vdev;
3655	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
3656		vdev_t *vd = rvd->vdev_child[i];
3657
3658		ASSERT3U(i, ==, vd->vdev_id);
3659
3660		if (vd->vdev_ops == &vdev_indirect_ops)
3661			continue;
3662
3663		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3664			metaslab_t *msp = vd->vdev_ms[m];
3665
3666			(void) fprintf(stderr,
3667			    "\rloading concrete vdev %llu, "
3668			    "metaslab %llu of %llu ...",
3669			    (longlong_t)vd->vdev_id,
3670			    (longlong_t)msp->ms_id,
3671			    (longlong_t)vd->vdev_ms_count);
3672
3673			mutex_enter(&msp->ms_lock);
3674			range_tree_vacate(msp->ms_allocatable, NULL, NULL);
3675
3676			/*
3677			 * We don't want to spend the CPU manipulating the
3678			 * size-ordered tree, so clear the range_tree ops.
3679			 */
3680			msp->ms_allocatable->rt_ops = NULL;
3681
3682			if (msp->ms_sm != NULL) {
3683				VERIFY0(space_map_load(msp->ms_sm,
3684				    msp->ms_allocatable, maptype));
3685			}
3686			if (!msp->ms_loaded)
3687				msp->ms_loaded = B_TRUE;
3688			mutex_exit(&msp->ms_lock);
3689		}
3690	}
3691
3692	load_unflushed_to_ms_allocatables(spa, maptype);
3693}
3694
3695/*
3696 * vm_idxp is an in-out parameter which (for indirect vdevs) is the
3697 * index in vim_entries that has the first entry in this metaslab.
3698 * On return, it will be set to the first entry after this metaslab.
3699 */
3700static void
3701load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
3702    uint64_t *vim_idxp)
3703{
3704	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3705
3706	mutex_enter(&msp->ms_lock);
3707	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
3708
3709	/*
3710	 * We don't want to spend the CPU manipulating the
3711	 * size-ordered tree, so clear the range_tree ops.
3712	 */
3713	msp->ms_allocatable->rt_ops = NULL;
3714
3715	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
3716	    (*vim_idxp)++) {
3717		vdev_indirect_mapping_entry_phys_t *vimep =
3718		    &vim->vim_entries[*vim_idxp];
3719		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
3720		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
3721		ASSERT3U(ent_offset, >=, msp->ms_start);
3722		if (ent_offset >= msp->ms_start + msp->ms_size)
3723			break;
3724
3725		/*
3726		 * Mappings do not cross metaslab boundaries,
3727		 * because we create them by walking the metaslabs.
3728		 */
3729		ASSERT3U(ent_offset + ent_len, <=,
3730		    msp->ms_start + msp->ms_size);
3731		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
3732	}
3733
3734	if (!msp->ms_loaded)
3735		msp->ms_loaded = B_TRUE;
3736	mutex_exit(&msp->ms_lock);
3737}
3738
3739static void
3740zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
3741{
3742	ASSERT(!dump_opt['L']);
3743
3744	vdev_t *rvd = spa->spa_root_vdev;
3745	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3746		vdev_t *vd = rvd->vdev_child[c];
3747
3748		ASSERT3U(c, ==, vd->vdev_id);
3749
3750		if (vd->vdev_ops != &vdev_indirect_ops)
3751			continue;
3752
3753		/*
3754		 * Note: we don't check for mapping leaks on
3755		 * removing vdevs because their ms_allocatable's
3756		 * are used to look for leaks in allocated space.
3757		 */
3758		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
3759
3760		/*
3761		 * Normally, indirect vdevs don't have any
3762		 * metaslabs.  We want to set them up for
3763		 * zio_claim().
3764		 */
3765		VERIFY0(vdev_metaslab_init(vd, 0));
3766
3767		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3768		uint64_t vim_idx = 0;
3769		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3770
3771			(void) fprintf(stderr,
3772			    "\rloading indirect vdev %llu, "
3773			    "metaslab %llu of %llu ...",
3774			    (longlong_t)vd->vdev_id,
3775			    (longlong_t)vd->vdev_ms[m]->ms_id,
3776			    (longlong_t)vd->vdev_ms_count);
3777
3778			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
3779			    &vim_idx);
3780		}
3781		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
3782	}
3783}
3784
3785static void
3786zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
3787{
3788	zcb->zcb_spa = spa;
3789
3790	if (dump_opt['L'])
3791		return;
3792
3793	dsl_pool_t *dp = spa->spa_dsl_pool;
3794	vdev_t *rvd = spa->spa_root_vdev;
3795
3796	/*
3797	 * We are going to be changing the meaning of the metaslab's
3798	 * ms_allocatable.  Ensure that the allocator doesn't try to
3799	 * use the tree.
3800	 */
3801	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
3802	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
3803
3804	zcb->zcb_vd_obsolete_counts =
3805	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
3806	    UMEM_NOFAIL);
3807
3808	/*
3809	 * For leak detection, we overload the ms_allocatable trees
3810	 * to contain allocated segments instead of free segments.
3811	 * As a result, we can't use the normal metaslab_load/unload
3812	 * interfaces.
3813	 */
3814	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
3815	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
3816
3817	/*
3818	 * On load_concrete_ms_allocatable_trees() we loaded all the
3819	 * allocated entries from the ms_sm to the ms_allocatable for
3820	 * each metaslab. If the pool has a checkpoint or is in the
3821	 * middle of discarding a checkpoint, some of these blocks
3822	 * may have been freed but their ms_sm may not have been
3823	 * updated because they are referenced by the checkpoint. In
3824	 * order to avoid false-positives during leak-detection, we
3825	 * go through the vdev's checkpoint space map and exclude all
3826	 * its entries from their relevant ms_allocatable.
3827	 *
3828	 * We also aggregate the space held by the checkpoint and add
3829	 * it to zcb_checkpoint_size.
3830	 *
3831	 * Note that at this point we are also verifying that all the
3832	 * entries on the checkpoint_sm are marked as allocated in
3833	 * the ms_sm of their relevant metaslab.
3834	 * [see comment in checkpoint_sm_exclude_entry_cb()]
3835	 */
3836	zdb_leak_init_exclude_checkpoint(spa, zcb);
3837	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
3838
3839	/* for cleaner progress output */
3840	(void) fprintf(stderr, "\n");
3841
3842	if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
3843		ASSERT(spa_feature_is_enabled(spa,
3844		    SPA_FEATURE_DEVICE_REMOVAL));
3845		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
3846		    increment_indirect_mapping_cb, zcb, NULL);
3847	}
3848
3849	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3850	zdb_ddt_leak_init(spa, zcb);
3851	spa_config_exit(spa, SCL_CONFIG, FTAG);
3852}
3853
3854static boolean_t
3855zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
3856{
3857	boolean_t leaks = B_FALSE;
3858	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3859	uint64_t total_leaked = 0;
3860
3861	ASSERT(vim != NULL);
3862
3863	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
3864		vdev_indirect_mapping_entry_phys_t *vimep =
3865		    &vim->vim_entries[i];
3866		uint64_t obsolete_bytes = 0;
3867		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
3868		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3869
3870		/*
3871		 * This is not very efficient but it's easy to
3872		 * verify correctness.
3873		 */
3874		for (uint64_t inner_offset = 0;
3875		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
3876		    inner_offset += 1 << vd->vdev_ashift) {
3877			if (range_tree_contains(msp->ms_allocatable,
3878			    offset + inner_offset, 1 << vd->vdev_ashift)) {
3879				obsolete_bytes += 1 << vd->vdev_ashift;
3880			}
3881		}
3882
3883		int64_t bytes_leaked = obsolete_bytes -
3884		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
3885		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
3886		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
3887		if (bytes_leaked != 0 &&
3888		    (vdev_obsolete_counts_are_precise(vd) ||
3889		    dump_opt['d'] >= 5)) {
3890			(void) printf("obsolete indirect mapping count "
3891			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
3892			    (u_longlong_t)vd->vdev_id,
3893			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
3894			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
3895			    (u_longlong_t)bytes_leaked);
3896		}
3897		total_leaked += ABS(bytes_leaked);
3898	}
3899
3900	if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) {
3901		int pct_leaked = total_leaked * 100 /
3902		    vdev_indirect_mapping_bytes_mapped(vim);
3903		(void) printf("cannot verify obsolete indirect mapping "
3904		    "counts of vdev %llu because precise feature was not "
3905		    "enabled when it was removed: %d%% (%llx bytes) of mapping"
3906		    "unreferenced\n",
3907		    (u_longlong_t)vd->vdev_id, pct_leaked,
3908		    (u_longlong_t)total_leaked);
3909	} else if (total_leaked > 0) {
3910		(void) printf("obsolete indirect mapping count mismatch "
3911		    "for vdev %llu -- %llx total bytes mismatched\n",
3912		    (u_longlong_t)vd->vdev_id,
3913		    (u_longlong_t)total_leaked);
3914		leaks |= B_TRUE;
3915	}
3916
3917	vdev_indirect_mapping_free_obsolete_counts(vim,
3918	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
3919	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
3920
3921	return (leaks);
3922}
3923
3924static boolean_t
3925zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
3926{
3927	if (dump_opt['L'])
3928		return (B_FALSE);
3929
3930	boolean_t leaks = B_FALSE;
3931
3932	vdev_t *rvd = spa->spa_root_vdev;
3933	for (unsigned c = 0; c < rvd->vdev_children; c++) {
3934		vdev_t *vd = rvd->vdev_child[c];
3935#if DEBUG
3936		metaslab_group_t *mg = vd->vdev_mg;
3937#endif
3938
3939		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
3940			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
3941		}
3942
3943		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3944			metaslab_t *msp = vd->vdev_ms[m];
3945			ASSERT3P(mg, ==, msp->ms_group);
3946
3947			/*
3948			 * ms_allocatable has been overloaded
3949			 * to contain allocated segments. Now that
3950			 * we finished traversing all blocks, any
3951			 * block that remains in the ms_allocatable
3952			 * represents an allocated block that we
3953			 * did not claim during the traversal.
3954			 * Claimed blocks would have been removed
3955			 * from the ms_allocatable.  For indirect
3956			 * vdevs, space remaining in the tree
3957			 * represents parts of the mapping that are
3958			 * not referenced, which is not a bug.
3959			 */
3960			if (vd->vdev_ops == &vdev_indirect_ops) {
3961				range_tree_vacate(msp->ms_allocatable,
3962				    NULL, NULL);
3963			} else {
3964				range_tree_vacate(msp->ms_allocatable,
3965				    zdb_leak, vd);
3966			}
3967			if (msp->ms_loaded) {
3968				msp->ms_loaded = B_FALSE;
3969			}
3970		}
3971
3972	}
3973
3974	umem_free(zcb->zcb_vd_obsolete_counts,
3975	    rvd->vdev_children * sizeof (uint32_t *));
3976	zcb->zcb_vd_obsolete_counts = NULL;
3977
3978	return (leaks);
3979}
3980
3981/* ARGSUSED */
3982static int
3983count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
3984{
3985	zdb_cb_t *zcb = arg;
3986
3987	if (dump_opt['b'] >= 5) {
3988		char blkbuf[BP_SPRINTF_LEN];
3989		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3990		(void) printf("[%s] %s\n",
3991		    "deferred free", blkbuf);
3992	}
3993	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
3994	return (0);
3995}
3996
3997static int
3998dump_block_stats(spa_t *spa)
3999{
4000	zdb_cb_t zcb;
4001	zdb_blkstats_t *zb, *tzb;
4002	uint64_t norm_alloc, norm_space, total_alloc, total_found;
4003	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
4004	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
4005	boolean_t leaks = B_FALSE;
4006	int err;
4007
4008	bzero(&zcb, sizeof (zcb));
4009	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
4010	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
4011	    (dump_opt['c'] == 1) ? "metadata " : "",
4012	    dump_opt['c'] ? "checksums " : "",
4013	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
4014	    !dump_opt['L'] ? "nothing leaked " : "");
4015
4016	/*
4017	 * When leak detection is enabled we load all space maps as SM_ALLOC
4018	 * maps, then traverse the pool claiming each block we discover. If
4019	 * the pool is perfectly consistent, the segment trees will be empty
4020	 * when we're done. Anything left over is a leak; any block we can't
4021	 * claim (because it's not part of any space map) is a double
4022	 * allocation, reference to a freed block, or an unclaimed log block.
4023	 *
4024	 * When leak detection is disabled (-L option) we still traverse the
4025	 * pool claiming each block we discover, but we skip opening any space
4026	 * maps.
4027	 */
4028	bzero(&zcb, sizeof (zdb_cb_t));
4029	zdb_leak_init(spa, &zcb);
4030
4031	/*
4032	 * If there's a deferred-free bplist, process that first.
4033	 */
4034	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
4035	    count_block_cb, &zcb, NULL);
4036
4037	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
4038		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
4039		    count_block_cb, &zcb, NULL);
4040	}
4041
4042	zdb_claim_removing(spa, &zcb);
4043
4044	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
4045		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
4046		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
4047		    &zcb, NULL));
4048	}
4049
4050	if (dump_opt['c'] > 1)
4051		flags |= TRAVERSE_PREFETCH_DATA;
4052
4053	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
4054	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
4055	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
4056	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
4057	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
4058
4059	/*
4060	 * If we've traversed the data blocks then we need to wait for those
4061	 * I/Os to complete. We leverage "The Godfather" zio to wait on
4062	 * all async I/Os to complete.
4063	 */
4064	if (dump_opt['c']) {
4065		for (int i = 0; i < max_ncpus; i++) {
4066			(void) zio_wait(spa->spa_async_zio_root[i]);
4067			spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
4068			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
4069			    ZIO_FLAG_GODFATHER);
4070		}
4071	}
4072
4073	/*
4074	 * Done after zio_wait() since zcb_haderrors is modified in
4075	 * zdb_blkptr_done()
4076	 */
4077	zcb.zcb_haderrors |= err;
4078
4079	if (zcb.zcb_haderrors) {
4080		(void) printf("\nError counts:\n\n");
4081		(void) printf("\t%5s  %s\n", "errno", "count");
4082		for (int e = 0; e < 256; e++) {
4083			if (zcb.zcb_errors[e] != 0) {
4084				(void) printf("\t%5d  %llu\n",
4085				    e, (u_longlong_t)zcb.zcb_errors[e]);
4086			}
4087		}
4088	}
4089
4090	/*
4091	 * Report any leaked segments.
4092	 */
4093	leaks |= zdb_leak_fini(spa, &zcb);
4094
4095	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
4096
4097	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
4098	norm_space = metaslab_class_get_space(spa_normal_class(spa));
4099
4100	total_alloc = norm_alloc +
4101	    metaslab_class_get_alloc(spa_log_class(spa)) +
4102	    metaslab_class_get_alloc(spa_special_class(spa)) +
4103	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
4104	    get_unflushed_alloc_space(spa);
4105	total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
4106	    zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
4107
4108	if (total_found == total_alloc && !dump_opt['L']) {
4109		(void) printf("\n\tNo leaks (block sum matches space"
4110		    " maps exactly)\n");
4111	} else if (!dump_opt['L']) {
4112		(void) printf("block traversal size %llu != alloc %llu "
4113		    "(%s %lld)\n",
4114		    (u_longlong_t)total_found,
4115		    (u_longlong_t)total_alloc,
4116		    (dump_opt['L']) ? "unreachable" : "leaked",
4117		    (longlong_t)(total_alloc - total_found));
4118		leaks = B_TRUE;
4119	}
4120
4121	if (tzb->zb_count == 0)
4122		return (2);
4123
4124	(void) printf("\n");
4125	(void) printf("\t%-16s %14llu\n", "bp count:",
4126	    (u_longlong_t)tzb->zb_count);
4127	(void) printf("\t%-16s %14llu\n", "ganged count:",
4128	    (longlong_t)tzb->zb_gangs);
4129	(void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
4130	    (u_longlong_t)tzb->zb_lsize,
4131	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
4132	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
4133	    "bp physical:", (u_longlong_t)tzb->zb_psize,
4134	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
4135	    (double)tzb->zb_lsize / tzb->zb_psize);
4136	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
4137	    "bp allocated:", (u_longlong_t)tzb->zb_asize,
4138	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
4139	    (double)tzb->zb_lsize / tzb->zb_asize);
4140	(void) printf("\t%-16s %14llu    ref>1: %6llu   deduplication: %6.2f\n",
4141	    "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize,
4142	    (u_longlong_t)zcb.zcb_dedup_blocks,
4143	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
4144	(void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
4145	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
4146
4147	if (spa_special_class(spa)->mc_rotor != NULL) {
4148		uint64_t alloc = metaslab_class_get_alloc(
4149		    spa_special_class(spa));
4150		uint64_t space = metaslab_class_get_space(
4151		    spa_special_class(spa));
4152
4153		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
4154		    "Special class", (u_longlong_t)alloc,
4155		    100.0 * alloc / space);
4156	}
4157
4158	if (spa_dedup_class(spa)->mc_rotor != NULL) {
4159		uint64_t alloc = metaslab_class_get_alloc(
4160		    spa_dedup_class(spa));
4161		uint64_t space = metaslab_class_get_space(
4162		    spa_dedup_class(spa));
4163
4164		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
4165		    "Dedup class", (u_longlong_t)alloc,
4166		    100.0 * alloc / space);
4167	}
4168
4169	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
4170		if (zcb.zcb_embedded_blocks[i] == 0)
4171			continue;
4172		(void) printf("\n");
4173		(void) printf("\tadditional, non-pointer bps of type %u: "
4174		    "%10llu\n",
4175		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
4176
4177		if (dump_opt['b'] >= 3) {
4178			(void) printf("\t number of (compressed) bytes:  "
4179			    "number of bps\n");
4180			dump_histogram(zcb.zcb_embedded_histogram[i],
4181			    sizeof (zcb.zcb_embedded_histogram[i]) /
4182			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
4183		}
4184	}
4185
4186	if (tzb->zb_ditto_samevdev != 0) {
4187		(void) printf("\tDittoed blocks on same vdev: %llu\n",
4188		    (longlong_t)tzb->zb_ditto_samevdev);
4189	}
4190	if (tzb->zb_ditto_same_ms != 0) {
4191		(void) printf("\tDittoed blocks in same metaslab: %llu\n",
4192		    (longlong_t)tzb->zb_ditto_same_ms);
4193	}
4194
4195	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
4196		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
4197		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
4198
4199		if (vim == NULL) {
4200			continue;
4201		}
4202
4203		char mem[32];
4204		zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
4205		    mem, vdev_indirect_mapping_size(vim));
4206
4207		(void) printf("\tindirect vdev id %llu has %llu segments "
4208		    "(%s in memory)\n",
4209		    (longlong_t)vd->vdev_id,
4210		    (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
4211	}
4212
4213	if (dump_opt['b'] >= 2) {
4214		int l, t, level;
4215		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
4216		    "\t  avg\t comp\t%%Total\tType\n");
4217
4218		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
4219			char csize[32], lsize[32], psize[32], asize[32];
4220			char avg[32], gang[32];
4221			const char *typename;
4222
4223			/* make sure nicenum has enough space */
4224			CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
4225			CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
4226			CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
4227			CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
4228			CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
4229			CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
4230
4231			if (t < DMU_OT_NUMTYPES)
4232				typename = dmu_ot[t].ot_name;
4233			else
4234				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
4235
4236			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
4237				(void) printf("%6s\t%5s\t%5s\t%5s"
4238				    "\t%5s\t%5s\t%6s\t%s\n",
4239				    "-",
4240				    "-",
4241				    "-",
4242				    "-",
4243				    "-",
4244				    "-",
4245				    "-",
4246				    typename);
4247				continue;
4248			}
4249
4250			for (l = ZB_TOTAL - 1; l >= -1; l--) {
4251				level = (l == -1 ? ZB_TOTAL : l);
4252				zb = &zcb.zcb_type[level][t];
4253
4254				if (zb->zb_asize == 0)
4255					continue;
4256
4257				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
4258					continue;
4259
4260				if (level == 0 && zb->zb_asize ==
4261				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
4262					continue;
4263
4264				zdb_nicenum(zb->zb_count, csize,
4265				    sizeof (csize));
4266				zdb_nicenum(zb->zb_lsize, lsize,
4267				    sizeof (lsize));
4268				zdb_nicenum(zb->zb_psize, psize,
4269				    sizeof (psize));
4270				zdb_nicenum(zb->zb_asize, asize,
4271				    sizeof (asize));
4272				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
4273				    sizeof (avg));
4274				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
4275
4276				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
4277				    "\t%5.2f\t%6.2f\t",
4278				    csize, lsize, psize, asize, avg,
4279				    (double)zb->zb_lsize / zb->zb_psize,
4280				    100.0 * zb->zb_asize / tzb->zb_asize);
4281
4282				if (level == ZB_TOTAL)
4283					(void) printf("%s\n", typename);
4284				else
4285					(void) printf("    L%d %s\n",
4286					    level, typename);
4287
4288				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
4289					(void) printf("\t number of ganged "
4290					    "blocks: %s\n", gang);
4291				}
4292
4293				if (dump_opt['b'] >= 4) {
4294					(void) printf("psize "
4295					    "(in 512-byte sectors): "
4296					    "number of blocks\n");
4297					dump_histogram(zb->zb_psize_histogram,
4298					    PSIZE_HISTO_SIZE, 0);
4299				}
4300			}
4301		}
4302	}
4303
4304	(void) printf("\n");
4305
4306	if (leaks)
4307		return (2);
4308
4309	if (zcb.zcb_haderrors)
4310		return (3);
4311
4312	return (0);
4313}
4314
4315typedef struct zdb_ddt_entry {
4316	ddt_key_t	zdde_key;
4317	uint64_t	zdde_ref_blocks;
4318	uint64_t	zdde_ref_lsize;
4319	uint64_t	zdde_ref_psize;
4320	uint64_t	zdde_ref_dsize;
4321	avl_node_t	zdde_node;
4322} zdb_ddt_entry_t;
4323
4324/* ARGSUSED */
4325static int
4326zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
4327    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
4328{
4329	avl_tree_t *t = arg;
4330	avl_index_t where;
4331	zdb_ddt_entry_t *zdde, zdde_search;
4332
4333	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
4334		return (0);
4335
4336	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
4337		(void) printf("traversing objset %llu, %llu objects, "
4338		    "%lu blocks so far\n",
4339		    (u_longlong_t)zb->zb_objset,
4340		    (u_longlong_t)BP_GET_FILL(bp),
4341		    avl_numnodes(t));
4342	}
4343
4344	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
4345	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
4346		return (0);
4347
4348	ddt_key_fill(&zdde_search.zdde_key, bp);
4349
4350	zdde = avl_find(t, &zdde_search, &where);
4351
4352	if (zdde == NULL) {
4353		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
4354		zdde->zdde_key = zdde_search.zdde_key;
4355		avl_insert(t, zdde, where);
4356	}
4357
4358	zdde->zdde_ref_blocks += 1;
4359	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
4360	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
4361	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
4362
4363	return (0);
4364}
4365
4366static void
4367dump_simulated_ddt(spa_t *spa)
4368{
4369	avl_tree_t t;
4370	void *cookie = NULL;
4371	zdb_ddt_entry_t *zdde;
4372	ddt_histogram_t ddh_total;
4373	ddt_stat_t dds_total;
4374
4375	bzero(&ddh_total, sizeof (ddh_total));
4376	bzero(&dds_total, sizeof (dds_total));
4377	avl_create(&t, ddt_entry_compare,
4378	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
4379
4380	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4381
4382	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
4383	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);
4384
4385	spa_config_exit(spa, SCL_CONFIG, FTAG);
4386
4387	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
4388		ddt_stat_t dds;
4389		uint64_t refcnt = zdde->zdde_ref_blocks;
4390		ASSERT(refcnt != 0);
4391
4392		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
4393		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
4394		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
4395		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
4396
4397		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
4398		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
4399		dds.dds_ref_psize = zdde->zdde_ref_psize;
4400		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
4401
4402		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
4403		    &dds, 0);
4404
4405		umem_free(zdde, sizeof (*zdde));
4406	}
4407
4408	avl_destroy(&t);
4409
4410	ddt_histogram_stat(&dds_total, &ddh_total);
4411
4412	(void) printf("Simulated DDT histogram:\n");
4413
4414	zpool_dump_ddt(&dds_total, &ddh_total);
4415
4416	dump_dedup_ratio(&dds_total);
4417}
4418
4419static int
4420verify_device_removal_feature_counts(spa_t *spa)
4421{
4422	uint64_t dr_feature_refcount = 0;
4423	uint64_t oc_feature_refcount = 0;
4424	uint64_t indirect_vdev_count = 0;
4425	uint64_t precise_vdev_count = 0;
4426	uint64_t obsolete_counts_object_count = 0;
4427	uint64_t obsolete_sm_count = 0;
4428	uint64_t obsolete_counts_count = 0;
4429	uint64_t scip_count = 0;
4430	uint64_t obsolete_bpobj_count = 0;
4431	int ret = 0;
4432
4433	spa_condensing_indirect_phys_t *scip =
4434	    &spa->spa_condensing_indirect_phys;
4435	if (scip->scip_next_mapping_object != 0) {
4436		vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
4437		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
4438		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
4439
4440		(void) printf("Condensing indirect vdev %llu: new mapping "
4441		    "object %llu, prev obsolete sm %llu\n",
4442		    (u_longlong_t)scip->scip_vdev,
4443		    (u_longlong_t)scip->scip_next_mapping_object,
4444		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
4445		if (scip->scip_prev_obsolete_sm_object != 0) {
4446			space_map_t *prev_obsolete_sm = NULL;
4447			VERIFY0(space_map_open(&prev_obsolete_sm,
4448			    spa->spa_meta_objset,
4449			    scip->scip_prev_obsolete_sm_object,
4450			    0, vd->vdev_asize, 0));
4451			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
4452			(void) printf("\n");
4453			space_map_close(prev_obsolete_sm);
4454		}
4455
4456		scip_count += 2;
4457	}
4458
4459	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
4460		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
4461		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
4462
4463		if (vic->vic_mapping_object != 0) {
4464			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
4465			    vd->vdev_removing);
4466			indirect_vdev_count++;
4467
4468			if (vd->vdev_indirect_mapping->vim_havecounts) {
4469				obsolete_counts_count++;
4470			}
4471		}
4472		if (vdev_obsolete_counts_are_precise(vd)) {
4473			ASSERT(vic->vic_mapping_object != 0);
4474			precise_vdev_count++;
4475		}
4476		if (vdev_obsolete_sm_object(vd) != 0) {
4477			ASSERT(vic->vic_mapping_object != 0);
4478			obsolete_sm_count++;
4479		}
4480	}
4481
4482	(void) feature_get_refcount(spa,
4483	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
4484	    &dr_feature_refcount);
4485	(void) feature_get_refcount(spa,
4486	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
4487	    &oc_feature_refcount);
4488
4489	if (dr_feature_refcount != indirect_vdev_count) {
4490		ret = 1;
4491		(void) printf("Number of indirect vdevs (%llu) " \
4492		    "does not match feature count (%llu)\n",
4493		    (u_longlong_t)indirect_vdev_count,
4494		    (u_longlong_t)dr_feature_refcount);
4495	} else {
4496		(void) printf("Verified device_removal feature refcount " \
4497		    "of %llu is correct\n",
4498		    (u_longlong_t)dr_feature_refcount);
4499	}
4500
4501	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
4502	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
4503		obsolete_bpobj_count++;
4504	}
4505
4506
4507	obsolete_counts_object_count = precise_vdev_count;
4508	obsolete_counts_object_count += obsolete_sm_count;
4509	obsolete_counts_object_count += obsolete_counts_count;
4510	obsolete_counts_object_count += scip_count;
4511	obsolete_counts_object_count += obsolete_bpobj_count;
4512	obsolete_counts_object_count += remap_deadlist_count;
4513
4514	if (oc_feature_refcount != obsolete_counts_object_count) {
4515		ret = 1;
4516		(void) printf("Number of obsolete counts objects (%llu) " \
4517		    "does not match feature count (%llu)\n",
4518		    (u_longlong_t)obsolete_counts_object_count,
4519		    (u_longlong_t)oc_feature_refcount);
4520		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
4521		    "ob:%llu rd:%llu\n",
4522		    (u_longlong_t)precise_vdev_count,
4523		    (u_longlong_t)obsolete_sm_count,
4524		    (u_longlong_t)obsolete_counts_count,
4525		    (u_longlong_t)scip_count,
4526		    (u_longlong_t)obsolete_bpobj_count,
4527		    (u_longlong_t)remap_deadlist_count);
4528	} else {
4529		(void) printf("Verified indirect_refcount feature refcount " \
4530		    "of %llu is correct\n",
4531		    (u_longlong_t)oc_feature_refcount);
4532	}
4533	return (ret);
4534}
4535
4536static void
4537zdb_set_skip_mmp(char *target)
4538{
4539	spa_t *spa;
4540
4541	/*
4542	 * Disable the activity check to allow examination of
4543	 * active pools.
4544	 */
4545	mutex_enter(&spa_namespace_lock);
4546	if ((spa = spa_lookup(target)) != NULL) {
4547		spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
4548	}
4549	mutex_exit(&spa_namespace_lock);
4550}
4551
4552#define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
4553/*
4554 * Import the checkpointed state of the pool specified by the target
4555 * parameter as readonly. The function also accepts a pool config
4556 * as an optional parameter, else it attempts to infer the config by
4557 * the name of the target pool.
4558 *
4559 * Note that the checkpointed state's pool name will be the name of
4560 * the original pool with the above suffix appened to it. In addition,
4561 * if the target is not a pool name (e.g. a path to a dataset) then
4562 * the new_path parameter is populated with the updated path to
4563 * reflect the fact that we are looking into the checkpointed state.
4564 *
4565 * The function returns a newly-allocated copy of the name of the
4566 * pool containing the checkpointed state. When this copy is no
4567 * longer needed it should be freed with free(3C). Same thing
4568 * applies to the new_path parameter if allocated.
4569 */
4570static char *
4571import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
4572{
4573	int error = 0;
4574	char *poolname, *bogus_name;
4575
4576	/* If the target is not a pool, the extract the pool name */
4577	char *path_start = strchr(target, '/');
4578	if (path_start != NULL) {
4579		size_t poolname_len = path_start - target;
4580		poolname = strndup(target, poolname_len);
4581	} else {
4582		poolname = target;
4583	}
4584
4585	if (cfg == NULL) {
4586		zdb_set_skip_mmp(poolname);
4587		error = spa_get_stats(poolname, &cfg, NULL, 0);
4588		if (error != 0) {
4589			fatal("Tried to read config of pool \"%s\" but "
4590			    "spa_get_stats() failed with error %d\n",
4591			    poolname, error);
4592		}
4593	}
4594
4595	(void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX);
4596	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
4597
4598	error = spa_import(bogus_name, cfg, NULL,
4599	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
4600	    ZFS_IMPORT_SKIP_MMP);
4601	if (error != 0) {
4602		fatal("Tried to import pool \"%s\" but spa_import() failed "
4603		    "with error %d\n", bogus_name, error);
4604	}
4605
4606	if (new_path != NULL && path_start != NULL)
4607		(void) asprintf(new_path, "%s%s", bogus_name, path_start);
4608
4609	if (target != poolname)
4610		free(poolname);
4611
4612	return (bogus_name);
4613}
4614
4615typedef struct verify_checkpoint_sm_entry_cb_arg {
4616	vdev_t *vcsec_vd;
4617
4618	/* the following fields are only used for printing progress */
4619	uint64_t vcsec_entryid;
4620	uint64_t vcsec_num_entries;
4621} verify_checkpoint_sm_entry_cb_arg_t;
4622
4623#define	ENTRIES_PER_PROGRESS_UPDATE 10000
4624
4625static int
4626verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
4627{
4628	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
4629	vdev_t *vd = vcsec->vcsec_vd;
4630	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
4631	uint64_t end = sme->sme_offset + sme->sme_run;
4632
4633	ASSERT(sme->sme_type == SM_FREE);
4634
4635	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
4636		(void) fprintf(stderr,
4637		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
4638		    (longlong_t)vd->vdev_id,
4639		    (longlong_t)vcsec->vcsec_entryid,
4640		    (longlong_t)vcsec->vcsec_num_entries);
4641	}
4642	vcsec->vcsec_entryid++;
4643
4644	/*
4645	 * See comment in checkpoint_sm_exclude_entry_cb()
4646	 */
4647	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
4648	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
4649
4650	/*
4651	 * The entries in the vdev_checkpoint_sm should be marked as
4652	 * allocated in the checkpointed state of the pool, therefore
4653	 * their respective ms_allocateable trees should not contain them.
4654	 */
4655	mutex_enter(&ms->ms_lock);
4656	range_tree_verify_not_present(ms->ms_allocatable,
4657	    sme->sme_offset, sme->sme_run);
4658	mutex_exit(&ms->ms_lock);
4659
4660	return (0);
4661}
4662
4663/*
4664 * Verify that all segments in the vdev_checkpoint_sm are allocated
4665 * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
4666 * ms_allocatable).
4667 *
4668 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
4669 * each vdev in the current state of the pool to the metaslab space maps
4670 * (ms_sm) of the checkpointed state of the pool.
4671 *
4672 * Note that the function changes the state of the ms_allocatable
4673 * trees of the current spa_t. The entries of these ms_allocatable
4674 * trees are cleared out and then repopulated from with the free
4675 * entries of their respective ms_sm space maps.
4676 */
4677static void
4678verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
4679{
4680	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
4681	vdev_t *current_rvd = current->spa_root_vdev;
4682
4683	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
4684
4685	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
4686		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
4687		vdev_t *current_vd = current_rvd->vdev_child[c];
4688
4689		space_map_t *checkpoint_sm = NULL;
4690		uint64_t checkpoint_sm_obj;
4691
4692		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
4693			/*
4694			 * Since we don't allow device removal in a pool
4695			 * that has a checkpoint, we expect that all removed
4696			 * vdevs were removed from the pool before the
4697			 * checkpoint.
4698			 */
4699			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
4700			continue;
4701		}
4702
4703		/*
4704		 * If the checkpoint space map doesn't exist, then nothing
4705		 * here is checkpointed so there's nothing to verify.
4706		 */
4707		if (current_vd->vdev_top_zap == 0 ||
4708		    zap_contains(spa_meta_objset(current),
4709		    current_vd->vdev_top_zap,
4710		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4711			continue;
4712
4713		VERIFY0(zap_lookup(spa_meta_objset(current),
4714		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4715		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
4716
4717		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
4718		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
4719		    current_vd->vdev_ashift));
4720
4721		verify_checkpoint_sm_entry_cb_arg_t vcsec;
4722		vcsec.vcsec_vd = ckpoint_vd;
4723		vcsec.vcsec_entryid = 0;
4724		vcsec.vcsec_num_entries =
4725		    space_map_length(checkpoint_sm) / sizeof (uint64_t);
4726		VERIFY0(space_map_iterate(checkpoint_sm,
4727		    space_map_length(checkpoint_sm),
4728		    verify_checkpoint_sm_entry_cb, &vcsec));
4729		dump_spacemap(current->spa_meta_objset, checkpoint_sm);
4730		space_map_close(checkpoint_sm);
4731	}
4732
4733	/*
4734	 * If we've added vdevs since we took the checkpoint, ensure
4735	 * that their checkpoint space maps are empty.
4736	 */
4737	if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
4738		for (uint64_t c = ckpoint_rvd->vdev_children;
4739		    c < current_rvd->vdev_children; c++) {
4740			vdev_t *current_vd = current_rvd->vdev_child[c];
4741			ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
4742		}
4743	}
4744
4745	/* for cleaner progress output */
4746	(void) fprintf(stderr, "\n");
4747}
4748
4749/*
4750 * Verifies that all space that's allocated in the checkpoint is
4751 * still allocated in the current version, by checking that everything
4752 * in checkpoint's ms_allocatable (which is actually allocated, not
4753 * allocatable/free) is not present in current's ms_allocatable.
4754 *
4755 * Note that the function changes the state of the ms_allocatable
4756 * trees of both spas when called. The entries of all ms_allocatable
4757 * trees are cleared out and then repopulated from their respective
4758 * ms_sm space maps. In the checkpointed state we load the allocated
4759 * entries, and in the current state we load the free entries.
4760 */
4761static void
4762verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
4763{
4764	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
4765	vdev_t *current_rvd = current->spa_root_vdev;
4766
4767	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
4768	load_concrete_ms_allocatable_trees(current, SM_FREE);
4769
4770	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
4771		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
4772		vdev_t *current_vd = current_rvd->vdev_child[i];
4773
4774		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
4775			/*
4776			 * See comment in verify_checkpoint_vdev_spacemaps()
4777			 */
4778			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
4779			continue;
4780		}
4781
4782		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
4783			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
4784			metaslab_t *current_msp = current_vd->vdev_ms[m];
4785
4786			(void) fprintf(stderr,
4787			    "\rverifying vdev %llu of %llu, "
4788			    "metaslab %llu of %llu ...",
4789			    (longlong_t)current_vd->vdev_id,
4790			    (longlong_t)current_rvd->vdev_children,
4791			    (longlong_t)current_vd->vdev_ms[m]->ms_id,
4792			    (longlong_t)current_vd->vdev_ms_count);
4793
4794			/*
4795			 * We walk through the ms_allocatable trees that
4796			 * are loaded with the allocated blocks from the
4797			 * ms_sm spacemaps of the checkpoint. For each
4798			 * one of these ranges we ensure that none of them
4799			 * exists in the ms_allocatable trees of the
4800			 * current state which are loaded with the ranges
4801			 * that are currently free.
4802			 *
4803			 * This way we ensure that none of the blocks that
4804			 * are part of the checkpoint were freed by mistake.
4805			 */
4806			range_tree_walk(ckpoint_msp->ms_allocatable,
4807			    (range_tree_func_t *)range_tree_verify_not_present,
4808			    current_msp->ms_allocatable);
4809		}
4810	}
4811
4812	/* for cleaner progress output */
4813	(void) fprintf(stderr, "\n");
4814}
4815
4816static void
4817verify_checkpoint_blocks(spa_t *spa)
4818{
4819	ASSERT(!dump_opt['L']);
4820
4821	spa_t *checkpoint_spa;
4822	char *checkpoint_pool;
4823	nvlist_t *config = NULL;
4824	int error = 0;
4825
4826	/*
4827	 * We import the checkpointed state of the pool (under a different
4828	 * name) so we can do verification on it against the current state
4829	 * of the pool.
4830	 */
4831	checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
4832	    NULL);
4833	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
4834
4835	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
4836	if (error != 0) {
4837		fatal("Tried to open pool \"%s\" but spa_open() failed with "
4838		    "error %d\n", checkpoint_pool, error);
4839	}
4840
4841	/*
4842	 * Ensure that ranges in the checkpoint space maps of each vdev
4843	 * are allocated according to the checkpointed state's metaslab
4844	 * space maps.
4845	 */
4846	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
4847
4848	/*
4849	 * Ensure that allocated ranges in the checkpoint's metaslab
4850	 * space maps remain allocated in the metaslab space maps of
4851	 * the current state.
4852	 */
4853	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
4854
4855	/*
4856	 * Once we are done, we get rid of the checkpointed state.
4857	 */
4858	spa_close(checkpoint_spa, FTAG);
4859	free(checkpoint_pool);
4860}
4861
4862static void
4863dump_leftover_checkpoint_blocks(spa_t *spa)
4864{
4865	vdev_t *rvd = spa->spa_root_vdev;
4866
4867	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
4868		vdev_t *vd = rvd->vdev_child[i];
4869
4870		space_map_t *checkpoint_sm = NULL;
4871		uint64_t checkpoint_sm_obj;
4872
4873		if (vd->vdev_top_zap == 0)
4874			continue;
4875
4876		if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
4877		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4878			continue;
4879
4880		VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
4881		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4882		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
4883
4884		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
4885		    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
4886		dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
4887		space_map_close(checkpoint_sm);
4888	}
4889}
4890
4891static int
4892verify_checkpoint(spa_t *spa)
4893{
4894	uberblock_t checkpoint;
4895	int error;
4896
4897	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
4898		return (0);
4899
4900	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4901	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
4902	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
4903
4904	if (error == ENOENT && !dump_opt['L']) {
4905		/*
4906		 * If the feature is active but the uberblock is missing
4907		 * then we must be in the middle of discarding the
4908		 * checkpoint.
4909		 */
4910		(void) printf("\nPartially discarded checkpoint "
4911		    "state found:\n");
4912		dump_leftover_checkpoint_blocks(spa);
4913		return (0);
4914	} else if (error != 0) {
4915		(void) printf("lookup error %d when looking for "
4916		    "checkpointed uberblock in MOS\n", error);
4917		return (error);
4918	}
4919	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
4920
4921	if (checkpoint.ub_checkpoint_txg == 0) {
4922		(void) printf("\nub_checkpoint_txg not set in checkpointed "
4923		    "uberblock\n");
4924		error = 3;
4925	}
4926
4927	if (error == 0 && !dump_opt['L'])
4928		verify_checkpoint_blocks(spa);
4929
4930	return (error);
4931}
4932
4933/* ARGSUSED */
4934static void
4935mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
4936{
4937	for (uint64_t i = start; i < size; i++) {
4938		(void) printf("MOS object %llu referenced but not allocated\n",
4939		    (u_longlong_t)i);
4940	}
4941}
4942
4943static range_tree_t *mos_refd_objs;
4944
4945static void
4946mos_obj_refd(uint64_t obj)
4947{
4948	if (obj != 0 && mos_refd_objs != NULL)
4949		range_tree_add(mos_refd_objs, obj, 1);
4950}
4951
4952static void
4953mos_leak_vdev_top_zap(vdev_t *vd)
4954{
4955	uint64_t ms_flush_data_obj;
4956
4957	int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
4958	    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
4959	    sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
4960	if (error == ENOENT)
4961		return;
4962	ASSERT0(error);
4963
4964	mos_obj_refd(ms_flush_data_obj);
4965}
4966
4967static void
4968mos_leak_vdev(vdev_t *vd)
4969{
4970	mos_obj_refd(vd->vdev_dtl_object);
4971	mos_obj_refd(vd->vdev_ms_array);
4972	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
4973	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
4974	mos_obj_refd(vd->vdev_leaf_zap);
4975	if (vd->vdev_checkpoint_sm != NULL)
4976		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
4977	if (vd->vdev_indirect_mapping != NULL) {
4978		mos_obj_refd(vd->vdev_indirect_mapping->
4979		    vim_phys->vimp_counts_object);
4980	}
4981	if (vd->vdev_obsolete_sm != NULL)
4982		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
4983
4984	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
4985		metaslab_t *ms = vd->vdev_ms[m];
4986		mos_obj_refd(space_map_object(ms->ms_sm));
4987	}
4988
4989	if (vd->vdev_top_zap != 0) {
4990		mos_obj_refd(vd->vdev_top_zap);
4991		mos_leak_vdev_top_zap(vd);
4992	}
4993
4994	for (uint64_t c = 0; c < vd->vdev_children; c++) {
4995		mos_leak_vdev(vd->vdev_child[c]);
4996	}
4997}
4998
4999static void
5000mos_leak_log_spacemaps(spa_t *spa)
5001{
5002	uint64_t spacemap_zap;
5003
5004	int error = zap_lookup(spa_meta_objset(spa),
5005	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
5006	    sizeof (spacemap_zap), 1, &spacemap_zap);
5007	if (error == ENOENT)
5008		return;
5009	ASSERT0(error);
5010
5011	mos_obj_refd(spacemap_zap);
5012	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
5013	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
5014		mos_obj_refd(sls->sls_sm_obj);
5015}
5016
5017static int
5018dump_mos_leaks(spa_t *spa)
5019{
5020	int rv = 0;
5021	objset_t *mos = spa->spa_meta_objset;
5022	dsl_pool_t *dp = spa->spa_dsl_pool;
5023
5024	/* Visit and mark all referenced objects in the MOS */
5025
5026	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
5027	mos_obj_refd(spa->spa_pool_props_object);
5028	mos_obj_refd(spa->spa_config_object);
5029	mos_obj_refd(spa->spa_ddt_stat_object);
5030	mos_obj_refd(spa->spa_feat_desc_obj);
5031	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
5032	mos_obj_refd(spa->spa_feat_for_read_obj);
5033	mos_obj_refd(spa->spa_feat_for_write_obj);
5034	mos_obj_refd(spa->spa_history);
5035	mos_obj_refd(spa->spa_errlog_last);
5036	mos_obj_refd(spa->spa_errlog_scrub);
5037	mos_obj_refd(spa->spa_all_vdev_zaps);
5038	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
5039	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
5040	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
5041	bpobj_count_refd(&spa->spa_deferred_bpobj);
5042	mos_obj_refd(dp->dp_empty_bpobj);
5043	bpobj_count_refd(&dp->dp_obsolete_bpobj);
5044	bpobj_count_refd(&dp->dp_free_bpobj);
5045	mos_obj_refd(spa->spa_l2cache.sav_object);
5046	mos_obj_refd(spa->spa_spares.sav_object);
5047
5048	if (spa->spa_syncing_log_sm != NULL)
5049		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
5050	mos_leak_log_spacemaps(spa);
5051
5052	mos_obj_refd(spa->spa_condensing_indirect_phys.
5053	    scip_next_mapping_object);
5054	mos_obj_refd(spa->spa_condensing_indirect_phys.
5055	    scip_prev_obsolete_sm_object);
5056	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
5057		vdev_indirect_mapping_t *vim =
5058		    vdev_indirect_mapping_open(mos,
5059		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
5060		mos_obj_refd(vim->vim_phys->vimp_counts_object);
5061		vdev_indirect_mapping_close(vim);
5062	}
5063
5064	if (dp->dp_origin_snap != NULL) {
5065		dsl_dataset_t *ds;
5066
5067		dsl_pool_config_enter(dp, FTAG);
5068		VERIFY0(dsl_dataset_hold_obj(dp,
5069		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
5070		    FTAG, &ds));
5071		count_ds_mos_objects(ds);
5072		dump_deadlist(&ds->ds_deadlist);
5073		dsl_dataset_rele(ds, FTAG);
5074		dsl_pool_config_exit(dp, FTAG);
5075
5076		count_ds_mos_objects(dp->dp_origin_snap);
5077		dump_deadlist(&dp->dp_origin_snap->ds_deadlist);
5078	}
5079	count_dir_mos_objects(dp->dp_mos_dir);
5080	if (dp->dp_free_dir != NULL)
5081		count_dir_mos_objects(dp->dp_free_dir);
5082	if (dp->dp_leak_dir != NULL)
5083		count_dir_mos_objects(dp->dp_leak_dir);
5084
5085	mos_leak_vdev(spa->spa_root_vdev);
5086
5087	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
5088		for (uint64_t type = 0; type < DDT_TYPES; type++) {
5089			for (uint64_t cksum = 0;
5090			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
5091				ddt_t *ddt = spa->spa_ddt[cksum];
5092				mos_obj_refd(ddt->ddt_object[type][class]);
5093			}
5094		}
5095	}
5096
5097	/*
5098	 * Visit all allocated objects and make sure they are referenced.
5099	 */
5100	uint64_t object = 0;
5101	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
5102		if (range_tree_contains(mos_refd_objs, object, 1)) {
5103			range_tree_remove(mos_refd_objs, object, 1);
5104		} else {
5105			dmu_object_info_t doi;
5106			const char *name;
5107			dmu_object_info(mos, object, &doi);
5108			if (doi.doi_type & DMU_OT_NEWTYPE) {
5109				dmu_object_byteswap_t bswap =
5110				    DMU_OT_BYTESWAP(doi.doi_type);
5111				name = dmu_ot_byteswap[bswap].ob_name;
5112			} else {
5113				name = dmu_ot[doi.doi_type].ot_name;
5114			}
5115
5116			(void) printf("MOS object %llu (%s) leaked\n",
5117			    (u_longlong_t)object, name);
5118			rv = 2;
5119		}
5120	}
5121	(void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
5122	if (!range_tree_is_empty(mos_refd_objs))
5123		rv = 2;
5124	range_tree_vacate(mos_refd_objs, NULL, NULL);
5125	range_tree_destroy(mos_refd_objs);
5126	return (rv);
5127}
5128
5129typedef struct log_sm_obsolete_stats_arg {
5130	uint64_t lsos_current_txg;
5131
5132	uint64_t lsos_total_entries;
5133	uint64_t lsos_valid_entries;
5134
5135	uint64_t lsos_sm_entries;
5136	uint64_t lsos_valid_sm_entries;
5137} log_sm_obsolete_stats_arg_t;
5138
5139static int
5140log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
5141    uint64_t txg, void *arg)
5142{
5143	log_sm_obsolete_stats_arg_t *lsos = arg;
5144	uint64_t offset = sme->sme_offset;
5145	uint64_t vdev_id = sme->sme_vdev;
5146
5147	if (lsos->lsos_current_txg == 0) {
5148		/* this is the first log */
5149		lsos->lsos_current_txg = txg;
5150	} else if (lsos->lsos_current_txg < txg) {
5151		/* we just changed log - print stats and reset */
5152		(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
5153		    (u_longlong_t)lsos->lsos_valid_sm_entries,
5154		    (u_longlong_t)lsos->lsos_sm_entries,
5155		    (u_longlong_t)lsos->lsos_current_txg);
5156		lsos->lsos_valid_sm_entries = 0;
5157		lsos->lsos_sm_entries = 0;
5158		lsos->lsos_current_txg = txg;
5159	}
5160	ASSERT3U(lsos->lsos_current_txg, ==, txg);
5161
5162	lsos->lsos_sm_entries++;
5163	lsos->lsos_total_entries++;
5164
5165	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
5166	if (!vdev_is_concrete(vd))
5167		return (0);
5168
5169	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5170	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
5171
5172	if (txg < metaslab_unflushed_txg(ms))
5173		return (0);
5174	lsos->lsos_valid_sm_entries++;
5175	lsos->lsos_valid_entries++;
5176	return (0);
5177}
5178
5179static void
5180dump_log_spacemap_obsolete_stats(spa_t *spa)
5181{
5182	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
5183		return;
5184
5185	log_sm_obsolete_stats_arg_t lsos;
5186	bzero(&lsos, sizeof (lsos));
5187
5188	(void) printf("Log Space Map Obsolete Entry Statistics:\n");
5189
5190	iterate_through_spacemap_logs(spa,
5191	    log_spacemap_obsolete_stats_cb, &lsos);
5192
5193	/* print stats for latest log */
5194	(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
5195	    (u_longlong_t)lsos.lsos_valid_sm_entries,
5196	    (u_longlong_t)lsos.lsos_sm_entries,
5197	    (u_longlong_t)lsos.lsos_current_txg);
5198
5199	(void) printf("%-8llu valid entries out of %-8llu - total\n\n",
5200	    (u_longlong_t)lsos.lsos_valid_entries,
5201	    (u_longlong_t)lsos.lsos_total_entries);
5202}
5203
5204static void
5205dump_zpool(spa_t *spa)
5206{
5207	dsl_pool_t *dp = spa_get_dsl(spa);
5208	int rc = 0;
5209
5210	if (dump_opt['S']) {
5211		dump_simulated_ddt(spa);
5212		return;
5213	}
5214
5215	if (!dump_opt['e'] && dump_opt['C'] > 1) {
5216		(void) printf("\nCached configuration:\n");
5217		dump_nvlist(spa->spa_config, 8);
5218	}
5219
5220	if (dump_opt['C'])
5221		dump_config(spa);
5222
5223	if (dump_opt['u'])
5224		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
5225
5226	if (dump_opt['D'])
5227		dump_all_ddts(spa);
5228
5229	if (dump_opt['d'] > 2 || dump_opt['m'])
5230		dump_metaslabs(spa);
5231	if (dump_opt['M'])
5232		dump_metaslab_groups(spa);
5233	if (dump_opt['d'] > 2 || dump_opt['m']) {
5234		dump_log_spacemaps(spa);
5235		dump_log_spacemap_obsolete_stats(spa);
5236	}
5237
5238	if (dump_opt['d'] || dump_opt['i']) {
5239		mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
5240		    0);
5241		dump_dir(dp->dp_meta_objset);
5242
5243		if (dump_opt['d'] >= 3) {
5244			dsl_pool_t *dp = spa->spa_dsl_pool;
5245			dump_full_bpobj(&spa->spa_deferred_bpobj,
5246			    "Deferred frees", 0);
5247			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
5248				dump_full_bpobj(&dp->dp_free_bpobj,
5249				    "Pool snapshot frees", 0);
5250			}
5251			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
5252				ASSERT(spa_feature_is_enabled(spa,
5253				    SPA_FEATURE_DEVICE_REMOVAL));
5254				dump_full_bpobj(&dp->dp_obsolete_bpobj,
5255				    "Pool obsolete blocks", 0);
5256			}
5257
5258			if (spa_feature_is_active(spa,
5259			    SPA_FEATURE_ASYNC_DESTROY)) {
5260				dump_bptree(spa->spa_meta_objset,
5261				    dp->dp_bptree_obj,
5262				    "Pool dataset frees");
5263			}
5264			dump_dtl(spa->spa_root_vdev, 0);
5265		}
5266		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
5267		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
5268
5269		if (rc == 0 && !dump_opt['L'])
5270			rc = dump_mos_leaks(spa);
5271
5272		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
5273			uint64_t refcount;
5274
5275			if (!(spa_feature_table[f].fi_flags &
5276			    ZFEATURE_FLAG_PER_DATASET) ||
5277			    !spa_feature_is_enabled(spa, f)) {
5278				ASSERT0(dataset_feature_count[f]);
5279				continue;
5280			}
5281			(void) feature_get_refcount(spa,
5282			    &spa_feature_table[f], &refcount);
5283			if (dataset_feature_count[f] != refcount) {
5284				(void) printf("%s feature refcount mismatch: "
5285				    "%lld datasets != %lld refcount\n",
5286				    spa_feature_table[f].fi_uname,
5287				    (longlong_t)dataset_feature_count[f],
5288				    (longlong_t)refcount);
5289				rc = 2;
5290			} else {
5291				(void) printf("Verified %s feature refcount "
5292				    "of %llu is correct\n",
5293				    spa_feature_table[f].fi_uname,
5294				    (longlong_t)refcount);
5295			}
5296		}
5297
5298		if (rc == 0)
5299			rc = verify_device_removal_feature_counts(spa);
5300	}
5301
5302	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
5303		rc = dump_block_stats(spa);
5304
5305	if (rc == 0)
5306		rc = verify_spacemap_refcounts(spa);
5307
5308	if (dump_opt['s'])
5309		show_pool_stats(spa);
5310
5311	if (dump_opt['h'])
5312		dump_history(spa);
5313
5314	if (rc == 0)
5315		rc = verify_checkpoint(spa);
5316
5317	if (rc != 0) {
5318		dump_debug_buffer();
5319		exit(rc);
5320	}
5321}
5322
5323#define	ZDB_FLAG_CHECKSUM	0x0001
5324#define	ZDB_FLAG_DECOMPRESS	0x0002
5325#define	ZDB_FLAG_BSWAP		0x0004
5326#define	ZDB_FLAG_GBH		0x0008
5327#define	ZDB_FLAG_INDIRECT	0x0010
5328#define	ZDB_FLAG_PHYS		0x0020
5329#define	ZDB_FLAG_RAW		0x0040
5330#define	ZDB_FLAG_PRINT_BLKPTR	0x0080
5331
5332static int flagbits[256];
5333
5334static void
5335zdb_print_blkptr(blkptr_t *bp, int flags)
5336{
5337	char blkbuf[BP_SPRINTF_LEN];
5338
5339	if (flags & ZDB_FLAG_BSWAP)
5340		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
5341
5342	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
5343	(void) printf("%s\n", blkbuf);
5344}
5345
5346static void
5347zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
5348{
5349	int i;
5350
5351	for (i = 0; i < nbps; i++)
5352		zdb_print_blkptr(&bp[i], flags);
5353}
5354
5355static void
5356zdb_dump_gbh(void *buf, int flags)
5357{
5358	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
5359}
5360
5361static void
5362zdb_dump_block_raw(void *buf, uint64_t size, int flags)
5363{
5364	if (flags & ZDB_FLAG_BSWAP)
5365		byteswap_uint64_array(buf, size);
5366	(void) write(1, buf, size);
5367}
5368
5369static void
5370zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
5371{
5372	uint64_t *d = (uint64_t *)buf;
5373	unsigned nwords = size / sizeof (uint64_t);
5374	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
5375	unsigned i, j;
5376	const char *hdr;
5377	char *c;
5378
5379
5380	if (do_bswap)
5381		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
5382	else
5383		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
5384
5385	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
5386
5387	for (i = 0; i < nwords; i += 2) {
5388		(void) printf("%06llx:  %016llx  %016llx  ",
5389		    (u_longlong_t)(i * sizeof (uint64_t)),
5390		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
5391		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
5392
5393		c = (char *)&d[i];
5394		for (j = 0; j < 2 * sizeof (uint64_t); j++)
5395			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
5396		(void) printf("\n");
5397	}
5398}
5399
5400/*
5401 * There are two acceptable formats:
5402 *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
5403 *	child[.child]*    - For example: 0.1.1
5404 *
5405 * The second form can be used to specify arbitrary vdevs anywhere
5406 * in the heirarchy.  For example, in a pool with a mirror of
5407 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
5408 */
5409static vdev_t *
5410zdb_vdev_lookup(vdev_t *vdev, const char *path)
5411{
5412	char *s, *p, *q;
5413	unsigned i;
5414
5415	if (vdev == NULL)
5416		return (NULL);
5417
5418	/* First, assume the x.x.x.x format */
5419	i = strtoul(path, &s, 10);
5420	if (s == path || (s && *s != '.' && *s != '\0'))
5421		goto name;
5422	if (i >= vdev->vdev_children)
5423		return (NULL);
5424
5425	vdev = vdev->vdev_child[i];
5426	if (*s == '\0')
5427		return (vdev);
5428	return (zdb_vdev_lookup(vdev, s+1));
5429
5430name:
5431	for (i = 0; i < vdev->vdev_children; i++) {
5432		vdev_t *vc = vdev->vdev_child[i];
5433
5434		if (vc->vdev_path == NULL) {
5435			vc = zdb_vdev_lookup(vc, path);
5436			if (vc == NULL)
5437				continue;
5438			else
5439				return (vc);
5440		}
5441
5442		p = strrchr(vc->vdev_path, '/');
5443		p = p ? p + 1 : vc->vdev_path;
5444		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
5445
5446		if (strcmp(vc->vdev_path, path) == 0)
5447			return (vc);
5448		if (strcmp(p, path) == 0)
5449			return (vc);
5450		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
5451			return (vc);
5452	}
5453
5454	return (NULL);
5455}
5456
5457/* ARGSUSED */
5458static int
5459random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
5460{
5461	return (random_get_pseudo_bytes(buf, len));
5462}
5463
5464/*
5465 * Read a block from a pool and print it out.  The syntax of the
5466 * block descriptor is:
5467 *
5468 *	pool:vdev_specifier:offset:size[:flags]
5469 *
5470 *	pool           - The name of the pool you wish to read from
5471 *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
5472 *	offset         - offset, in hex, in bytes
5473 *	size           - Amount of data to read, in hex, in bytes
5474 *	flags          - A string of characters specifying options
5475 *		 b: Decode a blkptr at given offset within block
5476 *		*c: Calculate and display checksums
5477 *		 d: Decompress data before dumping
5478 *		 e: Byteswap data before dumping
5479 *		 g: Display data as a gang block header
5480 *		 i: Display as an indirect block
5481 *		 p: Do I/O to physical offset
5482 *		 r: Dump raw data to stdout
5483 *
5484 *              * = not yet implemented
5485 */
5486static void
5487zdb_read_block(char *thing, spa_t *spa)
5488{
5489	blkptr_t blk, *bp = &blk;
5490	dva_t *dva = bp->blk_dva;
5491	int flags = 0;
5492	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
5493	zio_t *zio;
5494	vdev_t *vd;
5495	abd_t *pabd;
5496	void *lbuf, *buf;
5497	const char *s, *vdev;
5498	char *p, *dup, *flagstr;
5499	int i, error;
5500
5501	dup = strdup(thing);
5502	s = strtok(dup, ":");
5503	vdev = s ? s : "";
5504	s = strtok(NULL, ":");
5505	offset = strtoull(s ? s : "", NULL, 16);
5506	s = strtok(NULL, ":");
5507	size = strtoull(s ? s : "", NULL, 16);
5508	s = strtok(NULL, ":");
5509	if (s)
5510		flagstr = strdup(s);
5511	else
5512		flagstr = strdup("");
5513
5514	s = NULL;
5515	if (size == 0)
5516		s = "size must not be zero";
5517	if (!IS_P2ALIGNED(size, DEV_BSIZE))
5518		s = "size must be a multiple of sector size";
5519	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
5520		s = "offset must be a multiple of sector size";
5521	if (s) {
5522		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
5523		free(dup);
5524		return;
5525	}
5526
5527	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
5528		for (i = 0; flagstr[i]; i++) {
5529			int bit = flagbits[(uchar_t)flagstr[i]];
5530
5531			if (bit == 0) {
5532				(void) printf("***Invalid flag: %c\n",
5533				    flagstr[i]);
5534				continue;
5535			}
5536			flags |= bit;
5537
5538			/* If it's not something with an argument, keep going */
5539			if ((bit & (ZDB_FLAG_CHECKSUM |
5540			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
5541				continue;
5542
5543			p = &flagstr[i + 1];
5544			if (bit == ZDB_FLAG_PRINT_BLKPTR)
5545				blkptr_offset = strtoull(p, &p, 16);
5546			if (*p != ':' && *p != '\0') {
5547				(void) printf("***Invalid flag arg: '%s'\n", s);
5548				free(dup);
5549				return;
5550			}
5551		}
5552	}
5553	free(flagstr);
5554
5555	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
5556	if (vd == NULL) {
5557		(void) printf("***Invalid vdev: %s\n", vdev);
5558		free(dup);
5559		return;
5560	} else {
5561		if (vd->vdev_path)
5562			(void) fprintf(stderr, "Found vdev: %s\n",
5563			    vd->vdev_path);
5564		else
5565			(void) fprintf(stderr, "Found vdev type: %s\n",
5566			    vd->vdev_ops->vdev_op_type);
5567	}
5568
5569	psize = size;
5570	lsize = size;
5571
5572	pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
5573	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
5574
5575	BP_ZERO(bp);
5576
5577	DVA_SET_VDEV(&dva[0], vd->vdev_id);
5578	DVA_SET_OFFSET(&dva[0], offset);
5579	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
5580	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
5581
5582	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
5583
5584	BP_SET_LSIZE(bp, lsize);
5585	BP_SET_PSIZE(bp, psize);
5586	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
5587	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
5588	BP_SET_TYPE(bp, DMU_OT_NONE);
5589	BP_SET_LEVEL(bp, 0);
5590	BP_SET_DEDUP(bp, 0);
5591	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
5592
5593	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5594	zio = zio_root(spa, NULL, NULL, 0);
5595
5596	if (vd == vd->vdev_top) {
5597		/*
5598		 * Treat this as a normal block read.
5599		 */
5600		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
5601		    ZIO_PRIORITY_SYNC_READ,
5602		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
5603	} else {
5604		/*
5605		 * Treat this as a vdev child I/O.
5606		 */
5607		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
5608		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
5609		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
5610		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
5611		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
5612		    NULL, NULL));
5613	}
5614
5615	error = zio_wait(zio);
5616	spa_config_exit(spa, SCL_STATE, FTAG);
5617
5618	if (error) {
5619		(void) printf("Read of %s failed, error: %d\n", thing, error);
5620		goto out;
5621	}
5622
5623	if (flags & ZDB_FLAG_DECOMPRESS) {
5624		/*
5625		 * We don't know how the data was compressed, so just try
5626		 * every decompress function at every inflated blocksize.
5627		 */
5628		enum zio_compress c;
5629		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
5630		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
5631
5632		abd_copy_to_buf(pbuf2, pabd, psize);
5633
5634		VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
5635		    random_get_pseudo_bytes_cb, NULL));
5636
5637		VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
5638		    SPA_MAXBLOCKSIZE - psize));
5639
5640		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
5641		    lsize -= SPA_MINBLOCKSIZE) {
5642			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
5643				if (zio_decompress_data(c, pabd,
5644				    lbuf, psize, lsize) == 0 &&
5645				    zio_decompress_data_buf(c, pbuf2,
5646				    lbuf2, psize, lsize) == 0 &&
5647				    bcmp(lbuf, lbuf2, lsize) == 0)
5648					break;
5649			}
5650			if (c != ZIO_COMPRESS_FUNCTIONS)
5651				break;
5652			lsize -= SPA_MINBLOCKSIZE;
5653		}
5654
5655		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
5656		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
5657
5658		if (lsize <= psize) {
5659			(void) printf("Decompress of %s failed\n", thing);
5660			goto out;
5661		}
5662		buf = lbuf;
5663		size = lsize;
5664	} else {
5665		buf = abd_to_buf(pabd);
5666		size = psize;
5667	}
5668
5669	if (flags & ZDB_FLAG_PRINT_BLKPTR)
5670		zdb_print_blkptr((blkptr_t *)(void *)
5671		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
5672	else if (flags & ZDB_FLAG_RAW)
5673		zdb_dump_block_raw(buf, size, flags);
5674	else if (flags & ZDB_FLAG_INDIRECT)
5675		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
5676		    flags);
5677	else if (flags & ZDB_FLAG_GBH)
5678		zdb_dump_gbh(buf, flags);
5679	else
5680		zdb_dump_block(thing, buf, size, flags);
5681
5682out:
5683	abd_free(pabd);
5684	umem_free(lbuf, SPA_MAXBLOCKSIZE);
5685	free(dup);
5686}
5687
5688static void
5689zdb_embedded_block(char *thing)
5690{
5691	blkptr_t bp;
5692	unsigned long long *words = (void *)&bp;
5693	char *buf;
5694	int err;
5695
5696	bzero(&bp, sizeof (bp));
5697	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
5698	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
5699	    words + 0, words + 1, words + 2, words + 3,
5700	    words + 4, words + 5, words + 6, words + 7,
5701	    words + 8, words + 9, words + 10, words + 11,
5702	    words + 12, words + 13, words + 14, words + 15);
5703	if (err != 16) {
5704		(void) fprintf(stderr, "invalid input format\n");
5705		exit(1);
5706	}
5707	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
5708	buf = malloc(SPA_MAXBLOCKSIZE);
5709	if (buf == NULL) {
5710		(void) fprintf(stderr, "out of memory\n");
5711		exit(1);
5712	}
5713	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));