zdb.c revision d77f81966c796267d14a0cc81ff3491176405cc0
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29#include <stdio.h>
30#include <unistd.h>
31#include <stdio_ext.h>
32#include <stdlib.h>
33#include <ctype.h>
34#include <sys/zfs_context.h>
35#include <sys/spa.h>
36#include <sys/spa_impl.h>
37#include <sys/dmu.h>
38#include <sys/zap.h>
39#include <sys/fs/zfs.h>
40#include <sys/zfs_znode.h>
41#include <sys/zfs_sa.h>
42#include <sys/sa.h>
43#include <sys/sa_impl.h>
44#include <sys/vdev.h>
45#include <sys/vdev_impl.h>
46#include <sys/metaslab_impl.h>
47#include <sys/dmu_objset.h>
48#include <sys/dsl_dir.h>
49#include <sys/dsl_dataset.h>
50#include <sys/dsl_pool.h>
51#include <sys/dbuf.h>
52#include <sys/zil.h>
53#include <sys/zil_impl.h>
54#include <sys/stat.h>
55#include <sys/resource.h>
56#include <sys/dmu_traverse.h>
57#include <sys/zio_checksum.h>
58#include <sys/zio_compress.h>
59#include <sys/zfs_fuid.h>
60#include <sys/arc.h>
61#include <sys/ddt.h>
62#include <sys/zfeature.h>
63#include <sys/abd.h>
64#include <sys/blkptr.h>
65#include <zfs_comutil.h>
66#undef verify
67#include <libzfs.h>
68
/*
 * Lookup helpers that turn on-disk identifiers into printable names or
 * table indices.  Each macro range-checks its argument first, so an
 * unexpected identifier yields a fallback value instead of indexing past
 * the end of the corresponding table.
 */

/* Compression algorithm name, or "UNKNOWN" if idx is out of range. */
#define	ZDB_COMPRESS_NAME(idx)						\
	(((idx) < ZIO_COMPRESS_FUNCTIONS) ?				\
	zio_compress_table[(idx)].ci_name : "UNKNOWN")

/* Checksum algorithm name, or "UNKNOWN" if idx is out of range. */
#define	ZDB_CHECKSUM_NAME(idx)						\
	(((idx) < ZIO_CHECKSUM_FUNCTIONS) ?				\
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")

/*
 * Object type name.  Types below DMU_OT_NUMTYPES are looked up in dmu_ot[];
 * other types that satisfy DMU_OT_IS_VALID() are named after their byteswap
 * function; anything else is "UNKNOWN".
 */
#define	ZDB_OT_NAME(idx)						\
	(((idx) < DMU_OT_NUMTYPES) ?					\
	dmu_ot[(idx)].ot_name :						\
	(DMU_OT_IS_VALID(idx) ?						\
	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN"))

/*
 * Collapse an object type into the range [0, DMU_OT_NUMTYPES].  Types below
 * DMU_OT_NUMTYPES map to themselves, the two DMU_OTN_ZAP_* types map to
 * DMU_OT_ZAP_OTHER, and everything else maps to DMU_OT_NUMTYPES.
 */
#define	ZDB_OT_TYPE(idx)						\
	(((idx) < DMU_OT_NUMTYPES) ?					\
	(idx) :								\
	((((idx) == DMU_OTN_ZAP_DATA) ||				\
	((idx) == DMU_OTN_ZAP_METADATA)) ?				\
	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
79
80#ifndef lint
81extern int reference_tracking_enable;
82extern boolean_t zfs_recover;
83extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
84extern int zfs_vdev_async_read_max_active;
85#else
86int reference_tracking_enable;
87boolean_t zfs_recover;
88uint64_t zfs_arc_max, zfs_arc_meta_limit;
89int zfs_vdev_async_read_max_active;
90#endif
91
/* Program name, used as the prefix of usage and error messages. */
const char cmdname[] = "zdb";
/*
 * Per-option state, indexed by the option character (e.g. dump_opt['G']).
 * Values are compared numerically by the dump routines (e.g.
 * dump_opt['d'] < 5), so an entry acts as a verbosity level for its option.
 */
uint8_t dump_opt[256];

/* Signature shared by the per-object-type dump_*() viewer routines. */
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

extern void dump_intent_log(zilog_t *);
/*
 * Numeric operands and their count.  dump_metaslabs() interprets the first
 * entry as a vdev id and the remaining ones as metaslab numbers.
 */
uint64_t *zopt_object = NULL;
int zopt_objects = 0;
libzfs_handle_t *g_zfs;
/* NOTE(review): presumably the -I <inflight I/Os> limit; confirm in main(). */
uint64_t max_inflight = 1000;

static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
104
/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */

/*
 * Returns the string to use as the $UMEM_DEBUG setting.  Declared with an
 * explicit (void) parameter list so that it is a proper prototype, matching
 * _umem_logging_init().
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}
114
/*
 * libumem hook: returns the string to use as the $UMEM_LOGGING setting.
 */
const char *
_umem_logging_init(void)
{
	static const char umem_logging[] = "fail,contents";

	return (umem_logging);
}
120
121static void
122usage(void)
123{
124	(void) fprintf(stderr,
125	    "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
126	    "[-I <inflight I/Os>]\n"
127	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
128	    "\t\t[<poolname> [<object> ...]]\n"
129	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset> "
130	    "[<object> ...]\n"
131	    "\t%s -C [-A] [-U <cache>]\n"
132	    "\t%s -l [-Aqu] <device>\n"
133	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
134	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
135	    "\t%s -O <dataset> <path>\n"
136	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
137	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
138	    "\t%s -E [-A] word0:word1:...:word15\n"
139	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
140	    "<poolname>\n\n",
141	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
142	    cmdname, cmdname);
143
144	(void) fprintf(stderr, "    Dataset name must include at least one "
145	    "separator character '/' or '@'\n");
146	(void) fprintf(stderr, "    If dataset name is specified, only that "
147	    "dataset is dumped\n");
148	(void) fprintf(stderr, "    If object numbers are specified, only "
149	    "those objects are dumped\n\n");
150	(void) fprintf(stderr, "    Options to control amount of output:\n");
151	(void) fprintf(stderr, "        -b block statistics\n");
152	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
153	    "all data) blocks\n");
154	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
155	(void) fprintf(stderr, "        -d dataset(s)\n");
156	(void) fprintf(stderr, "        -D dedup statistics\n");
157	(void) fprintf(stderr, "        -E decode and display block from an "
158	    "embedded block pointer\n");
159	(void) fprintf(stderr, "        -h pool history\n");
160	(void) fprintf(stderr, "        -i intent logs\n");
161	(void) fprintf(stderr, "        -l read label contents\n");
162	(void) fprintf(stderr, "        -L disable leak tracking (do not "
163	    "load spacemaps)\n");
164	(void) fprintf(stderr, "        -m metaslabs\n");
165	(void) fprintf(stderr, "        -M metaslab groups\n");
166	(void) fprintf(stderr, "        -O perform object lookups by path\n");
167	(void) fprintf(stderr, "        -R read and display block from a "
168	    "device\n");
169	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
170	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
171	(void) fprintf(stderr, "        -v verbose (applies to all "
172	    "others)\n\n");
173	(void) fprintf(stderr, "    Below options are intended for use "
174	    "with other options:\n");
175	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
176	    "panic recovery (-AA) or both (-AAA)\n");
177	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
178	    "has altroot/not in a cachefile\n");
179	(void) fprintf(stderr, "        -F attempt automatic rewind within "
180	    "safe range of transaction groups\n");
181	(void) fprintf(stderr, "        -G dump zfs_dbgmsg buffer before "
182	    "exiting\n");
183	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
184	    "specify the maximum number of "
185	    "checksumming I/Os [default is 200]\n");
186	(void) fprintf(stderr, "        -o <variable>=<value> set global "
187	    "variable to an unsigned 32-bit integer value\n");
188	(void) fprintf(stderr, "        -p <path> -- use one or more with "
189	    "-e to specify path to vdev dir\n");
190	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
191	(void) fprintf(stderr, "        -q don't print label contents\n");
192	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
193	    "searching for uberblocks\n");
194	(void) fprintf(stderr, "        -u uberblock\n");
195	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
196	    "cachefile\n");
197	(void) fprintf(stderr, "        -V do verbatim import\n");
198	(void) fprintf(stderr, "        -x <dumpdir> -- "
199	    "dump all read blocks into specified directory\n");
200	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
201	    "work with dataset)\n\n");
202	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
203	    "to make only that option verbose\n");
204	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
205	exit(1);
206}
207
208static void
209dump_debug_buffer()
210{
211	if (dump_opt['G']) {
212		(void) printf("\n");
213		zfs_dbgmsg_print("zdb");
214	}
215}
216
217/*
218 * Called for usage errors that are discovered after a call to spa_open(),
219 * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
220 */
221
222static void
223fatal(const char *fmt, ...)
224{
225	va_list ap;
226
227	va_start(ap, fmt);
228	(void) fprintf(stderr, "%s: ", cmdname);
229	(void) vfprintf(stderr, fmt, ap);
230	va_end(ap);
231	(void) fprintf(stderr, "\n");
232
233	dump_debug_buffer();
234
235	exit(1);
236}
237
238/* ARGSUSED */
239static void
240dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
241{
242	nvlist_t *nv;
243	size_t nvsize = *(uint64_t *)data;
244	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
245
246	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
247
248	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
249
250	umem_free(packed, nvsize);
251
252	dump_nvlist(nv, 8);
253
254	nvlist_free(nv);
255}
256
257/* ARGSUSED */
258static void
259dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
260{
261	spa_history_phys_t *shp = data;
262
263	if (shp == NULL)
264		return;
265
266	(void) printf("\t\tpool_create_len = %llu\n",
267	    (u_longlong_t)shp->sh_pool_create_len);
268	(void) printf("\t\tphys_max_off = %llu\n",
269	    (u_longlong_t)shp->sh_phys_max_off);
270	(void) printf("\t\tbof = %llu\n",
271	    (u_longlong_t)shp->sh_bof);
272	(void) printf("\t\teof = %llu\n",
273	    (u_longlong_t)shp->sh_eof);
274	(void) printf("\t\trecords_lost = %llu\n",
275	    (u_longlong_t)shp->sh_records_lost);
276}
277
278static void
279zdb_nicenum(uint64_t num, char *buf)
280{
281	if (dump_opt['P'])
282		(void) sprintf(buf, "%llu", (longlong_t)num);
283	else
284		nicenum(num, buf);
285}
286
287const char histo_stars[] = "****************************************";
288const int histo_width = sizeof (histo_stars) - 1;
289
290static void
291dump_histogram(const uint64_t *histo, int size, int offset)
292{
293	int i;
294	int minidx = size - 1;
295	int maxidx = 0;
296	uint64_t max = 0;
297
298	for (i = 0; i < size; i++) {
299		if (histo[i] > max)
300			max = histo[i];
301		if (histo[i] > 0 && i > maxidx)
302			maxidx = i;
303		if (histo[i] > 0 && i < minidx)
304			minidx = i;
305	}
306
307	if (max < histo_width)
308		max = histo_width;
309
310	for (i = minidx; i <= maxidx; i++) {
311		(void) printf("\t\t\t%3u: %6llu %s\n",
312		    i + offset, (u_longlong_t)histo[i],
313		    &histo_stars[(max - histo[i]) * histo_width / max]);
314	}
315}
316
317static void
318dump_zap_stats(objset_t *os, uint64_t object)
319{
320	int error;
321	zap_stats_t zs;
322
323	error = zap_get_stats(os, object, &zs);
324	if (error)
325		return;
326
327	if (zs.zs_ptrtbl_len == 0) {
328		ASSERT(zs.zs_num_blocks == 1);
329		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
330		    (u_longlong_t)zs.zs_blocksize,
331		    (u_longlong_t)zs.zs_num_entries);
332		return;
333	}
334
335	(void) printf("\tFat ZAP stats:\n");
336
337	(void) printf("\t\tPointer table:\n");
338	(void) printf("\t\t\t%llu elements\n",
339	    (u_longlong_t)zs.zs_ptrtbl_len);
340	(void) printf("\t\t\tzt_blk: %llu\n",
341	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
342	(void) printf("\t\t\tzt_numblks: %llu\n",
343	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
344	(void) printf("\t\t\tzt_shift: %llu\n",
345	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
346	(void) printf("\t\t\tzt_blks_copied: %llu\n",
347	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
348	(void) printf("\t\t\tzt_nextblk: %llu\n",
349	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
350
351	(void) printf("\t\tZAP entries: %llu\n",
352	    (u_longlong_t)zs.zs_num_entries);
353	(void) printf("\t\tLeaf blocks: %llu\n",
354	    (u_longlong_t)zs.zs_num_leafs);
355	(void) printf("\t\tTotal blocks: %llu\n",
356	    (u_longlong_t)zs.zs_num_blocks);
357	(void) printf("\t\tzap_block_type: 0x%llx\n",
358	    (u_longlong_t)zs.zs_block_type);
359	(void) printf("\t\tzap_magic: 0x%llx\n",
360	    (u_longlong_t)zs.zs_magic);
361	(void) printf("\t\tzap_salt: 0x%llx\n",
362	    (u_longlong_t)zs.zs_salt);
363
364	(void) printf("\t\tLeafs with 2^n pointers:\n");
365	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
366
367	(void) printf("\t\tBlocks with n*5 entries:\n");
368	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
369
370	(void) printf("\t\tBlocks n/10 full:\n");
371	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
372
373	(void) printf("\t\tEntries with n chunks:\n");
374	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
375
376	(void) printf("\t\tBuckets with n entries:\n");
377	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
378}
379
/*
 * Object viewer that prints nothing; all arguments are ignored.
 */
/*ARGSUSED*/
static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
}
385
386/*ARGSUSED*/
387static void
388dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
389{
390	(void) printf("\tUNKNOWN OBJECT TYPE\n");
391}
392
/*
 * Object viewer with an empty body: nothing is printed and all arguments
 * are ignored.  Unlike the neighbouring viewers it has external linkage.
 */
/*ARGSUSED*/
void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
}
398
/*
 * Object viewer with an empty body: nothing is printed and all arguments
 * are ignored.
 */
/*ARGSUSED*/
static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
}
404
405/*ARGSUSED*/
406static void
407dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
408{
409	zap_cursor_t zc;
410	zap_attribute_t attr;
411	void *prop;
412	int i;
413
414	dump_zap_stats(os, object);
415	(void) printf("\n");
416
417	for (zap_cursor_init(&zc, os, object);
418	    zap_cursor_retrieve(&zc, &attr) == 0;
419	    zap_cursor_advance(&zc)) {
420		(void) printf("\t\t%s = ", attr.za_name);
421		if (attr.za_num_integers == 0) {
422			(void) printf("\n");
423			continue;
424		}
425		prop = umem_zalloc(attr.za_num_integers *
426		    attr.za_integer_length, UMEM_NOFAIL);
427		(void) zap_lookup(os, object, attr.za_name,
428		    attr.za_integer_length, attr.za_num_integers, prop);
429		if (attr.za_integer_length == 1) {
430			(void) printf("%s", (char *)prop);
431		} else {
432			for (i = 0; i < attr.za_num_integers; i++) {
433				switch (attr.za_integer_length) {
434				case 2:
435					(void) printf("%u ",
436					    ((uint16_t *)prop)[i]);
437					break;
438				case 4:
439					(void) printf("%u ",
440					    ((uint32_t *)prop)[i]);
441					break;
442				case 8:
443					(void) printf("%lld ",
444					    (u_longlong_t)((int64_t *)prop)[i]);
445					break;
446				}
447			}
448		}
449		(void) printf("\n");
450		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
451	}
452	zap_cursor_fini(&zc);
453}
454
455static void
456dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
457{
458	bpobj_phys_t *bpop = data;
459	char bytes[32], comp[32], uncomp[32];
460
461	if (bpop == NULL)
462		return;
463
464	zdb_nicenum(bpop->bpo_bytes, bytes);
465	zdb_nicenum(bpop->bpo_comp, comp);
466	zdb_nicenum(bpop->bpo_uncomp, uncomp);
467
468	(void) printf("\t\tnum_blkptrs = %llu\n",
469	    (u_longlong_t)bpop->bpo_num_blkptrs);
470	(void) printf("\t\tbytes = %s\n", bytes);
471	if (size >= BPOBJ_SIZE_V1) {
472		(void) printf("\t\tcomp = %s\n", comp);
473		(void) printf("\t\tuncomp = %s\n", uncomp);
474	}
475	if (size >= sizeof (*bpop)) {
476		(void) printf("\t\tsubobjs = %llu\n",
477		    (u_longlong_t)bpop->bpo_subobjs);
478		(void) printf("\t\tnum_subobjs = %llu\n",
479		    (u_longlong_t)bpop->bpo_num_subobjs);
480	}
481
482	if (dump_opt['d'] < 5)
483		return;
484
485	for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) {
486		char blkbuf[BP_SPRINTF_LEN];
487		blkptr_t bp;
488
489		int err = dmu_read(os, object,
490		    i * sizeof (bp), sizeof (bp), &bp, 0);
491		if (err != 0) {
492			(void) printf("got error %u from dmu_read\n", err);
493			break;
494		}
495		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
496		(void) printf("\t%s\n", blkbuf);
497	}
498}
499
500/* ARGSUSED */
501static void
502dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
503{
504	dmu_object_info_t doi;
505
506	VERIFY0(dmu_object_info(os, object, &doi));
507	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
508
509	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
510	if (err != 0) {
511		(void) printf("got error %u from dmu_read\n", err);
512		kmem_free(subobjs, doi.doi_max_offset);
513		return;
514	}
515
516	int64_t last_nonzero = -1;
517	for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) {
518		if (subobjs[i] != 0)
519			last_nonzero = i;
520	}
521
522	for (int64_t i = 0; i <= last_nonzero; i++) {
523		(void) printf("\t%llu\n", (longlong_t)subobjs[i]);
524	}
525	kmem_free(subobjs, doi.doi_max_offset);
526}
527
528/*ARGSUSED*/
529static void
530dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
531{
532	dump_zap_stats(os, object);
533	/* contents are printed elsewhere, properly decoded */
534}
535
536/*ARGSUSED*/
537static void
538dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
539{
540	zap_cursor_t zc;
541	zap_attribute_t attr;
542
543	dump_zap_stats(os, object);
544	(void) printf("\n");
545
546	for (zap_cursor_init(&zc, os, object);
547	    zap_cursor_retrieve(&zc, &attr) == 0;
548	    zap_cursor_advance(&zc)) {
549		(void) printf("\t\t%s = ", attr.za_name);
550		if (attr.za_num_integers == 0) {
551			(void) printf("\n");
552			continue;
553		}
554		(void) printf(" %llx : [%d:%d:%d]\n",
555		    (u_longlong_t)attr.za_first_integer,
556		    (int)ATTR_LENGTH(attr.za_first_integer),
557		    (int)ATTR_BSWAP(attr.za_first_integer),
558		    (int)ATTR_NUM(attr.za_first_integer));
559	}
560	zap_cursor_fini(&zc);
561}
562
563/*ARGSUSED*/
564static void
565dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
566{
567	zap_cursor_t zc;
568	zap_attribute_t attr;
569	uint16_t *layout_attrs;
570	int i;
571
572	dump_zap_stats(os, object);
573	(void) printf("\n");
574
575	for (zap_cursor_init(&zc, os, object);
576	    zap_cursor_retrieve(&zc, &attr) == 0;
577	    zap_cursor_advance(&zc)) {
578		(void) printf("\t\t%s = [", attr.za_name);
579		if (attr.za_num_integers == 0) {
580			(void) printf("\n");
581			continue;
582		}
583
584		VERIFY(attr.za_integer_length == 2);
585		layout_attrs = umem_zalloc(attr.za_num_integers *
586		    attr.za_integer_length, UMEM_NOFAIL);
587
588		VERIFY(zap_lookup(os, object, attr.za_name,
589		    attr.za_integer_length,
590		    attr.za_num_integers, layout_attrs) == 0);
591
592		for (i = 0; i != attr.za_num_integers; i++)
593			(void) printf(" %d ", (int)layout_attrs[i]);
594		(void) printf("]\n");
595		umem_free(layout_attrs,
596		    attr.za_num_integers * attr.za_integer_length);
597	}
598	zap_cursor_fini(&zc);
599}
600
601/*ARGSUSED*/
602static void
603dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
604{
605	zap_cursor_t zc;
606	zap_attribute_t attr;
607	const char *typenames[] = {
608		/* 0 */ "not specified",
609		/* 1 */ "FIFO",
610		/* 2 */ "Character Device",
611		/* 3 */ "3 (invalid)",
612		/* 4 */ "Directory",
613		/* 5 */ "5 (invalid)",
614		/* 6 */ "Block Device",
615		/* 7 */ "7 (invalid)",
616		/* 8 */ "Regular File",
617		/* 9 */ "9 (invalid)",
618		/* 10 */ "Symbolic Link",
619		/* 11 */ "11 (invalid)",
620		/* 12 */ "Socket",
621		/* 13 */ "Door",
622		/* 14 */ "Event Port",
623		/* 15 */ "15 (invalid)",
624	};
625
626	dump_zap_stats(os, object);
627	(void) printf("\n");
628
629	for (zap_cursor_init(&zc, os, object);
630	    zap_cursor_retrieve(&zc, &attr) == 0;
631	    zap_cursor_advance(&zc)) {
632		(void) printf("\t\t%s = %lld (type: %s)\n",
633		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
634		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
635	}
636	zap_cursor_fini(&zc);
637}
638
639int
640get_dtl_refcount(vdev_t *vd)
641{
642	int refcount = 0;
643
644	if (vd->vdev_ops->vdev_op_leaf) {
645		space_map_t *sm = vd->vdev_dtl_sm;
646
647		if (sm != NULL &&
648		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
649			return (1);
650		return (0);
651	}
652
653	for (int c = 0; c < vd->vdev_children; c++)
654		refcount += get_dtl_refcount(vd->vdev_child[c]);
655	return (refcount);
656}
657
658int
659get_metaslab_refcount(vdev_t *vd)
660{
661	int refcount = 0;
662
663	if (vd->vdev_top == vd && !vd->vdev_removing) {
664		for (int m = 0; m < vd->vdev_ms_count; m++) {
665			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
666
667			if (sm != NULL &&
668			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
669				refcount++;
670		}
671	}
672	for (int c = 0; c < vd->vdev_children; c++)
673		refcount += get_metaslab_refcount(vd->vdev_child[c]);
674
675	return (refcount);
676}
677
678static int
679verify_spacemap_refcounts(spa_t *spa)
680{
681	uint64_t expected_refcount = 0;
682	uint64_t actual_refcount;
683
684	(void) feature_get_refcount(spa,
685	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
686	    &expected_refcount);
687	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
688	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
689
690	if (expected_refcount != actual_refcount) {
691		(void) printf("space map refcount mismatch: expected %lld != "
692		    "actual %lld\n",
693		    (longlong_t)expected_refcount,
694		    (longlong_t)actual_refcount);
695		return (2);
696	}
697	return (0);
698}
699
700static void
701dump_spacemap(objset_t *os, space_map_t *sm)
702{
703	uint64_t alloc, offset, entry;
704	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
705			    "INVALID", "INVALID", "INVALID", "INVALID" };
706
707	if (sm == NULL)
708		return;
709
710	/*
711	 * Print out the freelist entries in both encoded and decoded form.
712	 */
713	alloc = 0;
714	for (offset = 0; offset < space_map_length(sm);
715	    offset += sizeof (entry)) {
716		uint8_t mapshift = sm->sm_shift;
717
718		VERIFY0(dmu_read(os, space_map_object(sm), offset,
719		    sizeof (entry), &entry, DMU_READ_PREFETCH));
720		if (SM_DEBUG_DECODE(entry)) {
721
722			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
723			    (u_longlong_t)(offset / sizeof (entry)),
724			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
725			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
726			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
727		} else {
728			(void) printf("\t    [%6llu]    %c  range:"
729			    " %010llx-%010llx  size: %06llx\n",
730			    (u_longlong_t)(offset / sizeof (entry)),
731			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
732			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
733			    mapshift) + sm->sm_start),
734			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
735			    mapshift) + sm->sm_start +
736			    (SM_RUN_DECODE(entry) << mapshift)),
737			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
738			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
739				alloc += SM_RUN_DECODE(entry) << mapshift;
740			else
741				alloc -= SM_RUN_DECODE(entry) << mapshift;
742		}
743	}
744	if (alloc != space_map_allocated(sm)) {
745		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
746		    "with space map summary (%llu)\n",
747		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
748	}
749}
750
751static void
752dump_metaslab_stats(metaslab_t *msp)
753{
754	char maxbuf[32];
755	range_tree_t *rt = msp->ms_tree;
756	avl_tree_t *t = &msp->ms_size_tree;
757	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
758
759	zdb_nicenum(metaslab_block_maxsize(msp), maxbuf);
760
761	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
762	    "segments", avl_numnodes(t), "maxsize", maxbuf,
763	    "freepct", free_pct);
764	(void) printf("\tIn-memory histogram:\n");
765	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
766}
767
768static void
769dump_metaslab(metaslab_t *msp)
770{
771	vdev_t *vd = msp->ms_group->mg_vd;
772	spa_t *spa = vd->vdev_spa;
773	space_map_t *sm = msp->ms_sm;
774	char freebuf[32];
775
776	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf);
777
778	(void) printf(
779	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
780	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
781	    (u_longlong_t)space_map_object(sm), freebuf);
782
783	if (dump_opt['m'] > 2 && !dump_opt['L']) {
784		mutex_enter(&msp->ms_lock);
785		metaslab_load_wait(msp);
786		if (!msp->ms_loaded) {
787			VERIFY0(metaslab_load(msp));
788			range_tree_stat_verify(msp->ms_tree);
789		}
790		dump_metaslab_stats(msp);
791		metaslab_unload(msp);
792		mutex_exit(&msp->ms_lock);
793	}
794
795	if (dump_opt['m'] > 1 && sm != NULL &&
796	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
797		/*
798		 * The space map histogram represents free space in chunks
799		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
800		 */
801		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
802		    (u_longlong_t)msp->ms_fragmentation);
803		dump_histogram(sm->sm_phys->smp_histogram,
804		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
805	}
806
807	if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
808		ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
809
810		mutex_enter(&msp->ms_lock);
811		dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
812		mutex_exit(&msp->ms_lock);
813	}
814}
815
816static void
817print_vdev_metaslab_header(vdev_t *vd)
818{
819	(void) printf("\tvdev %10llu\n\t%-10s%5llu   %-19s   %-15s   %-10s\n",
820	    (u_longlong_t)vd->vdev_id,
821	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
822	    "offset", "spacemap", "free");
823	(void) printf("\t%15s   %19s   %15s   %10s\n",
824	    "---------------", "-------------------",
825	    "---------------", "-------------");
826}
827
828static void
829dump_metaslab_groups(spa_t *spa)
830{
831	vdev_t *rvd = spa->spa_root_vdev;
832	metaslab_class_t *mc = spa_normal_class(spa);
833	uint64_t fragmentation;
834
835	metaslab_class_histogram_verify(mc);
836
837	for (int c = 0; c < rvd->vdev_children; c++) {
838		vdev_t *tvd = rvd->vdev_child[c];
839		metaslab_group_t *mg = tvd->vdev_mg;
840
841		if (mg->mg_class != mc)
842			continue;
843
844		metaslab_group_histogram_verify(mg);
845		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
846
847		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
848		    "fragmentation",
849		    (u_longlong_t)tvd->vdev_id,
850		    (u_longlong_t)tvd->vdev_ms_count);
851		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
852			(void) printf("%3s\n", "-");
853		} else {
854			(void) printf("%3llu%%\n",
855			    (u_longlong_t)mg->mg_fragmentation);
856		}
857		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
858	}
859
860	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
861	fragmentation = metaslab_class_fragmentation(mc);
862	if (fragmentation == ZFS_FRAG_INVALID)
863		(void) printf("\t%3s\n", "-");
864	else
865		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
866	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
867}
868
869static void
870dump_metaslabs(spa_t *spa)
871{
872	vdev_t *vd, *rvd = spa->spa_root_vdev;
873	uint64_t m, c = 0, children = rvd->vdev_children;
874
875	(void) printf("\nMetaslabs:\n");
876
877	if (!dump_opt['d'] && zopt_objects > 0) {
878		c = zopt_object[0];
879
880		if (c >= children)
881			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
882
883		if (zopt_objects > 1) {
884			vd = rvd->vdev_child[c];
885			print_vdev_metaslab_header(vd);
886
887			for (m = 1; m < zopt_objects; m++) {
888				if (zopt_object[m] < vd->vdev_ms_count)
889					dump_metaslab(
890					    vd->vdev_ms[zopt_object[m]]);
891				else
892					(void) fprintf(stderr, "bad metaslab "
893					    "number %llu\n",
894					    (u_longlong_t)zopt_object[m]);
895			}
896			(void) printf("\n");
897			return;
898		}
899		children = c + 1;
900	}
901	for (; c < children; c++) {
902		vd = rvd->vdev_child[c];
903		print_vdev_metaslab_header(vd);
904
905		for (m = 0; m < vd->vdev_ms_count; m++)
906			dump_metaslab(vd->vdev_ms[m]);
907		(void) printf("\n");
908	}
909}
910
911static void
912dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
913{
914	const ddt_phys_t *ddp = dde->dde_phys;
915	const ddt_key_t *ddk = &dde->dde_key;
916	char *types[4] = { "ditto", "single", "double", "triple" };
917	char blkbuf[BP_SPRINTF_LEN];
918	blkptr_t blk;
919
920	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
921		if (ddp->ddp_phys_birth == 0)
922			continue;
923		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
924		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
925		(void) printf("index %llx refcnt %llu %s %s\n",
926		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
927		    types[p], blkbuf);
928	}
929}
930
931static void
932dump_dedup_ratio(const ddt_stat_t *dds)
933{
934	double rL, rP, rD, D, dedup, compress, copies;
935
936	if (dds->dds_blocks == 0)
937		return;
938
939	rL = (double)dds->dds_ref_lsize;
940	rP = (double)dds->dds_ref_psize;
941	rD = (double)dds->dds_ref_dsize;
942	D = (double)dds->dds_dsize;
943
944	dedup = rD / D;
945	compress = rL / rP;
946	copies = rD / rP;
947
948	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
949	    "dedup * compress / copies = %.2f\n\n",
950	    dedup, compress, copies, dedup * compress / copies);
951}
952
953static void
954dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
955{
956	char name[DDT_NAMELEN];
957	ddt_entry_t dde;
958	uint64_t walk = 0;
959	dmu_object_info_t doi;
960	uint64_t count, dspace, mspace;
961	int error;
962
963	error = ddt_object_info(ddt, type, class, &doi);
964
965	if (error == ENOENT)
966		return;
967	ASSERT(error == 0);
968
969	if ((count = ddt_object_count(ddt, type, class)) == 0)
970		return;
971
972	dspace = doi.doi_physical_blocks_512 << 9;
973	mspace = doi.doi_fill_count * doi.doi_data_block_size;
974
975	ddt_object_name(ddt, type, class, name);
976
977	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
978	    name,
979	    (u_longlong_t)count,
980	    (u_longlong_t)(dspace / count),
981	    (u_longlong_t)(mspace / count));
982
983	if (dump_opt['D'] < 3)
984		return;
985
986	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
987
988	if (dump_opt['D'] < 4)
989		return;
990
991	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
992		return;
993
994	(void) printf("%s contents:\n\n", name);
995
996	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
997		dump_dde(ddt, &dde, walk);
998
999	ASSERT(error == ENOENT);
1000
1001	(void) printf("\n");
1002}
1003
1004static void
1005dump_all_ddts(spa_t *spa)
1006{
1007	ddt_histogram_t ddh_total = { 0 };
1008	ddt_stat_t dds_total = { 0 };
1009
1010	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1011		ddt_t *ddt = spa->spa_ddt[c];
1012		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1013			for (enum ddt_class class = 0; class < DDT_CLASSES;
1014			    class++) {
1015				dump_ddt(ddt, type, class);
1016			}
1017		}
1018	}
1019
1020	ddt_get_dedup_stats(spa, &dds_total);
1021
1022	if (dds_total.dds_blocks == 0) {
1023		(void) printf("All DDTs are empty\n");
1024		return;
1025	}
1026
1027	(void) printf("\n");
1028
1029	if (dump_opt['D'] > 1) {
1030		(void) printf("DDT histogram (aggregated over all DDTs):\n");
1031		ddt_get_dedup_histogram(spa, &ddh_total);
1032		zpool_dump_ddt(&dds_total, &ddh_total);
1033	}
1034
1035	dump_dedup_ratio(&dds_total);
1036}
1037
1038static void
1039dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
1040{
1041	char *prefix = arg;
1042
1043	(void) printf("%s [%llu,%llu) length %llu\n",
1044	    prefix,
1045	    (u_longlong_t)start,
1046	    (u_longlong_t)(start + size),
1047	    (u_longlong_t)(size));
1048}
1049
1050static void
1051dump_dtl(vdev_t *vd, int indent)
1052{
1053	spa_t *spa = vd->vdev_spa;
1054	boolean_t required;
1055	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
1056	char prefix[256];
1057
1058	spa_vdev_state_enter(spa, SCL_NONE);
1059	required = vdev_dtl_required(vd);
1060	(void) spa_vdev_state_exit(spa, NULL, 0);
1061
1062	if (indent == 0)
1063		(void) printf("\nDirty time logs:\n\n");
1064
1065	(void) printf("\t%*s%s [%s]\n", indent, "",
1066	    vd->vdev_path ? vd->vdev_path :
1067	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
1068	    required ? "DTL-required" : "DTL-expendable");
1069
1070	for (int t = 0; t < DTL_TYPES; t++) {
1071		range_tree_t *rt = vd->vdev_dtl[t];
1072		if (range_tree_space(rt) == 0)
1073			continue;
1074		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
1075		    indent + 2, "", name[t]);
1076		mutex_enter(rt->rt_lock);
1077		range_tree_walk(rt, dump_dtl_seg, prefix);
1078		mutex_exit(rt->rt_lock);
1079		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
1080			dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
1081	}
1082
1083	for (int c = 0; c < vd->vdev_children; c++)
1084		dump_dtl(vd->vdev_child[c], indent + 4);
1085}
1086
1087static void
1088dump_history(spa_t *spa)
1089{
1090	nvlist_t **events = NULL;
1091	uint64_t resid, len, off = 0;
1092	uint_t num = 0;
1093	int error;
1094	time_t tsec;
1095	struct tm t;
1096	char tbuf[30];
1097	char internalstr[MAXPATHLEN];
1098
1099	char *buf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
1100	do {
1101		len = SPA_MAXBLOCKSIZE;
1102
1103		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
1104			(void) fprintf(stderr, "Unable to read history: "
1105			    "error %d\n", error);
1106			umem_free(buf, SPA_MAXBLOCKSIZE);
1107			return;
1108		}
1109
1110		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
1111			break;
1112
1113		off -= resid;
1114	} while (len != 0);
1115	umem_free(buf, SPA_MAXBLOCKSIZE);
1116
1117	(void) printf("\nHistory:\n");
1118	for (int i = 0; i < num; i++) {
1119		uint64_t time, txg, ievent;
1120		char *cmd, *intstr;
1121		boolean_t printed = B_FALSE;
1122
1123		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
1124		    &time) != 0)
1125			goto next;
1126		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
1127		    &cmd) != 0) {
1128			if (nvlist_lookup_uint64(events[i],
1129			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
1130				goto next;
1131			verify(nvlist_lookup_uint64(events[i],
1132			    ZPOOL_HIST_TXG, &txg) == 0);
1133			verify(nvlist_lookup_string(events[i],
1134			    ZPOOL_HIST_INT_STR, &intstr) == 0);
1135			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
1136				goto next;
1137
1138			(void) snprintf(internalstr,
1139			    sizeof (internalstr),
1140			    "[internal %s txg:%lld] %s",
1141			    zfs_history_event_names[ievent], txg,
1142			    intstr);
1143			cmd = internalstr;
1144		}
1145		tsec = time;
1146		(void) localtime_r(&tsec, &t);
1147		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
1148		(void) printf("%s %s\n", tbuf, cmd);
1149		printed = B_TRUE;
1150
1151next:
1152		if (dump_opt['h'] > 1) {
1153			if (!printed)
1154				(void) printf("unrecognized record:\n");
1155			dump_nvlist(events[i], 2);
1156		}
1157	}
1158}
1159
/*
 * Object viewer with an empty body: nothing is printed for objects
 * routed to it.  It exists so that it can be placed in a viewer table
 * slot like any other viewer.
 */
/*ARGSUSED*/
static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
{
}
1165
1166static uint64_t
1167blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
1168    const zbookmark_phys_t *zb)
1169{
1170	if (dnp == NULL) {
1171		ASSERT(zb->zb_level < 0);
1172		if (zb->zb_object == 0)
1173			return (zb->zb_blkid);
1174		return (zb->zb_blkid * BP_GET_LSIZE(bp));
1175	}
1176
1177	ASSERT(zb->zb_level >= 0);
1178
1179	return ((zb->zb_blkid <<
1180	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
1181	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
1182}
1183
1184static void
1185snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
1186{
1187	const dva_t *dva = bp->blk_dva;
1188	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
1189
1190	if (dump_opt['b'] >= 6) {
1191		snprintf_blkptr(blkbuf, buflen, bp);
1192		return;
1193	}
1194
1195	if (BP_IS_EMBEDDED(bp)) {
1196		(void) sprintf(blkbuf,
1197		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
1198		    (int)BPE_GET_ETYPE(bp),
1199		    (u_longlong_t)BPE_GET_LSIZE(bp),
1200		    (u_longlong_t)BPE_GET_PSIZE(bp),
1201		    (u_longlong_t)bp->blk_birth);
1202		return;
1203	}
1204
1205	blkbuf[0] = '\0';
1206	for (int i = 0; i < ndvas; i++)
1207		(void) snprintf(blkbuf + strlen(blkbuf),
1208		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
1209		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
1210		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
1211		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
1212
1213	if (BP_IS_HOLE(bp)) {
1214		(void) snprintf(blkbuf + strlen(blkbuf),
1215		    buflen - strlen(blkbuf),
1216		    "%llxL B=%llu",
1217		    (u_longlong_t)BP_GET_LSIZE(bp),
1218		    (u_longlong_t)bp->blk_birth);
1219	} else {
1220		(void) snprintf(blkbuf + strlen(blkbuf),
1221		    buflen - strlen(blkbuf),
1222		    "%llxL/%llxP F=%llu B=%llu/%llu",
1223		    (u_longlong_t)BP_GET_LSIZE(bp),
1224		    (u_longlong_t)BP_GET_PSIZE(bp),
1225		    (u_longlong_t)BP_GET_FILL(bp),
1226		    (u_longlong_t)bp->blk_birth,
1227		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
1228	}
1229}
1230
1231static void
1232print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
1233    const dnode_phys_t *dnp)
1234{
1235	char blkbuf[BP_SPRINTF_LEN];
1236	int l;
1237
1238	if (!BP_IS_EMBEDDED(bp)) {
1239		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
1240		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
1241	}
1242
1243	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
1244
1245	ASSERT(zb->zb_level >= 0);
1246
1247	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
1248		if (l == zb->zb_level) {
1249			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
1250		} else {
1251			(void) printf(" ");
1252		}
1253	}
1254
1255	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
1256	(void) printf("%s\n", blkbuf);
1257}
1258
1259static int
1260visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
1261    blkptr_t *bp, const zbookmark_phys_t *zb)
1262{
1263	int err = 0;
1264
1265	if (bp->blk_birth == 0)
1266		return (0);
1267
1268	print_indirect(bp, zb, dnp);
1269
1270	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
1271		arc_flags_t flags = ARC_FLAG_WAIT;
1272		int i;
1273		blkptr_t *cbp;
1274		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
1275		arc_buf_t *buf;
1276		uint64_t fill = 0;
1277
1278		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
1279		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
1280		if (err)
1281			return (err);
1282		ASSERT(buf->b_data);
1283
1284		/* recursively visit blocks below this */
1285		cbp = buf->b_data;
1286		for (i = 0; i < epb; i++, cbp++) {
1287			zbookmark_phys_t czb;
1288
1289			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
1290			    zb->zb_level - 1,
1291			    zb->zb_blkid * epb + i);
1292			err = visit_indirect(spa, dnp, cbp, &czb);
1293			if (err)
1294				break;
1295			fill += BP_GET_FILL(cbp);
1296		}
1297		if (!err)
1298			ASSERT3U(fill, ==, BP_GET_FILL(bp));
1299		arc_buf_destroy(buf, &buf);
1300	}
1301
1302	return (err);
1303}
1304
1305/*ARGSUSED*/
1306static void
1307dump_indirect(dnode_t *dn)
1308{
1309	dnode_phys_t *dnp = dn->dn_phys;
1310	int j;
1311	zbookmark_phys_t czb;
1312
1313	(void) printf("Indirect blocks:\n");
1314
1315	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
1316	    dn->dn_object, dnp->dn_nlevels - 1, 0);
1317	for (j = 0; j < dnp->dn_nblkptr; j++) {
1318		czb.zb_blkid = j;
1319		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
1320		    &dnp->dn_blkptr[j], &czb);
1321	}
1322
1323	(void) printf("\n");
1324}
1325
1326/*ARGSUSED*/
1327static void
1328dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
1329{
1330	dsl_dir_phys_t *dd = data;
1331	time_t crtime;
1332	char nice[32];
1333
1334	if (dd == NULL)
1335		return;
1336
1337	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
1338
1339	crtime = dd->dd_creation_time;
1340	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
1341	(void) printf("\t\thead_dataset_obj = %llu\n",
1342	    (u_longlong_t)dd->dd_head_dataset_obj);
1343	(void) printf("\t\tparent_dir_obj = %llu\n",
1344	    (u_longlong_t)dd->dd_parent_obj);
1345	(void) printf("\t\torigin_obj = %llu\n",
1346	    (u_longlong_t)dd->dd_origin_obj);
1347	(void) printf("\t\tchild_dir_zapobj = %llu\n",
1348	    (u_longlong_t)dd->dd_child_dir_zapobj);
1349	zdb_nicenum(dd->dd_used_bytes, nice);
1350	(void) printf("\t\tused_bytes = %s\n", nice);
1351	zdb_nicenum(dd->dd_compressed_bytes, nice);
1352	(void) printf("\t\tcompressed_bytes = %s\n", nice);
1353	zdb_nicenum(dd->dd_uncompressed_bytes, nice);
1354	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
1355	zdb_nicenum(dd->dd_quota, nice);
1356	(void) printf("\t\tquota = %s\n", nice);
1357	zdb_nicenum(dd->dd_reserved, nice);
1358	(void) printf("\t\treserved = %s\n", nice);
1359	(void) printf("\t\tprops_zapobj = %llu\n",
1360	    (u_longlong_t)dd->dd_props_zapobj);
1361	(void) printf("\t\tdeleg_zapobj = %llu\n",
1362	    (u_longlong_t)dd->dd_deleg_zapobj);
1363	(void) printf("\t\tflags = %llx\n",
1364	    (u_longlong_t)dd->dd_flags);
1365
1366#define	DO(which) \
1367	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
1368	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
1369	DO(HEAD);
1370	DO(SNAP);
1371	DO(CHILD);
1372	DO(CHILD_RSRV);
1373	DO(REFRSRV);
1374#undef DO
1375}
1376
1377/*ARGSUSED*/
1378static void
1379dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
1380{
1381	dsl_dataset_phys_t *ds = data;
1382	time_t crtime;
1383	char used[32], compressed[32], uncompressed[32], unique[32];
1384	char blkbuf[BP_SPRINTF_LEN];
1385
1386	if (ds == NULL)
1387		return;
1388
1389	ASSERT(size == sizeof (*ds));
1390	crtime = ds->ds_creation_time;
1391	zdb_nicenum(ds->ds_referenced_bytes, used);
1392	zdb_nicenum(ds->ds_compressed_bytes, compressed);
1393	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
1394	zdb_nicenum(ds->ds_unique_bytes, unique);
1395	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
1396
1397	(void) printf("\t\tdir_obj = %llu\n",
1398	    (u_longlong_t)ds->ds_dir_obj);
1399	(void) printf("\t\tprev_snap_obj = %llu\n",
1400	    (u_longlong_t)ds->ds_prev_snap_obj);
1401	(void) printf("\t\tprev_snap_txg = %llu\n",
1402	    (u_longlong_t)ds->ds_prev_snap_txg);
1403	(void) printf("\t\tnext_snap_obj = %llu\n",
1404	    (u_longlong_t)ds->ds_next_snap_obj);
1405	(void) printf("\t\tsnapnames_zapobj = %llu\n",
1406	    (u_longlong_t)ds->ds_snapnames_zapobj);
1407	(void) printf("\t\tnum_children = %llu\n",
1408	    (u_longlong_t)ds->ds_num_children);
1409	(void) printf("\t\tuserrefs_obj = %llu\n",
1410	    (u_longlong_t)ds->ds_userrefs_obj);
1411	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
1412	(void) printf("\t\tcreation_txg = %llu\n",
1413	    (u_longlong_t)ds->ds_creation_txg);
1414	(void) printf("\t\tdeadlist_obj = %llu\n",
1415	    (u_longlong_t)ds->ds_deadlist_obj);
1416	(void) printf("\t\tused_bytes = %s\n", used);
1417	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
1418	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
1419	(void) printf("\t\tunique = %s\n", unique);
1420	(void) printf("\t\tfsid_guid = %llu\n",
1421	    (u_longlong_t)ds->ds_fsid_guid);
1422	(void) printf("\t\tguid = %llu\n",
1423	    (u_longlong_t)ds->ds_guid);
1424	(void) printf("\t\tflags = %llx\n",
1425	    (u_longlong_t)ds->ds_flags);
1426	(void) printf("\t\tnext_clones_obj = %llu\n",
1427	    (u_longlong_t)ds->ds_next_clones_obj);
1428	(void) printf("\t\tprops_obj = %llu\n",
1429	    (u_longlong_t)ds->ds_props_obj);
1430	(void) printf("\t\tbp = %s\n", blkbuf);
1431}
1432
1433/* ARGSUSED */
1434static int
1435dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1436{
1437	char blkbuf[BP_SPRINTF_LEN];
1438
1439	if (bp->blk_birth != 0) {
1440		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
1441		(void) printf("\t%s\n", blkbuf);
1442	}
1443	return (0);
1444}
1445
1446static void
1447dump_bptree(objset_t *os, uint64_t obj, char *name)
1448{
1449	char bytes[32];
1450	bptree_phys_t *bt;
1451	dmu_buf_t *db;
1452
1453	if (dump_opt['d'] < 3)
1454		return;
1455
1456	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
1457	bt = db->db_data;
1458	zdb_nicenum(bt->bt_bytes, bytes);
1459	(void) printf("\n    %s: %llu datasets, %s\n",
1460	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
1461	dmu_buf_rele(db, FTAG);
1462
1463	if (dump_opt['d'] < 5)
1464		return;
1465
1466	(void) printf("\n");
1467
1468	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
1469}
1470
1471/* ARGSUSED */
1472static int
1473dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1474{
1475	char blkbuf[BP_SPRINTF_LEN];
1476
1477	ASSERT(bp->blk_birth != 0);
1478	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
1479	(void) printf("\t%s\n", blkbuf);
1480	return (0);
1481}
1482
1483static void
1484dump_full_bpobj(bpobj_t *bpo, char *name, int indent)
1485{
1486	char bytes[32];
1487	char comp[32];
1488	char uncomp[32];
1489
1490	if (dump_opt['d'] < 3)
1491		return;
1492
1493	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
1494	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
1495		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
1496		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
1497		(void) printf("    %*s: object %llu, %llu local blkptrs, "
1498		    "%llu subobjs in object %llu, %s (%s/%s comp)\n",
1499		    indent * 8, name,
1500		    (u_longlong_t)bpo->bpo_object,
1501		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1502		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
1503		    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
1504		    bytes, comp, uncomp);
1505
1506		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
1507			uint64_t subobj;
1508			bpobj_t subbpo;
1509			int error;
1510			VERIFY0(dmu_read(bpo->bpo_os,
1511			    bpo->bpo_phys->bpo_subobjs,
1512			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
1513			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
1514			if (error != 0) {
1515				(void) printf("ERROR %u while trying to open "
1516				    "subobj id %llu\n",
1517				    error, (u_longlong_t)subobj);
1518				continue;
1519			}
1520			dump_full_bpobj(&subbpo, "subobj", indent + 1);
1521			bpobj_close(&subbpo);
1522		}
1523	} else {
1524		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
1525		    indent * 8, name,
1526		    (u_longlong_t)bpo->bpo_object,
1527		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1528		    bytes);
1529	}
1530
1531	if (dump_opt['d'] < 5)
1532		return;
1533
1534
1535	if (indent == 0) {
1536		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
1537		(void) printf("\n");
1538	}
1539}
1540
1541static void
1542dump_deadlist(dsl_deadlist_t *dl)
1543{
1544	dsl_deadlist_entry_t *dle;
1545	uint64_t unused;
1546	char bytes[32];
1547	char comp[32];
1548	char uncomp[32];
1549
1550	if (dump_opt['d'] < 3)
1551		return;
1552
1553	if (dl->dl_oldfmt) {
1554		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
1555		return;
1556	}
1557
1558	zdb_nicenum(dl->dl_phys->dl_used, bytes);
1559	zdb_nicenum(dl->dl_phys->dl_comp, comp);
1560	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
1561	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
1562	    bytes, comp, uncomp);
1563
1564	if (dump_opt['d'] < 4)
1565		return;
1566
1567	(void) printf("\n");
1568
1569	/* force the tree to be loaded */
1570	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
1571
1572	for (dle = avl_first(&dl->dl_tree); dle;
1573	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
1574		if (dump_opt['d'] >= 5) {
1575			char buf[128];
1576			(void) snprintf(buf, sizeof (buf),
1577			    "mintxg %llu -> obj %llu",
1578			    (longlong_t)dle->dle_mintxg,
1579			    (longlong_t)dle->dle_bpobj.bpo_object);
1580
1581			dump_full_bpobj(&dle->dle_bpobj, buf, 0);
1582		} else {
1583			(void) printf("mintxg %llu -> obj %llu\n",
1584			    (longlong_t)dle->dle_mintxg,
1585			    (longlong_t)dle->dle_bpobj.bpo_object);
1586
1587		}
1588	}
1589}
1590
/* FUID lookup trees; created and loaded by dump_uidgid() on first use */
static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
/* B_TRUE while the two FUID trees above are loaded */
static boolean_t fuid_table_loaded;
/* objset currently opened through open_objset(), or NULL if none */
static objset_t *sa_os = NULL;
/* SA attribute table set up by open_objset() for sa_os */
static sa_attr_type_t *sa_attr_table = NULL;
1596
1597static int
1598open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp)
1599{
1600	int err;
1601	uint64_t sa_attrs = 0;
1602	uint64_t version = 0;
1603
1604	VERIFY3P(sa_os, ==, NULL);
1605	err = dmu_objset_own(path, type, B_TRUE, tag, osp);
1606	if (err != 0) {
1607		(void) fprintf(stderr, "failed to own dataset '%s': %s\n", path,
1608		    strerror(err));
1609		return (err);
1610	}
1611
1612	if (dmu_objset_type(*osp) == DMU_OST_ZFS) {
1613		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1614		    8, 1, &version);
1615		if (version >= ZPL_VERSION_SA) {
1616			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
1617			    8, 1, &sa_attrs);
1618		}
1619		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
1620		    &sa_attr_table);
1621		if (err != 0) {
1622			(void) fprintf(stderr, "sa_setup failed: %s\n",
1623			    strerror(err));
1624			dmu_objset_disown(*osp, tag);
1625			*osp = NULL;
1626		}
1627	}
1628	sa_os = *osp;
1629
1630	return (0);
1631}
1632
1633static void
1634close_objset(objset_t *os, void *tag)
1635{
1636	VERIFY3P(os, ==, sa_os);
1637	if (os->os_sa != NULL)
1638		sa_tear_down(os);
1639	dmu_objset_disown(os, tag);
1640	sa_attr_table = NULL;
1641	sa_os = NULL;
1642}
1643
1644static void
1645fuid_table_destroy()
1646{
1647	if (fuid_table_loaded) {
1648		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
1649		fuid_table_loaded = B_FALSE;
1650	}
1651}
1652
1653/*
1654 * print uid or gid information.
1655 * For normal POSIX id just the id is printed in decimal format.
1656 * For CIFS files with FUID the fuid is printed in hex followed by
1657 * the domain-rid string.
1658 */
1659static void
1660print_idstr(uint64_t id, const char *id_type)
1661{
1662	if (FUID_INDEX(id)) {
1663		char *domain;
1664
1665		domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
1666		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
1667		    (u_longlong_t)id, domain, (int)FUID_RID(id));
1668	} else {
1669		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
1670	}
1671
1672}
1673
1674static void
1675dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
1676{
1677	uint32_t uid_idx, gid_idx;
1678
1679	uid_idx = FUID_INDEX(uid);
1680	gid_idx = FUID_INDEX(gid);
1681
1682	/* Load domain table, if not already loaded */
1683	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
1684		uint64_t fuid_obj;
1685
1686		/* first find the fuid object.  It lives in the master node */
1687		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
1688		    8, 1, &fuid_obj) == 0);
1689		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
1690		(void) zfs_fuid_table_load(os, fuid_obj,
1691		    &idx_tree, &domain_tree);
1692		fuid_table_loaded = B_TRUE;
1693	}
1694
1695	print_idstr(uid, "uid");
1696	print_idstr(gid, "gid");
1697}
1698
/*
 * Object viewer for znodes: look up the file's system attributes through
 * the SA layer and print them, one per line.
 *
 * 'os' must be the objset opened via open_objset() (it is checked
 * against sa_os), since the attribute ids come from sa_attr_table.
 * If the SA handle cannot be obtained a message is printed; if the bulk
 * lookup fails the function returns without printing anything.
 * The path is only resolved when dump_opt['d'] > 4.
 */
/*ARGSUSED*/
static void
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
{
	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
	sa_handle_t *hdl;
	uint64_t xattr, rdev, gen;
	uint64_t uid, gid, mode, fsize, parent, links;
	uint64_t pflags;
	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
	time_t z_crtime, z_atime, z_mtime, z_ctime;
	sa_bulk_attr_t bulk[12];
	int idx = 0;
	int error;

	VERIFY3P(os, ==, sa_os);
	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
		(void) printf("Failed to get handle for SA znode\n");
		return;
	}

	/*
	 * Queue the twelve attributes fetched in one bulk lookup; 'idx'
	 * counts the queued entries and must not exceed the size of bulk[].
	 */
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
	    &links, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
	    &mode, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
	    NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
	    &fsize, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
	    acctm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
	    modtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
	    crtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
	    chgtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
	    &pflags, 8);

	if (sa_bulk_lookup(hdl, bulk, idx)) {
		(void) sa_handle_destroy(hdl);
		return;
	}

	/* only the first word of each two-word timestamp is printed */
	z_crtime = (time_t)crtm[0];
	z_atime = (time_t)acctm[0];
	z_mtime = (time_t)modtm[0];
	z_ctime = (time_t)chgtm[0];

	if (dump_opt['d'] > 4) {
		error = zfs_obj_to_path(os, object, path, sizeof (path));
		if (error != 0) {
			/* fall back to a placeholder naming the object */
			(void) snprintf(path, sizeof (path),
			    "\?\?\?<object#%llu>", (u_longlong_t)object);
		}
		(void) printf("\tpath	%s\n", path);
	}
	dump_uidgid(os, uid, gid);
	/* ctime() output already ends in a newline */
	(void) printf("\tatime	%s", ctime(&z_atime));
	(void) printf("\tmtime	%s", ctime(&z_mtime));
	(void) printf("\tctime	%s", ctime(&z_ctime));
	(void) printf("\tcrtime	%s", ctime(&z_crtime));
	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
	/* xattr and rdev are printed only when their lookup succeeds */
	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
	    sizeof (uint64_t)) == 0)
		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
	    sizeof (uint64_t)) == 0)
		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
	sa_handle_destroy(hdl);
}
1779
/*
 * Object viewer with an empty body: nothing is printed for objects
 * routed to it.
 */
/*ARGSUSED*/
static void
dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
{
}
1785
/*
 * Object viewer with an empty body: nothing is printed for objects
 * routed to it.
 */
/*ARGSUSED*/
static void
dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
{
}
1791
/*
 * Viewer dispatch table, indexed by ZDB_OT_TYPE(type): each slot holds
 * the function that prints objects (or bonus buffers) of that type.
 * The table has DMU_OT_NUMTYPES + 1 slots; the extra, final slot holds
 * dump_unknown.  The order of the entries is significant -- each
 * entry's position must match the type named in its comment.
 */
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
	dump_none,		/* unallocated			*/
	dump_zap,		/* object directory		*/
	dump_uint64,		/* object array			*/
	dump_none,		/* packed nvlist		*/
	dump_packed_nvlist,	/* packed nvlist size		*/
	dump_none,		/* bpobj			*/
	dump_bpobj,		/* bpobj header			*/
	dump_none,		/* SPA space map header		*/
	dump_none,		/* SPA space map		*/
	dump_none,		/* ZIL intent log		*/
	dump_dnode,		/* DMU dnode			*/
	dump_dmu_objset,	/* DMU objset			*/
	dump_dsl_dir,		/* DSL directory		*/
	dump_zap,		/* DSL directory child map	*/
	dump_zap,		/* DSL dataset snap map		*/
	dump_zap,		/* DSL props			*/
	dump_dsl_dataset,	/* DSL dataset			*/
	dump_znode,		/* ZFS znode			*/
	dump_acl,		/* ZFS V0 ACL			*/
	dump_uint8,		/* ZFS plain file		*/
	dump_zpldir,		/* ZFS directory		*/
	dump_zap,		/* ZFS master node		*/
	dump_zap,		/* ZFS delete queue		*/
	dump_uint8,		/* zvol object			*/
	dump_zap,		/* zvol prop			*/
	dump_uint8,		/* other uint8[]		*/
	dump_uint64,		/* other uint64[]		*/
	dump_zap,		/* other ZAP			*/
	dump_zap,		/* persistent error log		*/
	dump_uint8,		/* SPA history			*/
	dump_history_offsets,	/* SPA history offsets		*/
	dump_zap,		/* Pool properties		*/
	dump_zap,		/* DSL permissions		*/
	dump_acl,		/* ZFS ACL			*/
	dump_uint8,		/* ZFS SYSACL			*/
	dump_none,		/* FUID nvlist			*/
	dump_packed_nvlist,	/* FUID nvlist size		*/
	dump_zap,		/* DSL dataset next clones	*/
	dump_zap,		/* DSL scrub queue		*/
	dump_zap,		/* ZFS user/group used		*/
	dump_zap,		/* ZFS user/group quota		*/
	dump_zap,		/* snapshot refcount tags	*/
	dump_ddt_zap,		/* DDT ZAP object		*/
	dump_zap,		/* DDT statistics		*/
	dump_znode,		/* SA object			*/
	dump_zap,		/* SA Master Node		*/
	dump_sa_attrs,		/* SA attribute registration	*/
	dump_sa_layouts,	/* SA attribute layouts		*/
	dump_zap,		/* DSL scrub translations	*/
	dump_none,		/* fake dedup BP		*/
	dump_zap,		/* deadlist			*/
	dump_none,		/* deadlist hdr			*/
	dump_zap,		/* dsl clones			*/
	dump_bpobj_subobjs,	/* bpobj subobjs		*/
	dump_unknown,		/* Unknown type, must be last	*/
};
1849
1850static void
1851dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
1852{
1853	dmu_buf_t *db = NULL;
1854	dmu_object_info_t doi;
1855	dnode_t *dn;
1856	void *bonus = NULL;
1857	size_t bsize = 0;
1858	char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
1859	char bonus_size[32];
1860	char aux[50];
1861	int error;
1862
1863	if (*print_header) {
1864		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
1865		    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
1866		    "%full", "type");
1867		*print_header = 0;
1868	}
1869
1870	if (object == 0) {
1871		dn = DMU_META_DNODE(os);
1872	} else {
1873		error = dmu_bonus_hold(os, object, FTAG, &db);
1874		if (error)
1875			fatal("dmu_bonus_hold(%llu) failed, errno %u",
1876			    object, error);
1877		bonus = db->db_data;
1878		bsize = db->db_size;
1879		dn = DB_DNODE((dmu_buf_impl_t *)db);
1880	}
1881	dmu_object_info_from_dnode(dn, &doi);
1882
1883	zdb_nicenum(doi.doi_metadata_block_size, iblk);
1884	zdb_nicenum(doi.doi_data_block_size, dblk);
1885	zdb_nicenum(doi.doi_max_offset, lsize);
1886	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
1887	zdb_nicenum(doi.doi_bonus_size, bonus_size);
1888	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
1889	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
1890	    doi.doi_max_offset);
1891
1892	aux[0] = '\0';
1893
1894	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
1895		(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
1896		    ZDB_CHECKSUM_NAME(doi.doi_checksum));
1897	}
1898
1899	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
1900		(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
1901		    ZDB_COMPRESS_NAME(doi.doi_compress));
1902	}
1903
1904	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
1905	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
1906	    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
1907
1908	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
1909		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
1910		    "", "", "", "", "", bonus_size, "bonus",
1911		    ZDB_OT_NAME(doi.doi_bonus_type));
1912	}
1913
1914	if (verbosity >= 4) {
1915		(void) printf("\tdnode flags: %s%s%s\n",
1916		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
1917		    "USED_BYTES " : "",
1918		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
1919		    "USERUSED_ACCOUNTED " : "",
1920		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
1921		    "SPILL_BLKPTR" : "");
1922		(void) printf("\tdnode maxblkid: %llu\n",
1923		    (longlong_t)dn->dn_phys->dn_maxblkid);
1924
1925		object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
1926		    bonus, bsize);
1927		object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
1928		*print_header = 1;
1929	}
1930
1931	if (verbosity >= 5)
1932		dump_indirect(dn);
1933
1934	if (verbosity >= 5) {
1935		/*
1936		 * Report the list of segments that comprise the object.
1937		 */
1938		uint64_t start = 0;
1939		uint64_t end;
1940		uint64_t blkfill = 1;
1941		int minlvl = 1;
1942
1943		if (dn->dn_type == DMU_OT_DNODE) {
1944			minlvl = 0;
1945			blkfill = DNODES_PER_BLOCK;
1946		}
1947
1948		for (;;) {
1949			char segsize[32];
1950			error = dnode_next_offset(dn,
1951			    0, &start, minlvl, blkfill, 0);
1952			if (error)
1953				break;
1954			end = start;
1955			error = dnode_next_offset(dn,
1956			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
1957			zdb_nicenum(end - start, segsize);
1958			(void) printf("\t\tsegment [%016llx, %016llx)"
1959			    " size %5s\n", (u_longlong_t)start,
1960			    (u_longlong_t)end, segsize);
1961			if (error)
1962				break;
1963			start = end;
1964		}
1965	}
1966
1967	if (db != NULL)
1968		dmu_buf_rele(db, FTAG);
1969}
1970
/* Printable names of the objset types, indexed by the objset type value */
static char *objset_types[DMU_OST_NUMTYPES] = {
	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
1973
/*
 * Print a summary line for objset 'os' and, depending on the options,
 * its contents:
 * - if specific objects were requested (zopt_objects != 0), only those
 *   objects are dumped;
 * - otherwise the intent log (with -i, or verbosity >= 2) and the
 *   deadlist are shown, and at verbosity >= 2 every object in the
 *   objset is dumped.
 * The verbosity is dump_opt['d'].  An unexpected error from the object
 * iteration aborts the program.
 */
static void
dump_dir(objset_t *os)
{
	dmu_objset_stats_t dds;
	uint64_t object, object_count;
	uint64_t refdbytes, usedobjs, scratch;
	char numbuf[32];
	char blkbuf[BP_SPRINTF_LEN + 20];
	char osname[ZFS_MAX_DATASET_NAME_LEN];
	char *type = "UNKNOWN";
	int verbosity = dump_opt['d'];
	int print_header = 1;
	int i, error;

	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	dmu_objset_fast_stat(os, &dds);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

	/* types outside the name table keep the "UNKNOWN" label */
	if (dds.dds_type < DMU_OST_NUMTYPES)
		type = objset_types[dds.dds_type];

	if (dds.dds_type == DMU_OST_META) {
		/* the meta objset takes its numbers from the pool itself */
		dds.dds_creation_txg = TXG_INITIAL;
		usedobjs = BP_GET_FILL(os->os_rootbp);
		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
		    dd_used_bytes;
	} else {
		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
	}

	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));

	zdb_nicenum(refdbytes, numbuf);

	/* the root block pointer is appended to the summary at -dddd */
	if (verbosity >= 4) {
		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
	} else {
		blkbuf[0] = '\0';
	}

	dmu_objset_name(os, osname);

	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
	    "%s, %llu objects%s\n",
	    osname, type, (u_longlong_t)dmu_objset_id(os),
	    (u_longlong_t)dds.dds_creation_txg,
	    numbuf, (u_longlong_t)usedobjs, blkbuf);

	/* explicitly requested objects: dump just those and stop */
	if (zopt_objects != 0) {
		for (i = 0; i < zopt_objects; i++)
			dump_object(os, zopt_object[i], verbosity,
			    &print_header);
		(void) printf("\n");
		return;
	}

	if (dump_opt['i'] != 0 || verbosity >= 2)
		dump_intent_log(dmu_objset_zil(os));

	if (dmu_objset_ds(os) != NULL)
		dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);

	if (verbosity < 2)
		return;

	if (BP_IS_HOLE(os->os_rootbp))
		return;

	/* meta dnode first, then the user/group accounting objects, if any */
	dump_object(os, 0, verbosity, &print_header);
	object_count = 0;
	if (DMU_USERUSED_DNODE(os) != NULL &&
	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
	}

	/* walk every allocated object; only these count toward usedobjs */
	object = 0;
	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
		dump_object(os, object, verbosity, &print_header);
		object_count++;
	}

	ASSERT3U(object_count, ==, usedobjs);

	(void) printf("\n");

	/* ESRCH is the expected way for the iteration to end */
	if (error != ESRCH) {
		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
		abort();
	}
}
2067
2068static void
2069dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
2070{
2071	time_t timestamp = ub->ub_timestamp;
2072
2073	(void) printf(header ? header : "");
2074	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
2075	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
2076	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
2077	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
2078	(void) printf("\ttimestamp = %llu UTC = %s",
2079	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
2080	if (dump_opt['u'] >= 3) {
2081		char blkbuf[BP_SPRINTF_LEN];
2082		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
2083		(void) printf("\trootbp = %s\n", blkbuf);
2084	}
2085	(void) printf(footer ? footer : "");
2086}
2087
2088static void
2089dump_config(spa_t *spa)
2090{
2091	dmu_buf_t *db;
2092	size_t nvsize = 0;
2093	int error = 0;
2094
2095
2096	error = dmu_bonus_hold(spa->spa_meta_objset,
2097	    spa->spa_config_object, FTAG, &db);
2098
2099	if (error == 0) {
2100		nvsize = *(uint64_t *)db->db_data;
2101		dmu_buf_rele(db, FTAG);
2102
2103		(void) printf("\nMOS Configuration:\n");
2104		dump_packed_nvlist(spa->spa_meta_objset,
2105		    spa->spa_config_object, (void *)&nvsize, 1);
2106	} else {
2107		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
2108		    (u_longlong_t)spa->spa_config_object, error);
2109	}
2110}
2111
2112static void
2113dump_cachefile(const char *cachefile)
2114{
2115	int fd;
2116	struct stat64 statbuf;
2117	char *buf;
2118	nvlist_t *config;
2119
2120	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
2121		(void) printf("cannot open '%s': %s\n", cachefile,
2122		    strerror(errno));
2123		exit(1);
2124	}
2125
2126	if (fstat64(fd, &statbuf) != 0) {
2127		(void) printf("failed to stat '%s': %s\n", cachefile,
2128		    strerror(errno));
2129		exit(1);
2130	}
2131
2132	if ((buf = malloc(statbuf.st_size)) == NULL) {
2133		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
2134		    (u_longlong_t)statbuf.st_size);
2135		exit(1);
2136	}
2137
2138	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
2139		(void) fprintf(stderr, "failed to read %llu bytes\n",
2140		    (u_longlong_t)statbuf.st_size);
2141		exit(1);
2142	}
2143
2144	(void) close(fd);
2145
2146	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
2147		(void) fprintf(stderr, "failed to unpack nvlist\n");
2148		exit(1);
2149	}
2150
2151	free(buf);
2152
2153	dump_nvlist(config, 0);
2154
2155	nvlist_free(config);
2156}
2157
2158#define	ZDB_MAX_UB_HEADER_SIZE 32
2159
2160static void
2161dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
2162{
2163	vdev_t vd;
2164	vdev_t *vdp = &vd;
2165	char header[ZDB_MAX_UB_HEADER_SIZE];
2166
2167	vd.vdev_ashift = ashift;
2168	vdp->vdev_top = vdp;
2169
2170	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
2171		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
2172		uberblock_t *ub = (void *)((char *)lbl + uoff);
2173
2174		if (uberblock_verify(ub))
2175			continue;
2176		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
2177		    "Uberblock[%d]\n", i);
2178		dump_uberblock(ub, header, "");
2179	}
2180}
2181
2182static char curpath[PATH_MAX];
2183
2184/*
2185 * Iterate through the path components, recursively passing
2186 * current one's obj and remaining path until we find the obj
2187 * for the last one.
2188 */
2189static int
2190dump_path_impl(objset_t *os, uint64_t obj, char *name)
2191{
2192	int err;
2193	int header = 1;
2194	uint64_t child_obj;
2195	char *s;
2196	dmu_buf_t *db;
2197	dmu_object_info_t doi;
2198
2199	if ((s = strchr(name, '/')) != NULL)
2200		*s = '\0';
2201	err = zap_lookup(os, obj, name, 8, 1, &child_obj);
2202
2203	(void) strlcat(curpath, name, sizeof (curpath));
2204
2205	if (err != 0) {
2206		(void) fprintf(stderr, "failed to lookup %s: %s\n",
2207		    curpath, strerror(err));
2208		return (err);
2209	}
2210
2211	child_obj = ZFS_DIRENT_OBJ(child_obj);
2212	err = sa_buf_hold(os, child_obj, FTAG, &db);
2213	if (err != 0) {
2214		(void) fprintf(stderr,
2215		    "failed to get SA dbuf for obj %llu: %s\n",
2216		    (u_longlong_t)child_obj, strerror(err));
2217		return (EINVAL);
2218	}
2219	dmu_object_info_from_db(db, &doi);
2220	sa_buf_rele(db, FTAG);
2221
2222	if (doi.doi_bonus_type != DMU_OT_SA &&
2223	    doi.doi_bonus_type != DMU_OT_ZNODE) {
2224		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
2225		    doi.doi_bonus_type, (u_longlong_t)child_obj);
2226		return (EINVAL);
2227	}
2228
2229	if (dump_opt['v'] > 6) {
2230		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
2231		    (u_longlong_t)child_obj, curpath, doi.doi_type,
2232		    doi.doi_bonus_type);
2233	}
2234
2235	(void) strlcat(curpath, "/", sizeof (curpath));
2236
2237	switch (doi.doi_type) {
2238	case DMU_OT_DIRECTORY_CONTENTS:
2239		if (s != NULL && *(s + 1) != '\0')
2240			return (dump_path_impl(os, child_obj, s + 1));
2241		/*FALLTHROUGH*/
2242	case DMU_OT_PLAIN_FILE_CONTENTS:
2243		dump_object(os, child_obj, dump_opt['v'], &header);
2244		return (0);
2245	default:
2246		(void) fprintf(stderr, "object %llu has non-file/directory "
2247		    "type %d\n", (u_longlong_t)obj, doi.doi_type);
2248		break;
2249	}
2250
2251	return (EINVAL);
2252}
2253
2254/*
2255 * Dump the blocks for the object specified by path inside the dataset.
2256 */
2257static int
2258dump_path(char *ds, char *path)
2259{
2260	int err;
2261	objset_t *os;
2262	uint64_t root_obj;
2263
2264	err = open_objset(ds, DMU_OST_ZFS, FTAG, &os);
2265	if (err != 0)
2266		return (err);
2267
2268	err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
2269	if (err != 0) {
2270		(void) fprintf(stderr, "can't lookup root znode: %s\n",
2271		    strerror(err));
2272		dmu_objset_disown(os, FTAG);
2273		return (EINVAL);
2274	}
2275
2276	(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
2277
2278	err = dump_path_impl(os, root_obj, path);
2279
2280	close_objset(os, FTAG);
2281	return (err);
2282}
2283
2284static int
2285dump_label(const char *dev)
2286{
2287	int fd;
2288	vdev_label_t label;
2289	char path[MAXPATHLEN];
2290	char *buf = label.vl_vdev_phys.vp_nvlist;
2291	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
2292	struct stat64 statbuf;
2293	uint64_t psize, ashift;
2294	boolean_t label_found = B_FALSE;
2295
2296	(void) strlcpy(path, dev, sizeof (path));
2297	if (dev[0] == '/') {
2298		if (strncmp(dev, ZFS_DISK_ROOTD,
2299		    strlen(ZFS_DISK_ROOTD)) == 0) {
2300			(void) snprintf(path, sizeof (path), "%s%s",
2301			    ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD));
2302		}
2303	} else if (stat64(path, &statbuf) != 0) {
2304		char *s;
2305
2306		(void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD,
2307		    dev);
2308		if ((s = strrchr(dev, 's')) == NULL || !isdigit(*(s + 1)))
2309			(void) strlcat(path, "s0", sizeof (path));
2310	}
2311
2312	if (stat64(path, &statbuf) != 0) {
2313		(void) printf("failed to stat '%s': %s\n", path,
2314		    strerror(errno));
2315		exit(1);
2316	}
2317
2318	if (S_ISBLK(statbuf.st_mode)) {
2319		(void) printf("cannot use '%s': character device required\n",
2320		    path);
2321		exit(1);
2322	}
2323
2324	if ((fd = open64(path, O_RDONLY)) < 0) {
2325		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
2326		exit(1);
2327	}
2328
2329	psize = statbuf.st_size;
2330	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
2331
2332	for (int l = 0; l < VDEV_LABELS; l++) {
2333		nvlist_t *config = NULL;
2334
2335		if (!dump_opt['q']) {
2336			(void) printf("------------------------------------\n");
2337			(void) printf("LABEL %d\n", l);
2338			(void) printf("------------------------------------\n");
2339		}
2340
2341		if (pread64(fd, &label, sizeof (label),
2342		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
2343			if (!dump_opt['q'])
2344				(void) printf("failed to read label %d\n", l);
2345			continue;
2346		}
2347
2348		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
2349			if (!dump_opt['q'])
2350				(void) printf("failed to unpack label %d\n", l);
2351			ashift = SPA_MINBLOCKSHIFT;
2352		} else {
2353			nvlist_t *vdev_tree = NULL;
2354
2355			if (!dump_opt['q'])
2356				dump_nvlist(config, 4);
2357			if ((nvlist_lookup_nvlist(config,
2358			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
2359			    (nvlist_lookup_uint64(vdev_tree,
2360			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
2361				ashift = SPA_MINBLOCKSHIFT;
2362			nvlist_free(config);
2363			label_found = B_TRUE;
2364		}
2365		if (dump_opt['u'])
2366			dump_label_uberblocks(&label, ashift);
2367	}
2368
2369	(void) close(fd);
2370
2371	return (label_found ? 0 : 2);
2372}
2373
2374static uint64_t dataset_feature_count[SPA_FEATURES];
2375
2376/*ARGSUSED*/
2377static int
2378dump_one_dir(const char *dsname, void *arg)
2379{
2380	int error;
2381	objset_t *os;
2382
2383	error = open_objset(dsname, DMU_OST_ANY, FTAG, &os);
2384	if (error != 0)
2385		return (0);
2386
2387	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
2388		if (!dmu_objset_ds(os)->ds_feature_inuse[f])
2389			continue;
2390		ASSERT(spa_feature_table[f].fi_flags &
2391		    ZFEATURE_FLAG_PER_DATASET);
2392		dataset_feature_count[f]++;
2393	}
2394
2395	dump_dir(os);
2396	close_objset(os, FTAG);
2397	fuid_table_destroy();
2398	return (0);
2399}
2400
/*
 * Block statistics.
 *
 * One zdb_blkstats_t accumulates totals for a (level, object type) cell
 * of zdb_cb_t's zcb_type table; zdb_count_block() updates it.
 */
/* number of buckets in zb_psize_histogram; the last one is the overflow */
#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
typedef struct zdb_blkstats {
	uint64_t zb_asize;		/* sum of BP_GET_ASIZE() */
	uint64_t zb_lsize;		/* sum of BP_GET_LSIZE() */
	uint64_t zb_psize;		/* sum of BP_GET_PSIZE() */
	uint64_t zb_count;		/* number of block pointers counted */
	uint64_t zb_gangs;		/* sum of BP_COUNT_GANG() */
	uint64_t zb_ditto_samevdev;	/* bps with 2+ DVAs on one vdev */
	/* indexed by psize >> SPA_MINBLOCKSHIFT, clamped to the last slot */
	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
} zdb_blkstats_t;
2414
/*
 * Extended object types to report deferred frees and dedup auto-ditto blocks.
 *
 * These extend the DMU object type space so they can index the same
 * statistics table; ZDB_OT_TOTAL is the all-types total.
 */
#define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
#define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
#define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)

/* display names for the ZDB_OT_* types, indexed by (type - DMU_OT_NUMTYPES) */
static char *zdb_ot_extname[] = {
	"deferred free",
	"dedup ditto",
	"other",
	"Total",
};
2429
/* extra level index holding the totals across all block levels */
#define	ZB_TOTAL	DN_MAX_LEVELS

/*
 * State shared by the block-traversal callbacks used by
 * dump_block_stats().
 */
typedef struct zdb_cb {
	/* per-(level, type) statistics, with a totals row and column */
	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
	/* asize of references beyond the first, summed over DDT entries */
	uint64_t	zcb_dedup_asize;
	/* number of non-ditto DDT phys entries visited */
	uint64_t	zcb_dedup_blocks;
	/* count of embedded bps, by embedded type */
	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
	/* embedded bps by embedded type and BPE_GET_PSIZE() */
	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
	    [BPE_PAYLOAD_SIZE];
	uint64_t	zcb_start;	/* gethrtime() when traversal began */
	uint64_t	zcb_lastprint;	/* gethrtime() of last progress line */
	uint64_t	zcb_totalasize;	/* normal-class allocated bytes */
	uint64_t	zcb_errors[256];	/* read errors, indexed by errno */
	int		zcb_readfails;	/* reset to 0 by zdb_blkptr_cb() */
	int		zcb_haderrors;	/* nonzero once any error is seen */
	spa_t		*zcb_spa;	/* pool being examined */
} zdb_cb_t;
2447
2448static void
2449zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
2450    dmu_object_type_t type)
2451{
2452	uint64_t refcnt = 0;
2453
2454	ASSERT(type < ZDB_OT_TOTAL);
2455
2456	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
2457		return;
2458
2459	for (int i = 0; i < 4; i++) {
2460		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
2461		int t = (i & 1) ? type : ZDB_OT_TOTAL;
2462		int equal;
2463		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
2464
2465		zb->zb_asize += BP_GET_ASIZE(bp);
2466		zb->zb_lsize += BP_GET_LSIZE(bp);
2467		zb->zb_psize += BP_GET_PSIZE(bp);
2468		zb->zb_count++;
2469
2470		/*
2471		 * The histogram is only big enough to record blocks up to
2472		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
2473		 * "other", bucket.
2474		 */
2475		int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
2476		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
2477		zb->zb_psize_histogram[idx]++;
2478
2479		zb->zb_gangs += BP_COUNT_GANG(bp);
2480
2481		switch (BP_GET_NDVAS(bp)) {
2482		case 2:
2483			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2484			    DVA_GET_VDEV(&bp->blk_dva[1]))
2485				zb->zb_ditto_samevdev++;
2486			break;
2487		case 3:
2488			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2489			    DVA_GET_VDEV(&bp->blk_dva[1])) +
2490			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2491			    DVA_GET_VDEV(&bp->blk_dva[2])) +
2492			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
2493			    DVA_GET_VDEV(&bp->blk_dva[2]));
2494			if (equal != 0)
2495				zb->zb_ditto_samevdev++;
2496			break;
2497		}
2498
2499	}
2500
2501	if (BP_IS_EMBEDDED(bp)) {
2502		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
2503		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
2504		    [BPE_GET_PSIZE(bp)]++;
2505		return;
2506	}
2507
2508	if (dump_opt['L'])
2509		return;
2510
2511	if (BP_GET_DEDUP(bp)) {
2512		ddt_t *ddt;
2513		ddt_entry_t *dde;
2514
2515		ddt = ddt_select(zcb->zcb_spa, bp);
2516		ddt_enter(ddt);
2517		dde = ddt_lookup(ddt, bp, B_FALSE);
2518
2519		if (dde == NULL) {
2520			refcnt = 0;
2521		} else {
2522			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
2523			ddt_phys_decref(ddp);
2524			refcnt = ddp->ddp_refcnt;
2525			if (ddt_phys_total_refcnt(dde) == 0)
2526				ddt_remove(ddt, dde);
2527		}
2528		ddt_exit(ddt);
2529	}
2530
2531	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
2532	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
2533	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
2534}
2535
2536static void
2537zdb_blkptr_done(zio_t *zio)
2538{
2539	spa_t *spa = zio->io_spa;
2540	blkptr_t *bp = zio->io_bp;
2541	int ioerr = zio->io_error;
2542	zdb_cb_t *zcb = zio->io_private;
2543	zbookmark_phys_t *zb = &zio->io_bookmark;
2544
2545	abd_free(zio->io_abd);
2546
2547	mutex_enter(&spa->spa_scrub_lock);
2548	spa->spa_scrub_inflight--;
2549	cv_broadcast(&spa->spa_scrub_io_cv);
2550
2551	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2552		char blkbuf[BP_SPRINTF_LEN];
2553
2554		zcb->zcb_haderrors = 1;
2555		zcb->zcb_errors[ioerr]++;
2556
2557		if (dump_opt['b'] >= 2)
2558			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2559		else
2560			blkbuf[0] = '\0';
2561
2562		(void) printf("zdb_blkptr_cb: "
2563		    "Got error %d reading "
2564		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
2565		    ioerr,
2566		    (u_longlong_t)zb->zb_objset,
2567		    (u_longlong_t)zb->zb_object,
2568		    (u_longlong_t)zb->zb_level,
2569		    (u_longlong_t)zb->zb_blkid,
2570		    blkbuf);
2571	}
2572	mutex_exit(&spa->spa_scrub_lock);
2573}
2574
2575static int
2576zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
2577    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
2578{
2579	zdb_cb_t *zcb = arg;
2580	dmu_object_type_t type;
2581	boolean_t is_metadata;
2582
2583	if (bp == NULL)
2584		return (0);
2585
2586	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
2587		char blkbuf[BP_SPRINTF_LEN];
2588		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2589		(void) printf("objset %llu object %llu "
2590		    "level %lld offset 0x%llx %s\n",
2591		    (u_longlong_t)zb->zb_objset,
2592		    (u_longlong_t)zb->zb_object,
2593		    (longlong_t)zb->zb_level,
2594		    (u_longlong_t)blkid2offset(dnp, bp, zb),
2595		    blkbuf);
2596	}
2597
2598	if (BP_IS_HOLE(bp))
2599		return (0);
2600
2601	type = BP_GET_TYPE(bp);
2602
2603	zdb_count_block(zcb, zilog, bp,
2604	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
2605
2606	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
2607
2608	if (!BP_IS_EMBEDDED(bp) &&
2609	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
2610		size_t size = BP_GET_PSIZE(bp);
2611		abd_t *abd = abd_alloc(size, B_FALSE);
2612		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
2613
2614		/* If it's an intent log block, failure is expected. */
2615		if (zb->zb_level == ZB_ZIL_LEVEL)
2616			flags |= ZIO_FLAG_SPECULATIVE;
2617
2618		mutex_enter(&spa->spa_scrub_lock);
2619		while (spa->spa_scrub_inflight > max_inflight)
2620			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2621		spa->spa_scrub_inflight++;
2622		mutex_exit(&spa->spa_scrub_lock);
2623
2624		zio_nowait(zio_read(NULL, spa, bp, abd, size,
2625		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
2626	}
2627
2628	zcb->zcb_readfails = 0;
2629
2630	/* only call gethrtime() every 100 blocks */
2631	static int iters;
2632	if (++iters > 100)
2633		iters = 0;
2634	else
2635		return (0);
2636
2637	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
2638		uint64_t now = gethrtime();
2639		char buf[10];
2640		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
2641		int kb_per_sec =
2642		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
2643		int sec_remaining =
2644		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
2645
2646		zfs_nicenum(bytes, buf, sizeof (buf));
2647		(void) fprintf(stderr,
2648		    "\r%5s completed (%4dMB/s) "
2649		    "estimated time remaining: %uhr %02umin %02usec        ",
2650		    buf, kb_per_sec / 1024,
2651		    sec_remaining / 60 / 60,
2652		    sec_remaining / 60 % 60,
2653		    sec_remaining % 60);
2654
2655		zcb->zcb_lastprint = now;
2656	}
2657
2658	return (0);
2659}
2660
2661static void
2662zdb_leak(void *arg, uint64_t start, uint64_t size)
2663{
2664	vdev_t *vd = arg;
2665
2666	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
2667	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
2668}
2669
/*
 * Metaslab ops with no allocator.  zdb_leak_init() installs these on
 * the normal and log classes so the allocator does not try to use
 * ms_tree while it is repurposed for leak detection.
 */
static metaslab_ops_t zdb_metaslab_ops = {
	NULL	/* alloc */
};
2673
2674static void
2675zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
2676{
2677	ddt_bookmark_t ddb = { 0 };
2678	ddt_entry_t dde;
2679	int error;
2680
2681	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
2682		blkptr_t blk;
2683		ddt_phys_t *ddp = dde.dde_phys;
2684
2685		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
2686			return;
2687
2688		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
2689
2690		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2691			if (ddp->ddp_phys_birth == 0)
2692				continue;
2693			ddt_bp_create(ddb.ddb_checksum,
2694			    &dde.dde_key, ddp, &blk);
2695			if (p == DDT_PHYS_DITTO) {
2696				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
2697			} else {
2698				zcb->zcb_dedup_asize +=
2699				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
2700				zcb->zcb_dedup_blocks++;
2701			}
2702		}
2703		if (!dump_opt['L']) {
2704			ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
2705			ddt_enter(ddt);
2706			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
2707			ddt_exit(ddt);
2708		}
2709	}
2710
2711	ASSERT(error == ENOENT);
2712}
2713
2714static void
2715zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
2716{
2717	zcb->zcb_spa = spa;
2718
2719	if (!dump_opt['L']) {
2720		vdev_t *rvd = spa->spa_root_vdev;
2721
2722		/*
2723		 * We are going to be changing the meaning of the metaslab's
2724		 * ms_tree.  Ensure that the allocator doesn't try to
2725		 * use the tree.
2726		 */
2727		spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
2728		spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
2729
2730		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2731			vdev_t *vd = rvd->vdev_child[c];
2732			metaslab_group_t *mg = vd->vdev_mg;
2733			for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
2734				metaslab_t *msp = vd->vdev_ms[m];
2735				ASSERT3P(msp->ms_group, ==, mg);
2736				mutex_enter(&msp->ms_lock);
2737				metaslab_unload(msp);
2738
2739				/*
2740				 * For leak detection, we overload the metaslab
2741				 * ms_tree to contain allocated segments
2742				 * instead of free segments. As a result,
2743				 * we can't use the normal metaslab_load/unload
2744				 * interfaces.
2745				 */
2746				if (msp->ms_sm != NULL) {
2747					(void) fprintf(stderr,
2748					    "\rloading space map for "
2749					    "vdev %llu of %llu, "
2750					    "metaslab %llu of %llu ...",
2751					    (longlong_t)c,
2752					    (longlong_t)rvd->vdev_children,
2753					    (longlong_t)m,
2754					    (longlong_t)vd->vdev_ms_count);
2755
2756					/*
2757					 * We don't want to spend the CPU
2758					 * manipulating the size-ordered
2759					 * tree, so clear the range_tree
2760					 * ops.
2761					 */
2762					msp->ms_tree->rt_ops = NULL;
2763					VERIFY0(space_map_load(msp->ms_sm,
2764					    msp->ms_tree, SM_ALLOC));
2765
2766					if (!msp->ms_loaded) {
2767						msp->ms_loaded = B_TRUE;
2768					}
2769				}
2770				mutex_exit(&msp->ms_lock);
2771			}
2772		}
2773		(void) fprintf(stderr, "\n");
2774	}
2775
2776	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2777
2778	zdb_ddt_leak_init(spa, zcb);
2779
2780	spa_config_exit(spa, SCL_CONFIG, FTAG);
2781}
2782
2783static void
2784zdb_leak_fini(spa_t *spa)
2785{
2786	if (!dump_opt['L']) {
2787		vdev_t *rvd = spa->spa_root_vdev;
2788		for (int c = 0; c < rvd->vdev_children; c++) {
2789			vdev_t *vd = rvd->vdev_child[c];
2790			metaslab_group_t *mg = vd->vdev_mg;
2791			for (int m = 0; m < vd->vdev_ms_count; m++) {
2792				metaslab_t *msp = vd->vdev_ms[m];
2793				ASSERT3P(mg, ==, msp->ms_group);
2794				mutex_enter(&msp->ms_lock);
2795
2796				/*
2797				 * The ms_tree has been overloaded to
2798				 * contain allocated segments. Now that we
2799				 * finished traversing all blocks, any
2800				 * block that remains in the ms_tree
2801				 * represents an allocated block that we
2802				 * did not claim during the traversal.
2803				 * Claimed blocks would have been removed
2804				 * from the ms_tree.
2805				 */
2806				range_tree_vacate(msp->ms_tree, zdb_leak, vd);
2807
2808				if (msp->ms_loaded) {
2809					msp->ms_loaded = B_FALSE;
2810				}
2811
2812				mutex_exit(&msp->ms_lock);
2813			}
2814		}
2815	}
2816}
2817
2818/* ARGSUSED */
2819static int
2820count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
2821{
2822	zdb_cb_t *zcb = arg;
2823
2824	if (dump_opt['b'] >= 5) {
2825		char blkbuf[BP_SPRINTF_LEN];
2826		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2827		(void) printf("[%s] %s\n",
2828		    "deferred free", blkbuf);
2829	}
2830	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
2831	return (0);
2832}
2833
/*
 * Traverse every block in the pool, optionally verifying checksums
 * (dump_opt['c']) and checking for leaked space (unless dump_opt['L']),
 * then print a summary and, at higher dump_opt['b'] levels, a
 * per-type / per-level breakdown.
 *
 * Returns 2 if the traversed size does not match the allocated size or
 * if no blocks were counted, 3 if errors were seen during traversal,
 * and 0 otherwise.
 */
static int
dump_block_stats(spa_t *spa)
{
	zdb_cb_t zcb = { 0 };
	zdb_blkstats_t *zb, *tzb;
	uint64_t norm_alloc, norm_space, total_alloc, total_found;
	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
	boolean_t leaks = B_FALSE;

	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
	    (dump_opt['c'] == 1) ? "metadata " : "",
	    dump_opt['c'] ? "checksums " : "",
	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
	    !dump_opt['L'] ? "nothing leaked " : "");

	/*
	 * Load all space maps as SM_ALLOC maps, then traverse the pool
	 * claiming each block we discover.  If the pool is perfectly
	 * consistent, the space maps will be empty when we're done.
	 * Anything left over is a leak; any block we can't claim (because
	 * it's not part of any space map) is a double allocation,
	 * reference to a freed block, or an unclaimed log block.
	 */
	zdb_leak_init(spa, &zcb);

	/*
	 * If there's a deferred-free bplist, process that first.
	 */
	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
	    count_block_cb, &zcb, NULL);
	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
		    count_block_cb, &zcb, NULL);
	}
	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
		    &zcb, NULL));
	}

	/* -cc reads data blocks too, so prefetch them */
	if (dump_opt['c'] > 1)
		flags |= TRAVERSE_PREFETCH_DATA;

	/* baseline figures used by zdb_blkptr_cb() for its progress line */
	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);

	/*
	 * If we've traversed the data blocks then we need to wait for those
	 * I/Os to complete. We leverage "The Godfather" zio to wait on
	 * all async I/Os to complete.
	 */
	if (dump_opt['c']) {
		for (int i = 0; i < max_ncpus; i++) {
			(void) zio_wait(spa->spa_async_zio_root[i]);
			/* replace the root zio that was just waited on */
			spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
			    ZIO_FLAG_GODFATHER);
		}
	}

	if (zcb.zcb_haderrors) {
		(void) printf("\nError counts:\n\n");
		(void) printf("\t%5s  %s\n", "errno", "count");
		for (int e = 0; e < 256; e++) {
			if (zcb.zcb_errors[e] != 0) {
				(void) printf("\t%5d  %llu\n",
				    e, (u_longlong_t)zcb.zcb_errors[e]);
			}
		}
	}

	/*
	 * Report any leaked segments.
	 */
	zdb_leak_fini(spa);

	/* grand total across all levels and all types */
	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];

	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
	norm_space = metaslab_class_get_space(spa_normal_class(spa));

	/*
	 * Compare what the allocation classes say is allocated with what
	 * the traversal found, less the space saved by deduplication.
	 */
	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
	total_found = tzb->zb_asize - zcb.zcb_dedup_asize;

	if (total_found == total_alloc) {
		if (!dump_opt['L'])
			(void) printf("\n\tNo leaks (block sum matches space"
			    " maps exactly)\n");
	} else {
		(void) printf("block traversal size %llu != alloc %llu "
		    "(%s %lld)\n",
		    (u_longlong_t)total_found,
		    (u_longlong_t)total_alloc,
		    (dump_opt['L']) ? "unreachable" : "leaked",
		    (longlong_t)(total_alloc - total_found));
		leaks = B_TRUE;
	}

	/* nothing counted: the averages below would divide by zero */
	if (tzb->zb_count == 0)
		return (2);

	(void) printf("\n");
	(void) printf("\tbp count:      %10llu\n",
	    (u_longlong_t)tzb->zb_count);
	(void) printf("\tganged count:  %10llu\n",
	    (longlong_t)tzb->zb_gangs);
	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
	    (u_longlong_t)tzb->zb_lsize,
	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
	(void) printf("\tbp physical:   %10llu      avg:"
	    " %6llu     compression: %6.2f\n",
	    (u_longlong_t)tzb->zb_psize,
	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
	    (double)tzb->zb_lsize / tzb->zb_psize);
	(void) printf("\tbp allocated:  %10llu      avg:"
	    " %6llu     compression: %6.2f\n",
	    (u_longlong_t)tzb->zb_asize,
	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
	    (double)tzb->zb_lsize / tzb->zb_asize);
	(void) printf("\tbp deduped:    %10llu    ref>1:"
	    " %6llu   deduplication: %6.2f\n",
	    (u_longlong_t)zcb.zcb_dedup_asize,
	    (u_longlong_t)zcb.zcb_dedup_blocks,
	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);

	/* embedded block pointers, reported per embedded type */
	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
		if (zcb.zcb_embedded_blocks[i] == 0)
			continue;
		(void) printf("\n");
		(void) printf("\tadditional, non-pointer bps of type %u: "
		    "%10llu\n",
		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);

		if (dump_opt['b'] >= 3) {
			(void) printf("\t number of (compressed) bytes:  "
			    "number of bps\n");
			dump_histogram(zcb.zcb_embedded_histogram[i],
			    sizeof (zcb.zcb_embedded_histogram[i]) /
			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
		}
	}

	if (tzb->zb_ditto_samevdev != 0) {
		(void) printf("\tDittoed blocks on same vdev: %llu\n",
		    (longlong_t)tzb->zb_ditto_samevdev);
	}

	/* -bb and above: one row per object type (and per level at -bbb) */
	if (dump_opt['b'] >= 2) {
		int l, t, level;
		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
		    "\t  avg\t comp\t%%Total\tType\n");

		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
			char csize[32], lsize[32], psize[32], asize[32];
			char avg[32], gang[32];
			char *typename;

			if (t < DMU_OT_NUMTYPES)
				typename = dmu_ot[t].ot_name;
			else
				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];

			/* types with no allocated space get a row of dashes */
			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
				(void) printf("%6s\t%5s\t%5s\t%5s"
				    "\t%5s\t%5s\t%6s\t%s\n",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    typename);
				continue;
			}

			/* highest level first; l == -1 selects the totals row */
			for (l = ZB_TOTAL - 1; l >= -1; l--) {
				level = (l == -1 ? ZB_TOTAL : l);
				zb = &zcb.zcb_type[level][t];

				if (zb->zb_asize == 0)
					continue;

				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
					continue;

				/* skip L0 when it would just repeat the total */
				if (level == 0 && zb->zb_asize ==
				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
					continue;

				zdb_nicenum(zb->zb_count, csize);
				zdb_nicenum(zb->zb_lsize, lsize);
				zdb_nicenum(zb->zb_psize, psize);
				zdb_nicenum(zb->zb_asize, asize);
				zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
				zdb_nicenum(zb->zb_gangs, gang);

				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
				    "\t%5.2f\t%6.2f\t",
				    csize, lsize, psize, asize, avg,
				    (double)zb->zb_lsize / zb->zb_psize,
				    100.0 * zb->zb_asize / tzb->zb_asize);

				if (level == ZB_TOTAL)
					(void) printf("%s\n", typename);
				else
					(void) printf("    L%d %s\n",
					    level, typename);

				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
					(void) printf("\t number of ganged "
					    "blocks: %s\n", gang);
				}

				if (dump_opt['b'] >= 4) {
					(void) printf("psize "
					    "(in 512-byte sectors): "
					    "number of blocks\n");
					dump_histogram(zb->zb_psize_histogram,
					    PSIZE_HISTO_SIZE, 0);
				}
			}
		}
	}

	(void) printf("\n");

	/* a size mismatch takes precedence over traversal errors */
	if (leaks)
		return (2);

	if (zcb.zcb_haderrors)
		return (3);

	return (0);
}
3073
/*
 * One entry of the simulated dedup table built by zdb_ddt_add_cb():
 * running totals for all blocks that share a DDT key.
 */
typedef struct zdb_ddt_entry {
	ddt_key_t	zdde_key;	/* key filled in by ddt_key_fill() */
	uint64_t	zdde_ref_blocks; /* number of blocks with this key */
	uint64_t	zdde_ref_lsize;	/* sum of BP_GET_LSIZE() */
	uint64_t	zdde_ref_psize;	/* sum of BP_GET_PSIZE() */
	uint64_t	zdde_ref_dsize;	/* sum of bp_get_dsize_sync() */
	avl_node_t	zdde_node;	/* linkage in the AVL tree */
} zdb_ddt_entry_t;
3082
3083/* ARGSUSED */
3084static int
3085zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
3086    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
3087{
3088	avl_tree_t *t = arg;
3089	avl_index_t where;
3090	zdb_ddt_entry_t *zdde, zdde_search;
3091
3092	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
3093		return (0);
3094
3095	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
3096		(void) printf("traversing objset %llu, %llu objects, "
3097		    "%lu blocks so far\n",
3098		    (u_longlong_t)zb->zb_objset,
3099		    (u_longlong_t)BP_GET_FILL(bp),
3100		    avl_numnodes(t));
3101	}
3102
3103	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
3104	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
3105		return (0);
3106
3107	ddt_key_fill(&zdde_search.zdde_key, bp);
3108
3109	zdde = avl_find(t, &zdde_search, &where);
3110
3111	if (zdde == NULL) {
3112		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
3113		zdde->zdde_key = zdde_search.zdde_key;
3114		avl_insert(t, zdde, where);
3115	}
3116
3117	zdde->zdde_ref_blocks += 1;
3118	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
3119	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
3120	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
3121
3122	return (0);
3123}
3124
3125static void
3126dump_simulated_ddt(spa_t *spa)
3127{
3128	avl_tree_t t;
3129	void *cookie = NULL;
3130	zdb_ddt_entry_t *zdde;
3131	ddt_histogram_t ddh_total = { 0 };
3132	ddt_stat_t dds_total = { 0 };
3133
3134	avl_create(&t, ddt_entry_compare,
3135	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
3136
3137	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3138
3139	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
3140	    zdb_ddt_add_cb, &t);
3141
3142	spa_config_exit(spa, SCL_CONFIG, FTAG);
3143
3144	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
3145		ddt_stat_t dds;
3146		uint64_t refcnt = zdde->zdde_ref_blocks;
3147		ASSERT(refcnt != 0);
3148
3149		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
3150		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
3151		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
3152		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
3153
3154		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
3155		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
3156		dds.dds_ref_psize = zdde->zdde_ref_psize;
3157		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
3158
3159		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
3160		    &dds, 0);
3161
3162		umem_free(zdde, sizeof (*zdde));
3163	}
3164
3165	avl_destroy(&t);
3166
3167	ddt_histogram_stat(&dds_total, &ddh_total);
3168
3169	(void) printf("Simulated DDT histogram:\n");
3170
3171	zpool_dump_ddt(&dds_total, &ddh_total);
3172
3173	dump_dedup_ratio(&dds_total);
3174}
3175
3176static void
3177dump_zpool(spa_t *spa)
3178{
3179	dsl_pool_t *dp = spa_get_dsl(spa);
3180	int rc = 0;
3181
3182	if (dump_opt['S']) {
3183		dump_simulated_ddt(spa);
3184		return;
3185	}
3186
3187	if (!dump_opt['e'] && dump_opt['C'] > 1) {
3188		(void) printf("\nCached configuration:\n");
3189		dump_nvlist(spa->spa_config, 8);
3190	}
3191
3192	if (dump_opt['C'])
3193		dump_config(spa);
3194
3195	if (dump_opt['u'])
3196		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
3197
3198	if (dump_opt['D'])
3199		dump_all_ddts(spa);
3200
3201	if (dump_opt['d'] > 2 || dump_opt['m'])
3202		dump_metaslabs(spa);
3203	if (dump_opt['M'])
3204		dump_metaslab_groups(spa);
3205
3206	if (dump_opt['d'] || dump_opt['i']) {
3207		dump_dir(dp->dp_meta_objset);
3208		if (dump_opt['d'] >= 3) {
3209			dump_full_bpobj(&spa->spa_deferred_bpobj,
3210			    "Deferred frees", 0);
3211			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
3212				dump_full_bpobj(
3213				    &spa->spa_dsl_pool->dp_free_bpobj,
3214				    "Pool snapshot frees", 0);
3215			}
3216
3217			if (spa_feature_is_active(spa,
3218			    SPA_FEATURE_ASYNC_DESTROY)) {
3219				dump_bptree(spa->spa_meta_objset,
3220				    spa->spa_dsl_pool->dp_bptree_obj,
3221				    "Pool dataset frees");
3222			}
3223			dump_dtl(spa->spa_root_vdev, 0);
3224		}
3225		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
3226		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
3227
3228		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
3229			uint64_t refcount;
3230
3231			if (!(spa_feature_table[f].fi_flags &
3232			    ZFEATURE_FLAG_PER_DATASET) ||
3233			    !spa_feature_is_enabled(spa, f)) {
3234				ASSERT0(dataset_feature_count[f]);
3235				continue;
3236			}
3237			(void) feature_get_refcount(spa,
3238			    &spa_feature_table[f], &refcount);
3239			if (dataset_feature_count[f] != refcount) {
3240				(void) printf("%s feature refcount mismatch: "
3241				    "%lld datasets != %lld refcount\n",
3242				    spa_feature_table[f].fi_uname,
3243				    (longlong_t)dataset_feature_count[f],
3244				    (longlong_t)refcount);
3245				rc = 2;
3246			} else {
3247				(void) printf("Verified %s feature refcount "
3248				    "of %llu is correct\n",
3249				    spa_feature_table[f].fi_uname,
3250				    (longlong_t)refcount);
3251			}
3252		}
3253	}
3254	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
3255		rc = dump_block_stats(spa);
3256
3257	if (rc == 0)
3258		rc = verify_spacemap_refcounts(spa);
3259
3260	if (dump_opt['s'])
3261		show_pool_stats(spa);
3262
3263	if (dump_opt['h'])
3264		dump_history(spa);
3265
3266	if (rc != 0) {
3267		dump_debug_buffer();
3268		exit(rc);
3269	}
3270}
3271
/*
 * Option bits accepted in the flag string of a zdb_read_block() block
 * specifier.  main() maps one flag character to each bit through
 * flagbits[]; the character is noted beside each definition.
 */
#define	ZDB_FLAG_CHECKSUM	0x0001	/* 'c' */
#define	ZDB_FLAG_DECOMPRESS	0x0002	/* 'd' */
#define	ZDB_FLAG_BSWAP		0x0004	/* 'e' */
#define	ZDB_FLAG_GBH		0x0008	/* 'g' */
#define	ZDB_FLAG_INDIRECT	0x0010	/* 'i' */
#define	ZDB_FLAG_PHYS		0x0020	/* 'p' */
#define	ZDB_FLAG_RAW		0x0040	/* 'r' */
#define	ZDB_FLAG_PRINT_BLKPTR	0x0080	/* 'b' */

/*
 * Flag character -> ZDB_FLAG_* bit, indexed by the character's value.
 * zdb_read_block() treats a zero entry as an invalid flag.
 */
int flagbits[256];
3282
3283static void
3284zdb_print_blkptr(blkptr_t *bp, int flags)
3285{
3286	char blkbuf[BP_SPRINTF_LEN];
3287
3288	if (flags & ZDB_FLAG_BSWAP)
3289		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
3290
3291	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3292	(void) printf("%s\n", blkbuf);
3293}
3294
3295static void
3296zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
3297{
3298	int i;
3299
3300	for (i = 0; i < nbps; i++)
3301		zdb_print_blkptr(&bp[i], flags);
3302}
3303
3304static void
3305zdb_dump_gbh(void *buf, int flags)
3306{
3307	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
3308}
3309
3310static void
3311zdb_dump_block_raw(void *buf, uint64_t size, int flags)
3312{
3313	if (flags & ZDB_FLAG_BSWAP)
3314		byteswap_uint64_array(buf, size);
3315	(void) write(1, buf, size);
3316}
3317
3318static void
3319zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
3320{
3321	uint64_t *d = (uint64_t *)buf;
3322	int nwords = size / sizeof (uint64_t);
3323	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
3324	int i, j;
3325	char *hdr, *c;
3326
3327
3328	if (do_bswap)
3329		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
3330	else
3331		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
3332
3333	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
3334
3335	for (i = 0; i < nwords; i += 2) {
3336		(void) printf("%06llx:  %016llx  %016llx  ",
3337		    (u_longlong_t)(i * sizeof (uint64_t)),
3338		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
3339		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
3340
3341		c = (char *)&d[i];
3342		for (j = 0; j < 2 * sizeof (uint64_t); j++)
3343			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
3344		(void) printf("\n");
3345	}
3346}
3347
3348/*
3349 * There are two acceptable formats:
3350 *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
3351 *	child[.child]*    - For example: 0.1.1
3352 *
3353 * The second form can be used to specify arbitrary vdevs anywhere
3354 * in the heirarchy.  For example, in a pool with a mirror of
3355 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
3356 */
3357static vdev_t *
3358zdb_vdev_lookup(vdev_t *vdev, char *path)
3359{
3360	char *s, *p, *q;
3361	int i;
3362
3363	if (vdev == NULL)
3364		return (NULL);
3365
3366	/* First, assume the x.x.x.x format */
3367	i = (int)strtoul(path, &s, 10);
3368	if (s == path || (s && *s != '.' && *s != '\0'))
3369		goto name;
3370	if (i < 0 || i >= vdev->vdev_children)
3371		return (NULL);
3372
3373	vdev = vdev->vdev_child[i];
3374	if (*s == '\0')
3375		return (vdev);
3376	return (zdb_vdev_lookup(vdev, s+1));
3377
3378name:
3379	for (i = 0; i < vdev->vdev_children; i++) {
3380		vdev_t *vc = vdev->vdev_child[i];
3381
3382		if (vc->vdev_path == NULL) {
3383			vc = zdb_vdev_lookup(vc, path);
3384			if (vc == NULL)
3385				continue;
3386			else
3387				return (vc);
3388		}
3389
3390		p = strrchr(vc->vdev_path, '/');
3391		p = p ? p + 1 : vc->vdev_path;
3392		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
3393
3394		if (strcmp(vc->vdev_path, path) == 0)
3395			return (vc);
3396		if (strcmp(p, path) == 0)
3397			return (vc);
3398		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
3399			return (vc);
3400	}
3401
3402	return (NULL);
3403}
3404
/*
 * Callback-shaped wrapper around random_get_pseudo_bytes(): fills 'buf'
 * with 'len' pseudo-random bytes and returns that function's result.  The
 * third argument is ignored.
 */
/* ARGSUSED */
static int
random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
{
	int err;

	err = random_get_pseudo_bytes(buf, len);
	return (err);
}
3411
3412/*
3413 * Read a block from a pool and print it out.  The syntax of the
3414 * block descriptor is:
3415 *
3416 *	pool:vdev_specifier:offset:size[:flags]
3417 *
3418 *	pool           - The name of the pool you wish to read from
3419 *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
3420 *	offset         - offset, in hex, in bytes
3421 *	size           - Amount of data to read, in hex, in bytes
3422 *	flags          - A string of characters specifying options
3423 *		 b: Decode a blkptr at given offset within block
3424 *		*c: Calculate and display checksums
3425 *		 d: Decompress data before dumping
3426 *		 e: Byteswap data before dumping
3427 *		 g: Display data as a gang block header
3428 *		 i: Display as an indirect block
3429 *		 p: Do I/O to physical offset
3430 *		 r: Dump raw data to stdout
3431 *
3432 *              * = not yet implemented
3433 */
3434static void
3435zdb_read_block(char *thing, spa_t *spa)
3436{
3437	blkptr_t blk, *bp = &blk;
3438	dva_t *dva = bp->blk_dva;
3439	int flags = 0;
3440	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
3441	zio_t *zio;
3442	vdev_t *vd;
3443	abd_t *pabd;
3444	void *lbuf, *buf;
3445	char *s, *p, *dup, *vdev, *flagstr;
3446	int i, error;
3447
3448	dup = strdup(thing);
3449	s = strtok(dup, ":");
3450	vdev = s ? s : "";
3451	s = strtok(NULL, ":");
3452	offset = strtoull(s ? s : "", NULL, 16);
3453	s = strtok(NULL, ":");
3454	size = strtoull(s ? s : "", NULL, 16);
3455	s = strtok(NULL, ":");
3456	flagstr = s ? s : "";
3457
3458	s = NULL;
3459	if (size == 0)
3460		s = "size must not be zero";
3461	if (!IS_P2ALIGNED(size, DEV_BSIZE))
3462		s = "size must be a multiple of sector size";
3463	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
3464		s = "offset must be a multiple of sector size";
3465	if (s) {
3466		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
3467		free(dup);
3468		return;
3469	}
3470
3471	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
3472		for (i = 0; flagstr[i]; i++) {
3473			int bit = flagbits[(uchar_t)flagstr[i]];
3474
3475			if (bit == 0) {
3476				(void) printf("***Invalid flag: %c\n",
3477				    flagstr[i]);
3478				continue;
3479			}
3480			flags |= bit;
3481
3482			/* If it's not something with an argument, keep going */
3483			if ((bit & (ZDB_FLAG_CHECKSUM |
3484			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
3485				continue;
3486
3487			p = &flagstr[i + 1];
3488			if (bit == ZDB_FLAG_PRINT_BLKPTR)
3489				blkptr_offset = strtoull(p, &p, 16);
3490			if (*p != ':' && *p != '\0') {
3491				(void) printf("***Invalid flag arg: '%s'\n", s);
3492				free(dup);
3493				return;
3494			}
3495		}
3496	}
3497
3498	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
3499	if (vd == NULL) {
3500		(void) printf("***Invalid vdev: %s\n", vdev);
3501		free(dup);
3502		return;
3503	} else {
3504		if (vd->vdev_path)
3505			(void) fprintf(stderr, "Found vdev: %s\n",
3506			    vd->vdev_path);
3507		else
3508			(void) fprintf(stderr, "Found vdev type: %s\n",
3509			    vd->vdev_ops->vdev_op_type);
3510	}
3511
3512	psize = size;
3513	lsize = size;
3514
3515	pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
3516	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
3517
3518	BP_ZERO(bp);
3519
3520	DVA_SET_VDEV(&dva[0], vd->vdev_id);
3521	DVA_SET_OFFSET(&dva[0], offset);
3522	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
3523	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
3524
3525	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
3526
3527	BP_SET_LSIZE(bp, lsize);
3528	BP_SET_PSIZE(bp, psize);
3529	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
3530	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
3531	BP_SET_TYPE(bp, DMU_OT_NONE);
3532	BP_SET_LEVEL(bp, 0);
3533	BP_SET_DEDUP(bp, 0);
3534	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
3535
3536	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
3537	zio = zio_root(spa, NULL, NULL, 0);
3538
3539	if (vd == vd->vdev_top) {
3540		/*
3541		 * Treat this as a normal block read.
3542		 */
3543		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
3544		    ZIO_PRIORITY_SYNC_READ,
3545		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
3546	} else {
3547		/*
3548		 * Treat this as a vdev child I/O.
3549		 */
3550		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
3551		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
3552		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
3553		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
3554		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
3555	}
3556
3557	error = zio_wait(zio);
3558	spa_config_exit(spa, SCL_STATE, FTAG);
3559
3560	if (error) {
3561		(void) printf("Read of %s failed, error: %d\n", thing, error);
3562		goto out;
3563	}
3564
3565	if (flags & ZDB_FLAG_DECOMPRESS) {
3566		/*
3567		 * We don't know how the data was compressed, so just try
3568		 * every decompress function at every inflated blocksize.
3569		 */
3570		enum zio_compress c;
3571		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
3572		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
3573
3574		abd_copy_to_buf(pbuf2, pabd, psize);
3575
3576		VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
3577		    random_get_pseudo_bytes_cb, NULL));
3578
3579		VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
3580		    SPA_MAXBLOCKSIZE - psize));
3581
3582		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
3583		    lsize -= SPA_MINBLOCKSIZE) {
3584			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
3585				if (zio_decompress_data(c, pabd,
3586				    lbuf, psize, lsize) == 0 &&
3587				    zio_decompress_data_buf(c, pbuf2,
3588				    lbuf2, psize, lsize) == 0 &&
3589				    bcmp(lbuf, lbuf2, lsize) == 0)
3590					break;
3591			}
3592			if (c != ZIO_COMPRESS_FUNCTIONS)
3593				break;
3594			lsize -= SPA_MINBLOCKSIZE;
3595		}
3596
3597		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
3598		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
3599
3600		if (lsize <= psize) {
3601			(void) printf("Decompress of %s failed\n", thing);
3602			goto out;
3603		}
3604		buf = lbuf;
3605		size = lsize;
3606	} else {
3607		buf = abd_to_buf(pabd);
3608		size = psize;
3609	}
3610
3611	if (flags & ZDB_FLAG_PRINT_BLKPTR)
3612		zdb_print_blkptr((blkptr_t *)(void *)
3613		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
3614	else if (flags & ZDB_FLAG_RAW)
3615		zdb_dump_block_raw(buf, size, flags);
3616	else if (flags & ZDB_FLAG_INDIRECT)
3617		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
3618		    flags);
3619	else if (flags & ZDB_FLAG_GBH)
3620		zdb_dump_gbh(buf, flags);
3621	else
3622		zdb_dump_block(thing, buf, size, flags);
3623
3624out:
3625	abd_free(pabd);
3626	umem_free(lbuf, SPA_MAXBLOCKSIZE);
3627	free(dup);
3628}
3629
3630static void
3631zdb_embedded_block(char *thing)
3632{
3633	blkptr_t bp = { 0 };
3634	unsigned long long *words = (void *)&bp;
3635	char buf[SPA_MAXBLOCKSIZE];
3636	int err;
3637
3638	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
3639	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
3640	    words + 0, words + 1, words + 2, words + 3,
3641	    words + 4, words + 5, words + 6, words + 7,
3642	    words + 8, words + 9, words + 10, words + 11,
3643	    words + 12, words + 13, words + 14, words + 15);
3644	if (err != 16) {
3645		(void) printf("invalid input format\n");
3646		exit(1);
3647	}
3648	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
3649	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
3650	if (err != 0) {
3651		(void) printf("decode failed: %u\n", err);
3652		exit(1);
3653	}
3654	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
3655}
3656
3657static boolean_t
3658pool_match(nvlist_t *cfg, char *tgt)
3659{
3660	uint64_t v, guid = strtoull(tgt, NULL, 0);
3661	char *s;
3662
3663	if (guid != 0) {
3664		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
3665			return (v == guid);
3666	} else {
3667		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
3668			return (strcmp(s, tgt) == 0);
3669	}
3670	return (B_FALSE);
3671}
3672
3673static char *
3674find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
3675{
3676	nvlist_t *pools;
3677	nvlist_t *match = NULL;
3678	char *name = NULL;
3679	char *sepp = NULL;
3680	char sep = '\0';
3681	int count = 0;
3682	importargs_t args = { 0 };
3683
3684	args.paths = dirc;
3685	args.path = dirv;
3686	args.can_be_active = B_TRUE;
3687
3688	if ((sepp = strpbrk(*target, "/@")) != NULL) {
3689		sep = *sepp;
3690		*sepp = '\0';
3691	}
3692
3693	pools = zpool_search_import(g_zfs, &args);
3694
3695	if (pools != NULL) {
3696		nvpair_t *elem = NULL;
3697		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
3698			verify(nvpair_value_nvlist(elem, configp) == 0);
3699			if (pool_match(*configp, *target)) {
3700				count++;
3701				if (match != NULL) {
3702					/* print previously found config */
3703					if (name != NULL) {
3704						(void) printf("%s\n", name);
3705						dump_nvlist(match, 8);
3706						name = NULL;
3707					}
3708					(void) printf("%s\n",
3709					    nvpair_name(elem));
3710					dump_nvlist(*configp, 8);
3711				} else {
3712					match = *configp;
3713					name = nvpair_name(elem);
3714				}
3715			}
3716		}
3717	}
3718	if (count > 1)
3719		(void) fatal("\tMatched %d pools - use pool GUID "
3720		    "instead of pool name or \n"
3721		    "\tpool name part of a dataset name to select pool", count);
3722
3723	if (sepp)
3724		*sepp = sep;
3725	/*
3726	 * If pool GUID was specified for pool id, replace it with pool name
3727	 */
3728	if (name && (strstr(*target, name) != *target)) {
3729		int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
3730
3731		*target = umem_alloc(sz, UMEM_NOFAIL);
3732		(void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
3733	}
3734
3735	*configp = name ? match : NULL;
3736
3737	return (name);
3738}
3739
3740int
3741main(int argc, char **argv)
3742{
3743	int i, c;
3744	struct rlimit rl = { 1024, 1024 };
3745	spa_t *spa = NULL;
3746	objset_t *os = NULL;
3747	int dump_all = 1;
3748	int verbose = 0;
3749	int error = 0;
3750	char **searchdirs = NULL;
3751	int nsearch = 0;
3752	char *target;
3753	nvlist_t *policy = NULL;
3754	uint64_t max_txg = UINT64_MAX;
3755	int flags = ZFS_IMPORT_MISSING_LOG;
3756	int rewind = ZPOOL_NEVER_REWIND;
3757	char *spa_config_path_env;
3758	boolean_t target_is_spa = B_TRUE;
3759
3760	(void) setrlimit(RLIMIT_NOFILE, &rl);
3761	(void) enable_extended_FILE_stdio(-1, -1);
3762
3763	dprintf_setup(&argc, argv);
3764
3765	/*
3766	 * If there is an environment variable SPA_CONFIG_PATH it overrides
3767	 * default spa_config_path setting. If -U flag is specified it will
3768	 * override this environment variable settings once again.
3769	 */
3770	spa_config_path_env = getenv("SPA_CONFIG_PATH");
3771	if (spa_config_path_env != NULL)
3772		spa_config_path = spa_config_path_env;
3773
3774	while ((c = getopt(argc, argv,
3775	    "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
3776		switch (c) {
3777		case 'b':
3778		case 'c':
3779		case 'C':
3780		case 'd':
3781		case 'D':
3782		case 'E':
3783		case 'G':
3784		case 'h':
3785		case 'i':
3786		case 'l':
3787		case 'm':
3788		case 'M':
3789		case 'O':
3790		case 'R':
3791		case 's':
3792		case 'S':
3793		case 'u':
3794			dump_opt[c]++;
3795			dump_all = 0;
3796			break;
3797		case 'A':
3798		case 'e':
3799		case 'F':
3800		case 'L':
3801		case 'P':
3802		case 'q':
3803		case 'X':
3804			dump_opt[c]++;
3805			break;
3806		/* NB: Sort single match options below. */
3807		case 'I':
3808			max_inflight = strtoull(optarg, NULL, 0);
3809			if (max_inflight == 0) {
3810				(void) fprintf(stderr, "maximum number "
3811				    "of inflight I/Os must be greater "
3812				    "than 0\n");
3813				usage();
3814			}
3815			break;
3816		case 'o':
3817			error = set_global_var(optarg);
3818			if (error != 0)
3819				usage();
3820			break;
3821		case 'p':
3822			if (searchdirs == NULL) {
3823				searchdirs = umem_alloc(sizeof (char *),
3824				    UMEM_NOFAIL);
3825			} else {
3826				char **tmp = umem_alloc((nsearch + 1) *
3827				    sizeof (char *), UMEM_NOFAIL);
3828				bcopy(searchdirs, tmp, nsearch *
3829				    sizeof (char *));
3830				umem_free(searchdirs,
3831				    nsearch * sizeof (char *));
3832				searchdirs = tmp;
3833			}
3834			searchdirs[nsearch++] = optarg;
3835			break;
3836		case 't':
3837			max_txg = strtoull(optarg, NULL, 0);
3838			if (max_txg < TXG_INITIAL) {
3839				(void) fprintf(stderr, "incorrect txg "
3840				    "specified: %s\n", optarg);
3841				usage();
3842			}
3843			break;
3844		case 'U':
3845			spa_config_path = optarg;
3846			if (spa_config_path[0] != '/') {
3847				(void) fprintf(stderr,
3848				    "cachefile must be an absolute path "
3849				    "(i.e. start with a slash)\n");
3850				usage();
3851			}
3852			break;
3853		case 'v':
3854			verbose++;
3855			break;
3856		case 'V':
3857			flags = ZFS_IMPORT_VERBATIM;
3858			break;
3859		case 'x':
3860			vn_dumpdir = optarg;
3861			break;
3862		default:
3863			usage();
3864			break;
3865		}
3866	}
3867
3868	if (!dump_opt['e'] && searchdirs != NULL) {
3869		(void) fprintf(stderr, "-p option requires use of -e\n");
3870		usage();
3871	}
3872
3873	/*
3874	 * ZDB does not typically re-read blocks; therefore limit the ARC
3875	 * to 256 MB, which can be used entirely for metadata.
3876	 */
3877	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
3878
3879	/*
3880	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
3881	 * "zdb -b" uses traversal prefetch which uses async reads.
3882	 * For good performance, let several of them be active at once.
3883	 */
3884	zfs_vdev_async_read_max_active = 10;
3885
3886	/*
3887	 * Disable reference tracking for better performance.
3888	 */
3889	reference_tracking_enable = B_FALSE;
3890
3891	kernel_init(FREAD);
3892	g_zfs = libzfs_init();
3893	ASSERT(g_zfs != NULL);
3894
3895	if (dump_all)
3896		verbose = MAX(verbose, 1);
3897
3898	for (c = 0; c < 256; c++) {
3899		if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
3900			dump_opt[c] = 1;
3901		if (dump_opt[c])
3902			dump_opt[c] += verbose;
3903	}
3904
3905	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
3906	zfs_recover = (dump_opt['A'] > 1);
3907
3908	argc -= optind;
3909	argv += optind;
3910
3911	if (argc < 2 && dump_opt['R'])
3912		usage();
3913
3914	if (dump_opt['E']) {
3915		if (argc != 1)
3916			usage();
3917		zdb_embedded_block(argv[0]);
3918		return (0);
3919	}
3920
3921	if (argc < 1) {
3922		if (!dump_opt['e'] && dump_opt['C']) {
3923			dump_cachefile(spa_config_path);
3924			return (0);
3925		}
3926		usage();
3927	}
3928
3929	if (dump_opt['l'])
3930		return (dump_label(argv[0]));
3931
3932	if (dump_opt['O']) {
3933		if (argc != 2)
3934			usage();
3935		dump_opt['v'] = verbose + 3;
3936		return (dump_path(argv[0], argv[1]));
3937	}
3938
3939	if (dump_opt['X'] || dump_opt['F'])
3940		rewind = ZPOOL_DO_REWIND |
3941		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
3942
3943	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
3944	    nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
3945	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
3946		fatal("internal error: %s", strerror(ENOMEM));
3947
3948	error = 0;
3949	target = argv[0];
3950
3951	if (dump_opt['e']) {
3952		nvlist_t *cfg = NULL;
3953		char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
3954
3955		error = ENOENT;
3956		if (name) {
3957			if (dump_opt['C'] > 1) {
3958				(void) printf("\nConfiguration for import:\n");
3959				dump_nvlist(cfg, 8);
3960			}
3961			if (nvlist_add_nvlist(cfg,
3962			    ZPOOL_REWIND_POLICY, policy) != 0) {
3963				fatal("can't open '%s': %s",
3964				    target, strerror(ENOMEM));
3965			}
3966			error = spa_import(name, cfg, NULL, flags);
3967		}
3968	}
3969
3970	if (strpbrk(target, "/@") != NULL) {
3971		size_t targetlen;
3972
3973		target_is_spa = B_FALSE;
3974		/*
3975		 * Remove any trailing slash.  Later code would get confused
3976		 * by it, but we want to allow it so that "pool/" can
3977		 * indicate that we want to dump the topmost filesystem,
3978		 * rather than the whole pool.
3979		 */
3980		targetlen = strlen(target);
3981		if (targetlen != 0 && target[targetlen - 1] == '/')
3982			target[targetlen - 1] = '\0';
3983	}
3984
3985	if (error == 0) {
3986		if (target_is_spa || dump_opt['R']) {
3987			error = spa_open_rewind(target, &spa, FTAG, policy,
3988			    NULL);
3989			if (error) {
3990				/*
3991				 * If we're missing the log device then
3992				 * try opening the pool after clearing the
3993				 * log state.
3994				 */
3995				mutex_enter(&spa_namespace_lock);
3996				if ((spa = spa_lookup(target)) != NULL &&
3997				    spa->spa_log_state == SPA_LOG_MISSING) {
3998					spa->spa_log_state = SPA_LOG_CLEAR;
3999					error = 0;
4000				}
4001				mutex_exit(&spa_namespace_lock);
4002
4003				if (!error) {
4004					error = spa_open_rewind(target, &spa,
4005					    FTAG, policy, NULL);
4006				}
4007			}
4008		} else {
4009			error = open_objset(target, DMU_OST_ANY, FTAG, &os);
4010		}
4011	}
4012	nvlist_free(policy);
4013
4014	if (error)
4015		fatal("can't open '%s': %s", target, strerror(error));
4016
4017	argv++;
4018	argc--;
4019	if (!dump_opt['R']) {
4020		if (argc > 0) {
4021			zopt_objects = argc;
4022			zopt_object = calloc(zopt_objects, sizeof (uint64_t));
4023			for (i = 0; i < zopt_objects; i++) {
4024				errno = 0;
4025				zopt_object[i] = strtoull(argv[i], NULL, 0);
4026				if (zopt_object[i] == 0 && errno != 0)
4027					fatal("bad number %s: %s",
4028					    argv[i], strerror(errno));
4029			}
4030		}
4031		if (os != NULL) {
4032			dump_dir(os);
4033		} else if (zopt_objects > 0 && !dump_opt['m']) {
4034			dump_dir(spa->spa_meta_objset);
4035		} else {
4036			dump_zpool(spa);
4037		}
4038	} else {
4039		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
4040		flagbits['c'] = ZDB_FLAG_CHECKSUM;
4041		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
4042		flagbits['e'] = ZDB_FLAG_BSWAP;
4043		flagbits['g'] = ZDB_FLAG_GBH;
4044		flagbits['i'] = ZDB_FLAG_INDIRECT;
4045		flagbits['p'] = ZDB_FLAG_PHYS;
4046		flagbits['r'] = ZDB_FLAG_RAW;
4047
4048		for (i = 0; i < argc; i++)
4049			zdb_read_block(argv[i], spa);
4050	}
4051
4052	if (os != NULL)
4053		close_objset(os, FTAG);
4054	else
4055		spa_close(spa, FTAG);
4056
4057	fuid_table_destroy();
4058
4059	dump_debug_buffer();
4060
4061	libzfs_fini(g_zfs);
4062	kernel_fini();
4063
4064	return (0);
4065}
4066