zdb.c revision fbabab8faf7439009737ccefe9d50152b38c26d1
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <stdio.h>
30#include <stdlib.h>
31#include <sys/zfs_context.h>
32#include <sys/spa.h>
33#include <sys/spa_impl.h>
34#include <sys/dmu.h>
35#include <sys/zap.h>
36#include <sys/fs/zfs.h>
37#include <sys/zfs_znode.h>
38#include <sys/vdev.h>
39#include <sys/vdev_impl.h>
40#include <sys/metaslab_impl.h>
41#include <sys/dmu_objset.h>
42#include <sys/dsl_dir.h>
43#include <sys/dsl_dataset.h>
44#include <sys/dsl_pool.h>
45#include <sys/dbuf.h>
46#include <sys/zil.h>
47#include <sys/zil_impl.h>
48#include <sys/stat.h>
49#include <sys/resource.h>
50#include <sys/dmu_traverse.h>
51#include <sys/zio_checksum.h>
52#include <sys/zio_compress.h>
53
54const char cmdname[] = "zdb";
55uint8_t dump_opt[256];
56
57typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
58
59extern void dump_intent_log(zilog_t *);
60uint64_t *zopt_object = NULL;
61int zopt_objects = 0;
62int zdb_advance = ADVANCE_PRE;
63zbookmark_t zdb_noread = { 0, 0, ZB_NO_LEVEL, 0 };
64
65/*
66 * These libumem hooks provide a reasonable set of defaults for the allocator's
67 * debugging facilities.
68 */
/*
 * Returns the string to be used as the $UMEM_DEBUG setting.
 * Declared with a (void) prototype, matching _umem_logging_init().
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}
74
/*
 * Returns the string to be used as the $UMEM_LOGGING setting.
 */
const char *
_umem_logging_init(void)
{
	static const char umem_logging[] = "fail,contents";

	return (umem_logging);
}
80
81static void
82usage(void)
83{
84	(void) fprintf(stderr,
85	    "Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] "
86	    "dataset [object...]\n"
87	    "       %s -C [pool]\n"
88	    "       %s -l dev\n",
89	    cmdname, cmdname, cmdname);
90
91	(void) fprintf(stderr, "	-u uberblock\n");
92	(void) fprintf(stderr, "	-d datasets\n");
93	(void) fprintf(stderr, "        -C cached pool configuration\n");
94	(void) fprintf(stderr, "	-i intent logs\n");
95	(void) fprintf(stderr, "	-b block statistics\n");
96	(void) fprintf(stderr, "	-c checksum all data blocks\n");
97	(void) fprintf(stderr, "	-s report stats on zdb's I/O\n");
98	(void) fprintf(stderr, "	-v verbose (applies to all others)\n");
99	(void) fprintf(stderr, "        -l dump label contents\n");
100	(void) fprintf(stderr, "	-L live pool (allows some errors)\n");
101	(void) fprintf(stderr, "	-O [!]<pre|post|prune|data|holes> "
102	    "visitation order\n");
103	(void) fprintf(stderr, "	-U use zpool.cache in /tmp\n");
104	(void) fprintf(stderr, "	-B objset:object:level:blkid -- "
105	    "simulate bad block\n");
106	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
107	    "to make only that option verbose\n");
108	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
109	exit(1);
110}
111
112static void
113fatal(const char *fmt, ...)
114{
115	va_list ap;
116
117	va_start(ap, fmt);
118	(void) fprintf(stderr, "%s: ", cmdname);
119	(void) vfprintf(stderr, fmt, ap);
120	va_end(ap);
121	(void) fprintf(stderr, "\n");
122
123	exit(1);
124}
125
126static void
127dump_nvlist(nvlist_t *list, int indent)
128{
129	nvpair_t *elem = NULL;
130
131	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
132		switch (nvpair_type(elem)) {
133		case DATA_TYPE_STRING:
134			{
135				char *value;
136
137				VERIFY(nvpair_value_string(elem, &value) == 0);
138				(void) printf("%*s%s='%s'\n", indent, "",
139				    nvpair_name(elem), value);
140			}
141			break;
142
143		case DATA_TYPE_UINT64:
144			{
145				uint64_t value;
146
147				VERIFY(nvpair_value_uint64(elem, &value) == 0);
148				(void) printf("%*s%s=%llu\n", indent, "",
149				    nvpair_name(elem), (u_longlong_t)value);
150			}
151			break;
152
153		case DATA_TYPE_NVLIST:
154			{
155				nvlist_t *value;
156
157				VERIFY(nvpair_value_nvlist(elem, &value) == 0);
158				(void) printf("%*s%s\n", indent, "",
159				    nvpair_name(elem));
160				dump_nvlist(value, indent + 4);
161			}
162			break;
163
164		case DATA_TYPE_NVLIST_ARRAY:
165			{
166				nvlist_t **value;
167				uint_t c, count;
168
169				VERIFY(nvpair_value_nvlist_array(elem, &value,
170				    &count) == 0);
171
172				for (c = 0; c < count; c++) {
173					(void) printf("%*s%s[%u]\n", indent, "",
174					    nvpair_name(elem), c);
175					dump_nvlist(value[c], indent + 8);
176				}
177			}
178			break;
179
180		default:
181
182			(void) printf("bad config type %d for %s\n",
183			    nvpair_type(elem), nvpair_name(elem));
184		}
185	}
186}
187
188/* ARGSUSED */
189static void
190dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
191{
192	nvlist_t *nv;
193	size_t nvsize = *(uint64_t *)data;
194	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
195
196	dmu_read(os, object, 0, nvsize, packed);
197
198	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
199
200	umem_free(packed, nvsize);
201
202	dump_nvlist(nv, 8);
203
204	nvlist_free(nv);
205}
206
207const char dump_zap_stars[] = "****************************************";
208const int dump_zap_width = sizeof (dump_zap_stars) - 1;
209
210static void
211dump_zap_histogram(uint64_t histo[ZAP_HISTOGRAM_SIZE])
212{
213	int i;
214	int minidx = ZAP_HISTOGRAM_SIZE - 1;
215	int maxidx = 0;
216	uint64_t max = 0;
217
218	for (i = 0; i < ZAP_HISTOGRAM_SIZE; i++) {
219		if (histo[i] > max)
220			max = histo[i];
221		if (histo[i] > 0 && i > maxidx)
222			maxidx = i;
223		if (histo[i] > 0 && i < minidx)
224			minidx = i;
225	}
226
227	if (max < dump_zap_width)
228		max = dump_zap_width;
229
230	for (i = minidx; i <= maxidx; i++)
231		(void) printf("\t\t\t%u: %6llu %s\n", i, (u_longlong_t)histo[i],
232		    &dump_zap_stars[(max - histo[i]) * dump_zap_width / max]);
233}
234
235static void
236dump_zap_stats(objset_t *os, uint64_t object)
237{
238	int error;
239	zap_stats_t zs;
240
241	error = zap_get_stats(os, object, &zs);
242	if (error)
243		return;
244
245	if (zs.zs_ptrtbl_len == 0) {
246		ASSERT(zs.zs_num_blocks == 1);
247		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
248		    (u_longlong_t)zs.zs_blocksize,
249		    (u_longlong_t)zs.zs_num_entries);
250		return;
251	}
252
253	(void) printf("\tFat ZAP stats:\n");
254	(void) printf("\t\tPointer table: %llu elements\n",
255	    (u_longlong_t)zs.zs_ptrtbl_len);
256	(void) printf("\t\tZAP entries: %llu\n",
257	    (u_longlong_t)zs.zs_num_entries);
258	(void) printf("\t\tLeaf blocks: %llu\n",
259	    (u_longlong_t)zs.zs_num_leafs);
260	(void) printf("\t\tTotal blocks: %llu\n",
261	    (u_longlong_t)zs.zs_num_blocks);
262	(void) printf("\t\tOversize blocks: %llu\n",
263	    (u_longlong_t)zs.zs_num_blocks_large);
264
265	(void) printf("\t\tLeafs with 2^n pointers:\n");
266	dump_zap_histogram(zs.zs_leafs_with_2n_pointers);
267
268	(void) printf("\t\tLeafs with n chained:\n");
269	dump_zap_histogram(zs.zs_leafs_with_n_chained);
270
271	(void) printf("\t\tBlocks with n*5 entries:\n");
272	dump_zap_histogram(zs.zs_blocks_with_n5_entries);
273
274	(void) printf("\t\tBlocks n/10 full:\n");
275	dump_zap_histogram(zs.zs_blocks_n_tenths_full);
276
277	(void) printf("\t\tEntries with n chunks:\n");
278	dump_zap_histogram(zs.zs_entries_using_n_chunks);
279
280	(void) printf("\t\tBuckets with n entries:\n");
281	dump_zap_histogram(zs.zs_buckets_with_n_entries);
282}
283
284/*ARGSUSED*/
285static void
286dump_none(objset_t *os, uint64_t object, void *data, size_t size)
287{
288}
289
290/*ARGSUSED*/
291void
292dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
293{
294}
295
296/*ARGSUSED*/
297static void
298dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
299{
300}
301
302/*ARGSUSED*/
303static void
304dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
305{
306	zap_cursor_t zc;
307	zap_attribute_t attr;
308	void *prop;
309	int i;
310
311	dump_zap_stats(os, object);
312	(void) printf("\n");
313
314	for (zap_cursor_init(&zc, os, object);
315	    zap_cursor_retrieve(&zc, &attr) == 0;
316	    zap_cursor_advance(&zc)) {
317		(void) printf("\t\t%s = ", attr.za_name);
318		if (attr.za_num_integers == 0) {
319			(void) printf("\n");
320			continue;
321		}
322		prop = umem_zalloc(attr.za_num_integers *
323		    attr.za_integer_length, UMEM_NOFAIL);
324		(void) zap_lookup(os, object, attr.za_name,
325		    attr.za_integer_length, attr.za_num_integers, prop);
326		if (attr.za_integer_length == 1) {
327			(void) printf("%s", (char *)prop);
328		} else {
329			for (i = 0; i < attr.za_num_integers; i++) {
330				switch (attr.za_integer_length) {
331				case 2:
332					(void) printf("%u ",
333					    ((uint16_t *)prop)[i]);
334					break;
335				case 4:
336					(void) printf("%u ",
337					    ((uint32_t *)prop)[i]);
338					break;
339				case 8:
340					(void) printf("%lld ",
341					    (u_longlong_t)((int64_t *)prop)[i]);
342					break;
343				}
344			}
345		}
346		(void) printf("\n");
347		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
348	}
349	zap_cursor_fini(&zc);
350}
351
352static void
353dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
354{
355	uint64_t alloc, offset, entry;
356	int mapshift = sm->sm_shift;
357	uint64_t mapstart = sm->sm_start;
358	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID" };
359
360	if (smo->smo_object == 0)
361		return;
362
363	/*
364	 * Print out the freelist entries in both encoded and decoded form.
365	 */
366	alloc = 0;
367	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
368		dmu_read(os, smo->smo_object, offset, sizeof (entry), &entry);
369		if (SM_DEBUG_DECODE(entry)) {
370			(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
371			    (u_longlong_t)(offset / sizeof (entry)),
372			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
373			    SM_DEBUG_TXG_DECODE(entry),
374			    SM_DEBUG_SYNCPASS_DECODE(entry));
375		} else {
376			(void) printf("\t\t[%4llu]    %c  range:"
377			    " %08llx-%08llx  size: %06llx\n",
378			    (u_longlong_t)(offset / sizeof (entry)),
379			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
380			    (SM_OFFSET_DECODE(entry) << mapshift) + mapstart,
381			    (SM_OFFSET_DECODE(entry) << mapshift) + mapstart +
382			    (SM_RUN_DECODE(entry) << mapshift),
383			    (SM_RUN_DECODE(entry) << mapshift));
384			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
385				alloc += SM_RUN_DECODE(entry) << mapshift;
386			else
387				alloc -= SM_RUN_DECODE(entry) << mapshift;
388		}
389	}
390	if (alloc != smo->smo_alloc) {
391		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
392		    "with space map summary (%llu)\n",
393		    (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc);
394	}
395}
396
397static void
398dump_metaslab(metaslab_t *msp)
399{
400	char freebuf[5];
401	space_map_obj_t *smo = msp->ms_smo;
402	vdev_t *vd = msp->ms_group->mg_vd;
403	spa_t *spa = vd->vdev_spa;
404
405	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
406
407	if (dump_opt['d'] <= 5) {
408		(void) printf("\t%10llx   %10llu   %5s\n",
409		    (u_longlong_t)msp->ms_map.sm_start,
410		    (u_longlong_t)smo->smo_object,
411		    freebuf);
412		return;
413	}
414
415	(void) printf(
416	    "\tvdev %llu   offset %08llx   spacemap %4llu   free %5s\n",
417	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
418	    (u_longlong_t)smo->smo_object, freebuf);
419
420	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
421
422	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
423}
424
425static void
426dump_metaslabs(spa_t *spa)
427{
428	vdev_t *rvd = spa->spa_root_vdev;
429	vdev_t *vd;
430	int c, m;
431
432	(void) printf("\nMetaslabs:\n");
433
434	for (c = 0; c < rvd->vdev_children; c++) {
435		vd = rvd->vdev_child[c];
436
437		spa_config_enter(spa, RW_READER);
438		(void) printf("\n    vdev %llu = %s\n\n",
439		    (u_longlong_t)vd->vdev_id, vdev_description(vd));
440		spa_config_exit(spa);
441
442		if (dump_opt['d'] <= 5) {
443			(void) printf("\t%10s   %10s   %5s\n",
444			    "offset", "spacemap", "free");
445			(void) printf("\t%10s   %10s   %5s\n",
446			    "------", "--------", "----");
447		}
448		for (m = 0; m < vd->vdev_ms_count; m++)
449			dump_metaslab(vd->vdev_ms[m]);
450		(void) printf("\n");
451	}
452}
453
454static void
455dump_dtl(vdev_t *vd, int indent)
456{
457	avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
458	spa_t *spa = vd->vdev_spa;
459	space_seg_t *ss;
460	vdev_t *pvd;
461	int c;
462
463	if (indent == 0)
464		(void) printf("\nDirty time logs:\n\n");
465
466	spa_config_enter(spa, RW_READER);
467	(void) printf("\t%*s%s\n", indent, "", vdev_description(vd));
468	spa_config_exit(spa);
469
470	for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
471		/*
472		 * Everything in this DTL must appear in all parent DTL unions.
473		 */
474		for (pvd = vd; pvd; pvd = pvd->vdev_parent)
475			ASSERT(vdev_dtl_contains(&pvd->vdev_dtl_map,
476			    ss->ss_start, ss->ss_end - ss->ss_start));
477		(void) printf("\t%*soutage [%llu,%llu] length %llu\n",
478		    indent, "",
479		    (u_longlong_t)ss->ss_start,
480		    (u_longlong_t)ss->ss_end - 1,
481		    (u_longlong_t)ss->ss_end - ss->ss_start);
482	}
483
484	(void) printf("\n");
485
486	if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
487		dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl,
488		    &vd->vdev_dtl_map);
489		(void) printf("\n");
490	}
491
492	for (c = 0; c < vd->vdev_children; c++)
493		dump_dtl(vd->vdev_child[c], indent + 4);
494}
495
496/*ARGSUSED*/
497static void
498dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
499{
500}
501
502static uint64_t
503blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid)
504{
505	if (level < 0)
506		return (blkid);
507
508	return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
509	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
510}
511
512/* ARGSUSED */
513static int
514zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
515{
516	zbookmark_t *zb = &bc->bc_bookmark;
517	blkptr_t *bp = &bc->bc_blkptr;
518	dva_t *dva = &bp->blk_dva[0];
519	void *data = bc->bc_data;
520	dnode_phys_t *dnp = bc->bc_dnode;
521	char buffer[300];
522	int l;
523
524	if (bc->bc_errno) {
525		(void) sprintf(buffer,
526		    "Error %d reading <%llu, %llu, %d, %llu>: ",
527		    bc->bc_errno,
528		    (u_longlong_t)zb->zb_objset,
529		    (u_longlong_t)zb->zb_object,
530		    zb->zb_level,
531		    (u_longlong_t)zb->zb_blkid);
532		goto out;
533	}
534
535	if (zb->zb_level == -1) {
536		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
537		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
538	} else {
539		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
540		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
541	}
542
543	if (zb->zb_level > 0) {
544		uint64_t fill = 0;
545		blkptr_t *bpx, *bpend;
546
547		for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
548		    bpx < bpend; bpx++) {
549			if (bpx->blk_birth != 0) {
550				ASSERT(bpx->blk_fill > 0);
551				fill += bpx->blk_fill;
552			} else {
553				ASSERT(bpx->blk_fill == 0);
554			}
555		}
556		ASSERT3U(fill, ==, bp->blk_fill);
557	}
558
559	if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
560		uint64_t fill = 0;
561		dnode_phys_t *dnx, *dnend;
562
563		for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
564		    dnx < dnend; dnx++) {
565			if (dnx->dn_type != DMU_OT_NONE)
566				fill++;
567		}
568		ASSERT3U(fill, ==, bp->blk_fill);
569	}
570
571	(void) sprintf(buffer, "%16llx ",
572	    (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
573
574	ASSERT(zb->zb_level >= 0);
575
576	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
577		if (l == zb->zb_level) {
578			(void) sprintf(buffer + strlen(buffer), "L%x",
579			    zb->zb_level);
580		} else {
581			(void) sprintf(buffer + strlen(buffer), " ");
582		}
583	}
584
585out:
586	if (bp->blk_birth == 0) {
587		(void) sprintf(buffer + strlen(buffer), "<hole>");
588		(void) printf("%s\n", buffer);
589	} else {
590		// XXBP - Need to print number of active BPs here
591		(void) sprintf(buffer + strlen(buffer),
592		    "vdev=%llu off=%llx %llxL/%llxP/%llxA F=%llu B=%llu",
593		    (u_longlong_t)DVA_GET_VDEV(dva),
594		    (u_longlong_t)DVA_GET_OFFSET(dva),
595		    (u_longlong_t)BP_GET_LSIZE(bp),
596		    (u_longlong_t)BP_GET_PSIZE(bp),
597		    (u_longlong_t)DVA_GET_ASIZE(dva),
598		    (u_longlong_t)bp->blk_fill,
599		    (u_longlong_t)bp->blk_birth);
600
601		(void) printf("%s\n", buffer);
602	}
603
604	return (bc->bc_errno ? ERESTART : 0);
605}
606
607/*ARGSUSED*/
608static void
609dump_indirect(objset_t *os, uint64_t object, void *data, size_t size)
610{
611	traverse_handle_t *th;
612	uint64_t objset = dmu_objset_id(os);
613	int advance = zdb_advance;
614
615	(void) printf("Indirect blocks:\n");
616
617	if (object == 0)
618		advance |= ADVANCE_DATA;
619
620	th = traverse_init(dmu_objset_spa(os), zdb_indirect_cb, NULL, advance,
621	    ZIO_FLAG_CANFAIL);
622	th->th_noread = zdb_noread;
623
624	traverse_add_dnode(th, 0, -1ULL, objset, object);
625
626	while (traverse_more(th) == EAGAIN)
627		continue;
628
629	(void) printf("\n");
630
631	traverse_fini(th);
632}
633
634/*ARGSUSED*/
635static void
636dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
637{
638	dsl_dir_phys_t *dd = data;
639	time_t crtime;
640	char used[6], compressed[6], uncompressed[6], quota[6], resv[6];
641
642	if (dd == NULL)
643		return;
644
645	ASSERT(size == sizeof (*dd));
646
647	crtime = dd->dd_creation_time;
648	nicenum(dd->dd_used_bytes, used);
649	nicenum(dd->dd_compressed_bytes, compressed);
650	nicenum(dd->dd_uncompressed_bytes, uncompressed);
651	nicenum(dd->dd_quota, quota);
652	nicenum(dd->dd_reserved, resv);
653
654	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
655	(void) printf("\t\thead_dataset_obj = %llu\n",
656	    (u_longlong_t)dd->dd_head_dataset_obj);
657	(void) printf("\t\tparent_dir_obj = %llu\n",
658	    (u_longlong_t)dd->dd_parent_obj);
659	(void) printf("\t\tclone_parent_obj = %llu\n",
660	    (u_longlong_t)dd->dd_clone_parent_obj);
661	(void) printf("\t\tchild_dir_zapobj = %llu\n",
662	    (u_longlong_t)dd->dd_child_dir_zapobj);
663	(void) printf("\t\tused_bytes = %s\n", used);
664	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
665	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
666	(void) printf("\t\tquota = %s\n", quota);
667	(void) printf("\t\treserved = %s\n", resv);
668	(void) printf("\t\tprops_zapobj = %llu\n",
669	    (u_longlong_t)dd->dd_props_zapobj);
670}
671
672/*ARGSUSED*/
673static void
674dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
675{
676	dsl_dataset_phys_t *ds = data;
677	time_t crtime;
678	char used[6], compressed[6], uncompressed[6], unique[6];
679	char blkbuf[BP_SPRINTF_LEN];
680
681	if (ds == NULL)
682		return;
683
684	ASSERT(size == sizeof (*ds));
685	crtime = ds->ds_creation_time;
686	nicenum(ds->ds_used_bytes, used);
687	nicenum(ds->ds_compressed_bytes, compressed);
688	nicenum(ds->ds_uncompressed_bytes, uncompressed);
689	nicenum(ds->ds_unique_bytes, unique);
690	sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp);
691
692	(void) printf("\t\tdataset_obj = %llu\n",
693	    (u_longlong_t)ds->ds_dir_obj);
694	(void) printf("\t\tprev_snap_obj = %llu\n",
695	    (u_longlong_t)ds->ds_prev_snap_obj);
696	(void) printf("\t\tprev_snap_txg = %llu\n",
697	    (u_longlong_t)ds->ds_prev_snap_txg);
698	(void) printf("\t\tnext_snap_obj = %llu\n",
699	    (u_longlong_t)ds->ds_next_snap_obj);
700	(void) printf("\t\tsnapnames_zapobj = %llu\n",
701	    (u_longlong_t)ds->ds_snapnames_zapobj);
702	(void) printf("\t\tnum_children = %llu\n",
703	    (u_longlong_t)ds->ds_num_children);
704	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
705	(void) printf("\t\tcreation_txg = %llu\n",
706	    (u_longlong_t)ds->ds_creation_txg);
707	(void) printf("\t\tdeadlist_obj = %llu\n",
708	    (u_longlong_t)ds->ds_deadlist_obj);
709	(void) printf("\t\tused_bytes = %s\n", used);
710	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
711	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
712	(void) printf("\t\tunique = %s\n", unique);
713	(void) printf("\t\tfsid_guid = %llu\n",
714	    (u_longlong_t)ds->ds_fsid_guid);
715	(void) printf("\t\tguid = %llu\n",
716	    (u_longlong_t)ds->ds_guid);
717	(void) printf("\t\trestoring = %llu\n",
718	    (u_longlong_t)ds->ds_restoring);
719	(void) printf("\t\tbp = %s\n", blkbuf);
720}
721
722static void
723dump_bplist(objset_t *mos, uint64_t object, char *name)
724{
725	bplist_t bpl = { 0 };
726	blkptr_t blk, *bp = &blk;
727	uint64_t itor = 0;
728	char numbuf[6];
729
730	if (dump_opt['d'] < 3)
731		return;
732
733	bplist_open(&bpl, mos, object);
734	if (bplist_empty(&bpl)) {
735		bplist_close(&bpl);
736		return;
737	}
738
739	nicenum(bpl.bpl_phys->bpl_bytes, numbuf);
740
741	(void) printf("\n    %s: %llu entries, %s\n",
742	    name, (u_longlong_t)bpl.bpl_phys->bpl_entries, numbuf);
743
744	if (dump_opt['d'] < 5) {
745		bplist_close(&bpl);
746		return;
747	}
748
749	(void) printf("\n");
750
751	while (bplist_iterate(&bpl, &itor, bp) == 0) {
752		ASSERT(bp->blk_birth != 0);
753		// XXBP - Do we want to see all DVAs, or just one?
754		(void) printf("\tItem %3llu: vdev=%llu off=%llx "
755		    "%llxL/%llxP/%llxA F=%llu B=%llu\n",
756		    (u_longlong_t)itor - 1,
757		    (u_longlong_t)DVA_GET_VDEV(&bp->blk_dva[0]),
758		    (u_longlong_t)DVA_GET_OFFSET(&bp->blk_dva[0]),
759		    (u_longlong_t)BP_GET_LSIZE(bp),
760		    (u_longlong_t)BP_GET_PSIZE(bp),
761		    (u_longlong_t)DVA_GET_ASIZE(&bp->blk_dva[0]),
762		    (u_longlong_t)bp->blk_fill,
763		    (u_longlong_t)bp->blk_birth);
764	}
765
766	bplist_close(&bpl);
767}
768
769static char *
770znode_path(objset_t *os, uint64_t object, char *pathbuf, size_t size)
771{
772	dmu_buf_t *db;
773	dmu_object_info_t doi;
774	znode_phys_t *zp;
775	uint64_t parent = 0;
776	size_t complen;
777	char component[MAXNAMELEN + 1];
778	char *path;
779
780	path = pathbuf + size;
781	*--path = '\0';
782
783	for (;;) {
784		db = dmu_bonus_hold(os, object);
785		if (db == NULL)
786			break;
787
788		dmu_buf_read(db);
789		dmu_object_info_from_db(db, &doi);
790		zp = db->db_data;
791		parent = zp->zp_parent;
792		dmu_buf_rele(db);
793
794		if (doi.doi_bonus_type != DMU_OT_ZNODE)
795			break;
796
797		if (parent == object) {
798			if (path[0] != '/')
799				*--path = '/';
800			return (path);
801		}
802
803		if (zap_value_search(os, parent, object, component) != 0)
804			break;
805
806		complen = strlen(component);
807		path -= complen;
808		bcopy(component, path, complen);
809		*--path = '/';
810
811		object = parent;
812	}
813
814	(void) sprintf(component, "???<object#%llu>", (u_longlong_t)object);
815
816	complen = strlen(component);
817	path -= complen;
818	bcopy(component, path, complen);
819
820	return (path);
821}
822
823/*ARGSUSED*/
824static void
825dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
826{
827	znode_phys_t *zp = data;
828	time_t z_crtime, z_atime, z_mtime, z_ctime;
829	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
830
831	ASSERT(size >= sizeof (znode_phys_t));
832
833	if (dump_opt['d'] < 3) {
834		(void) printf("\t%s\n",
835		    znode_path(os, object, path, sizeof (path)));
836		return;
837	}
838
839	z_crtime = (time_t)zp->zp_crtime[0];
840	z_atime = (time_t)zp->zp_atime[0];
841	z_mtime = (time_t)zp->zp_mtime[0];
842	z_ctime = (time_t)zp->zp_ctime[0];
843
844	(void) printf("\tpath	%s\n",
845	    znode_path(os, object, path, sizeof (path)));
846	(void) printf("\tatime	%s", ctime(&z_atime));
847	(void) printf("\tmtime	%s", ctime(&z_mtime));
848	(void) printf("\tctime	%s", ctime(&z_ctime));
849	(void) printf("\tcrtime	%s", ctime(&z_crtime));
850	(void) printf("\tgen	%llu\n", (u_longlong_t)zp->zp_gen);
851	(void) printf("\tmode	%llo\n", (u_longlong_t)zp->zp_mode);
852	(void) printf("\tsize	%llu\n", (u_longlong_t)zp->zp_size);
853	(void) printf("\tparent	%llu\n", (u_longlong_t)zp->zp_parent);
854	(void) printf("\tlinks	%llu\n", (u_longlong_t)zp->zp_links);
855	(void) printf("\txattr	%llu\n", (u_longlong_t)zp->zp_xattr);
856	(void) printf("\trdev	0x%016llx\n", (u_longlong_t)zp->zp_rdev);
857}
858
859/*ARGSUSED*/
860static void
861dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
862{
863}
864
865/*ARGSUSED*/
866static void
867dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
868{
869}
870
871static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
872	dump_none,		/* unallocated			*/
873	dump_zap,		/* object directory		*/
874	dump_uint64,		/* object array			*/
875	dump_none,		/* packed nvlist		*/
876	dump_packed_nvlist,	/* packed nvlist size		*/
877	dump_none,		/* bplist			*/
878	dump_none,		/* bplist header		*/
879	dump_none,		/* SPA space map header		*/
880	dump_none,		/* SPA space map		*/
881	dump_none,		/* ZIL intent log		*/
882	dump_dnode,		/* DMU dnode			*/
883	dump_dmu_objset,	/* DMU objset			*/
884	dump_dsl_dir,	/* DSL directory			*/
885	dump_zap,		/* DSL directory child map	*/
886	dump_zap,		/* DSL dataset snap map		*/
887	dump_zap,		/* DSL props			*/
888	dump_dsl_dataset,	/* DSL dataset			*/
889	dump_znode,		/* ZFS znode			*/
890	dump_acl,		/* ZFS ACL			*/
891	dump_uint8,		/* ZFS plain file		*/
892	dump_zap,		/* ZFS directory		*/
893	dump_zap,		/* ZFS master node		*/
894	dump_zap,		/* ZFS delete queue		*/
895	dump_uint8,		/* zvol object			*/
896	dump_zap,		/* zvol prop			*/
897	dump_uint8,		/* other uint8[]		*/
898	dump_uint64,		/* other uint64[]		*/
899	dump_zap,		/* other ZAP			*/
900};
901
902static void
903dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
904{
905	dmu_buf_t *db = NULL;
906	dmu_object_info_t doi;
907	dnode_t *dn;
908	void *bonus = NULL;
909	size_t bsize = 0;
910	char iblk[6], dblk[6], lsize[6], psize[6], bonus_size[6], segsize[6];
911	char aux[50];
912	int error;
913
914	if (*print_header) {
915		(void) printf("\n    Object  lvl   iblk   dblk  lsize"
916		    "  psize  type\n");
917		*print_header = 0;
918	}
919
920	if (object == 0) {
921		dn = os->os->os_meta_dnode;
922	} else {
923		db = dmu_bonus_hold(os, object);
924		if (db == NULL)
925			fatal("dmu_bonus_hold(%llu) failed", object);
926		dmu_buf_read(db);
927		bonus = db->db_data;
928		bsize = db->db_size;
929		dn = ((dmu_buf_impl_t *)db)->db_dnode;
930	}
931	dmu_object_info_from_dnode(dn, &doi);
932
933	nicenum(doi.doi_metadata_block_size, iblk);
934	nicenum(doi.doi_data_block_size, dblk);
935	nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
936	    lsize);
937	nicenum(doi.doi_physical_blks << 9, psize);
938	nicenum(doi.doi_bonus_size, bonus_size);
939
940	aux[0] = '\0';
941
942	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6)
943		(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
944		zio_checksum_table[doi.doi_checksum].ci_name);
945
946	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6)
947		(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
948		zio_compress_table[doi.doi_compress].ci_name);
949
950	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %s%s\n",
951	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
952	    psize, dmu_ot[doi.doi_type].ot_name, aux);
953
954	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
955		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %s\n",
956		    "", "", "", "", bonus_size, "bonus",
957		    dmu_ot[doi.doi_bonus_type].ot_name);
958	}
959
960	if (verbosity >= 4) {
961		object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
962		object_viewer[doi.doi_type](os, object, NULL, 0);
963		*print_header = 1;
964	}
965
966	if (verbosity >= 5)
967		dump_indirect(os, object, NULL, 0);
968
969	if (verbosity >= 5) {
970		/*
971		 * Report the list of segments that comprise the object.
972		 */
973		uint64_t start = 0;
974		uint64_t end;
975		uint64_t blkfill = 1;
976		int minlvl = 1;
977
978		if (dn->dn_type == DMU_OT_DNODE) {
979			minlvl = 0;
980			blkfill = DNODES_PER_BLOCK;
981		}
982
983		for (;;) {
984			error = dnode_next_offset(dn, B_FALSE, &start, minlvl,
985			    blkfill);
986			if (error)
987				break;
988			end = start;
989			error = dnode_next_offset(dn, B_TRUE, &end, minlvl,
990			    blkfill);
991			nicenum(end - start, segsize);
992			(void) printf("\t\tsegment [%016llx, %016llx)"
993			    " size %5s\n", (u_longlong_t)start,
994			    (u_longlong_t)end, segsize);
995			if (error)
996				break;
997			start = end;
998		}
999	}
1000
1001	if (db != NULL)
1002		dmu_buf_rele(db);
1003}
1004
1005static char *objset_types[DMU_OST_NUMTYPES] = {
1006	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
1007
1008/*ARGSUSED*/
1009static void
1010dump_dir(objset_t *os)
1011{
1012	dmu_objset_stats_t dds;
1013	uint64_t object, object_count;
1014	char numbuf[8];
1015	char blkbuf[BP_SPRINTF_LEN];
1016	char osname[MAXNAMELEN];
1017	char *type = "UNKNOWN";
1018	int verbosity = dump_opt['d'];
1019	int print_header = 1;
1020	int i, error;
1021
1022	dmu_objset_stats(os, &dds);
1023
1024	if (dds.dds_type < DMU_OST_NUMTYPES)
1025		type = objset_types[dds.dds_type];
1026
1027	if (dds.dds_type == DMU_OST_META) {
1028		dds.dds_creation_txg = TXG_INITIAL;
1029		dds.dds_last_txg = os->os->os_rootbp.blk_birth;
1030		dds.dds_objects_used = os->os->os_rootbp.blk_fill;
1031		dds.dds_space_refd =
1032		    os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes;
1033	}
1034
1035	ASSERT3U(dds.dds_objects_used, ==, os->os->os_rootbp.blk_fill);
1036
1037	nicenum(dds.dds_space_refd, numbuf);
1038
1039	if (verbosity >= 4) {
1040		(void) strcpy(blkbuf, ", rootbp ");
1041		sprintf_blkptr(blkbuf + strlen(blkbuf),
1042		    BP_SPRINTF_LEN - strlen(blkbuf), &os->os->os_rootbp);
1043	} else {
1044		blkbuf[0] = '\0';
1045	}
1046
1047	dmu_objset_name(os, osname);
1048
1049	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, last_txg %llu, "
1050	    "%s, %llu objects%s\n",
1051	    osname, type, (u_longlong_t)dmu_objset_id(os),
1052	    (u_longlong_t)dds.dds_creation_txg,
1053	    (u_longlong_t)dds.dds_last_txg,
1054	    numbuf,
1055	    (u_longlong_t)dds.dds_objects_used,
1056	    blkbuf);
1057
1058	dump_intent_log(dmu_objset_zil(os));
1059
1060	if (dmu_objset_ds(os) != NULL)
1061		dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
1062		    dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
1063
1064	if (verbosity < 2)
1065		return;
1066
1067	if (zopt_objects != 0) {
1068		for (i = 0; i < zopt_objects; i++)
1069			dump_object(os, zopt_object[i], verbosity,
1070			    &print_header);
1071		(void) printf("\n");
1072		return;
1073	}
1074
1075	dump_object(os, 0, verbosity, &print_header);
1076	object_count = 1;
1077
1078	object = 0;
1079	while ((error = dmu_object_next(os, &object, B_FALSE)) == 0) {
1080		dump_object(os, object, verbosity, &print_header);
1081		object_count++;
1082	}
1083
1084	ASSERT3U(object_count, ==, dds.dds_objects_used);
1085
1086	(void) printf("\n");
1087
1088	if (error != ESRCH)
1089		fatal("dmu_object_next() = %d", error);
1090}
1091
1092static void
1093dump_uberblock(uberblock_t *ub)
1094{
1095	time_t timestamp = ub->ub_timestamp;
1096
1097	(void) printf("Uberblock\n\n");
1098	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
1099	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
1100	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
1101	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
1102	(void) printf("\ttimestamp = %llu UTC = %s",
1103	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
1104	if (dump_opt['u'] >= 3) {
1105		char blkbuf[BP_SPRINTF_LEN];
1106		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ub->ub_rootbp);
1107		(void) printf("\trootbp = %s\n", blkbuf);
1108	}
1109	(void) printf("\n");
1110}
1111
1112static void
1113dump_config(const char *pool)
1114{
1115	spa_t *spa = NULL;
1116
1117	mutex_enter(&spa_namespace_lock);
1118	while ((spa = spa_next(spa)) != NULL) {
1119		if (pool == NULL)
1120			(void) printf("%s\n", spa_name(spa));
1121		if (pool == NULL || strcmp(pool, spa_name(spa)) == 0)
1122			dump_nvlist(spa->spa_config, 4);
1123	}
1124	mutex_exit(&spa_namespace_lock);
1125}
1126
1127static void
1128dump_label(const char *dev)
1129{
1130	int fd;
1131	vdev_label_t label;
1132	char *buf = label.vl_vdev_phys.vp_nvlist;
1133	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
1134	struct stat64 statbuf;
1135	uint64_t psize;
1136	int l;
1137
1138	if ((fd = open(dev, O_RDONLY)) < 0) {
1139		(void) printf("cannot open '%s': %s\n", dev, strerror(errno));
1140		exit(1);
1141	}
1142
1143	if (fstat64(fd, &statbuf) != 0) {
1144		(void) printf("failed to stat '%s': %s\n", dev,
1145		    strerror(errno));
1146		exit(1);
1147	}
1148
1149	psize = statbuf.st_size;
1150	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
1151
1152	for (l = 0; l < VDEV_LABELS; l++) {
1153
1154		nvlist_t *config = NULL;
1155
1156		(void) printf("--------------------------------------------\n");
1157		(void) printf("LABEL %d\n", l);
1158		(void) printf("--------------------------------------------\n");
1159
1160		if (pread(fd, &label, sizeof (label),
1161		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
1162			(void) printf("failed to read label %d\n", l);
1163			continue;
1164		}
1165
1166		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
1167			(void) printf("failed to unpack label %d\n", l);
1168			continue;
1169		}
1170		dump_nvlist(config, 4);
1171		nvlist_free(config);
1172	}
1173}
1174
1175/*ARGSUSED*/
1176static void
1177dump_one_dir(char *dsname, void *arg)
1178{
1179	int error;
1180	objset_t *os;
1181
1182	error = dmu_objset_open(dsname, DMU_OST_ANY,
1183	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
1184	if (error) {
1185		(void) printf("Could not open %s\n", dsname);
1186		return;
1187	}
1188	dump_dir(os);
1189	dmu_objset_close(os);
1190}
1191
1192static void
1193zdb_space_map_load(spa_t *spa)
1194{
1195	vdev_t *rvd = spa->spa_root_vdev;
1196	vdev_t *vd;
1197	int c, m, error;
1198
1199	for (c = 0; c < rvd->vdev_children; c++) {
1200		vd = rvd->vdev_child[c];
1201		for (m = 0; m < vd->vdev_ms_count; m++) {
1202			metaslab_t *msp = vd->vdev_ms[m];
1203			space_map_t *sm = &msp->ms_allocmap[0];
1204			mutex_enter(&msp->ms_lock);
1205			error = space_map_load(sm, msp->ms_smo, SM_ALLOC,
1206			    spa->spa_meta_objset, msp->ms_usable_end,
1207			    sm->sm_size - msp->ms_usable_space);
1208			mutex_exit(&msp->ms_lock);
1209			if (error)
1210				fatal("%s bad space map #%d, error %d",
1211				    spa->spa_name, c, error);
1212		}
1213	}
1214}
1215
1216static int
1217zdb_space_map_claim(spa_t *spa, blkptr_t *bp)
1218{
1219	dva_t *dva = &bp->blk_dva[0];
1220	uint64_t vdev = DVA_GET_VDEV(dva);
1221	uint64_t offset = DVA_GET_OFFSET(dva);
1222	uint64_t size = DVA_GET_ASIZE(dva);
1223	vdev_t *vd;
1224	metaslab_t *msp;
1225	space_map_t *allocmap, *freemap;
1226	int error;
1227
1228	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
1229		return (ENXIO);
1230
1231	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1232		return (ENXIO);
1233
1234	if (DVA_GET_GANG(dva)) {
1235		zio_gbh_phys_t gbh;
1236		blkptr_t blk = *bp;
1237		int g;
1238
1239		/* LINTED - compile time assert */
1240		ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
1241		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1242		DVA_SET_GANG(&blk.blk_dva[0], 0);
1243		DVA_SET_ASIZE(&blk.blk_dva[0], size);
1244		BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER);
1245		BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE);
1246		BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE);
1247		BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF);
1248		error = zio_wait(zio_read(NULL, spa, &blk,
1249		    &gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1250		    ZIO_PRIORITY_SYNC_READ,
1251		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD));
1252		if (error)
1253			return (error);
1254		if (BP_SHOULD_BYTESWAP(&blk))
1255			byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE);
1256		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1257			if (gbh.zg_blkptr[g].blk_birth == 0)
1258				break;
1259			error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g]);
1260			if (error)
1261				return (error);
1262		}
1263	}
1264
1265	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1266	allocmap = &msp->ms_allocmap[0];
1267	freemap = &msp->ms_freemap[0];
1268
1269	mutex_enter(&msp->ms_lock);
1270	if (space_map_contains(freemap, offset, size)) {
1271		mutex_exit(&msp->ms_lock);
1272		return (EAGAIN);	/* allocated more than once */
1273	}
1274
1275	if (!space_map_contains(allocmap, offset, size)) {
1276		mutex_exit(&msp->ms_lock);
1277		return (ESTALE);	/* not allocated at all */
1278	}
1279
1280	space_map_remove(allocmap, offset, size);
1281	space_map_add(freemap, offset, size);
1282
1283	mutex_exit(&msp->ms_lock);
1284
1285	return (0);
1286}
1287
1288static void
1289zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
1290{
1291	metaslab_t *msp;
1292
1293	/* LINTED */
1294	msp = (metaslab_t *)((char *)sm - offsetof(metaslab_t, ms_allocmap[0]));
1295
1296	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
1297	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
1298	    (u_longlong_t)start,
1299	    (u_longlong_t)size);
1300}
1301
1302static void
1303zdb_space_map_vacate(spa_t *spa)
1304{
1305	vdev_t *rvd = spa->spa_root_vdev;
1306	vdev_t *vd;
1307	int c, m;
1308
1309	for (c = 0; c < rvd->vdev_children; c++) {
1310		vd = rvd->vdev_child[c];
1311		for (m = 0; m < vd->vdev_ms_count; m++) {
1312			metaslab_t *msp = vd->vdev_ms[m];
1313			mutex_enter(&msp->ms_lock);
1314			space_map_vacate(&msp->ms_allocmap[0], zdb_leak,
1315			    &msp->ms_allocmap[0]);
1316			space_map_vacate(&msp->ms_freemap[0], NULL, NULL);
1317			mutex_exit(&msp->ms_lock);
1318		}
1319	}
1320}
1321
1322static void
1323zdb_refresh_ubsync(spa_t *spa)
1324{
1325	uberblock_t ub = { 0 };
1326	vdev_t *rvd = spa->spa_root_vdev;
1327	zio_t *zio;
1328
1329	/*
1330	 * Reopen all devices to purge zdb's vdev caches.
1331	 */
1332	vdev_reopen(rvd, NULL);
1333
1334	/*
1335	 * Reload the uberblock.
1336	 */
1337	zio = zio_root(spa, NULL, NULL,
1338	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1339	vdev_uberblock_load(zio, rvd, &ub);
1340	(void) zio_wait(zio);
1341
1342	if (ub.ub_txg != 0)
1343		spa->spa_ubsync = ub;
1344}
1345
/*
 * Verify that the sum of the sizes of all blocks in the pool adds up
 * to the SPA's sa_alloc total.
 */

/*
 * Running totals for one class of blocks; zdb_count_block() adds each
 * block pointer it sees to the relevant instances.
 */
typedef struct zdb_blkstats {
	uint64_t	zb_asize;	/* sum of BP_GET_ASIZE() */
	uint64_t	zb_lsize;	/* sum of BP_GET_LSIZE() */
	uint64_t	zb_psize;	/* sum of BP_GET_PSIZE() */
	uint64_t	zb_count;	/* number of block pointers counted */
} zdb_blkstats_t;
1356
/*
 * Extra "type" slots in the statistics table: deferred-free blocks are
 * accounted under the DMU_OT_NONE slot, and the slot one past the last
 * real object type holds the total across all types.
 */
#define	DMU_OT_DEFERRED	DMU_OT_NONE
#define	DMU_OT_TOTAL	DMU_OT_NUMTYPES

/* Extra "level" slot holding the total across all levels. */
#define	ZB_TOTAL	ZB_MAXLEVEL

/*
 * State shared by the block traversal callbacks.
 */
typedef struct zdb_cb {
	/* statistics indexed by [level][type], including the total slots */
	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
	/* number of skipped blocks, indexed by the errno of the read */
	uint64_t	zcb_errors[256];
	/* cache entry of the objset block whose intent log is being parsed */
	traverse_blk_cache_t *zcb_cache;
	/* read failures seen since the last successful callback */
	int		zcb_readfails;
	/* set once any block has been skipped because of an error */
	int		zcb_haderrors;
} zdb_cb_t;

/* Declared ahead of its definition for use by zdb_log_block_cb(). */
static blkptr_cb_t zdb_blkptr_cb;
1371
1372static void
1373zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
1374{
1375	int i, error;
1376
1377	for (i = 0; i < 4; i++) {
1378		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
1379		int t = (i & 1) ? type : DMU_OT_TOTAL;
1380		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
1381
1382		zb->zb_asize += BP_GET_ASIZE(bp);
1383		zb->zb_lsize += BP_GET_LSIZE(bp);
1384		zb->zb_psize += BP_GET_PSIZE(bp);
1385		zb->zb_count++;
1386	}
1387
1388	if (dump_opt['L'])
1389		return;
1390
1391	error = zdb_space_map_claim(spa, bp);
1392
1393	if (error == 0)
1394		return;
1395
1396	if (error == EAGAIN)
1397		(void) fatal("double-allocation, bp=%p", bp);
1398
1399	if (error == ESTALE)
1400		(void) fatal("reference to freed block, bp=%p", bp);
1401
1402	(void) fatal("fatal error %d in bp %p", error, bp);
1403}
1404
1405static void
1406zdb_log_block_cb(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t first_txg)
1407{
1408	if (bp->blk_birth < first_txg) {
1409		zdb_cb_t *zcb = arg;
1410		traverse_blk_cache_t bc = *zcb->zcb_cache;
1411		zbookmark_t *zb = &bc.bc_bookmark;
1412
1413		zb->zb_objset = bp->blk_cksum.zc_word[2];
1414		zb->zb_blkid = bp->blk_cksum.zc_word[3];
1415		bc.bc_blkptr = *bp;
1416
1417		(void) zdb_blkptr_cb(&bc, zilog->zl_spa, arg);
1418	}
1419}
1420
1421static int
1422zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
1423{
1424	zbookmark_t *zb = &bc->bc_bookmark;
1425	zdb_cb_t *zcb = arg;
1426	blkptr_t *bp = &bc->bc_blkptr;
1427	dmu_object_type_t type = BP_GET_TYPE(bp);
1428	char blkbuf[BP_SPRINTF_LEN];
1429	int error = 0;
1430
1431	if (bc->bc_errno) {
1432		if (zcb->zcb_readfails++ < 10 && dump_opt['L']) {
1433			zdb_refresh_ubsync(spa);
1434			error = EAGAIN;
1435		} else {
1436			zcb->zcb_haderrors = 1;
1437			zcb->zcb_errors[bc->bc_errno]++;
1438			error = ERESTART;
1439		}
1440
1441		if (dump_opt['b'] >= 3 || (dump_opt['b'] >= 2 && bc->bc_errno))
1442			sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
1443		else
1444			blkbuf[0] = '\0';
1445
1446		(void) printf("zdb_blkptr_cb: Got error %d reading "
1447		    "<%llu, %llu, %d, %llx> %s -- %s\n",
1448		    bc->bc_errno,
1449		    (u_longlong_t)zb->zb_objset,
1450		    (u_longlong_t)zb->zb_object,
1451		    zb->zb_level,
1452		    (u_longlong_t)zb->zb_blkid,
1453		    blkbuf,
1454		    error == EAGAIN ? "retrying" : "skipping");
1455
1456		return (error);
1457	}
1458
1459	zcb->zcb_readfails = 0;
1460
1461	ASSERT(bp->blk_birth != 0);
1462
1463	zdb_count_block(spa, zcb, bp, type);
1464
1465	if (dump_opt['b'] >= 4) {
1466		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
1467		(void) printf("objset %llu object %llu offset 0x%llx %s\n",
1468		    (u_longlong_t)zb->zb_objset,
1469		    (u_longlong_t)zb->zb_object,
1470		    (u_longlong_t)blkid2offset(bc->bc_dnode,
1471			zb->zb_level, zb->zb_blkid),
1472		    blkbuf);
1473	}
1474
1475	if (type == DMU_OT_OBJSET) {
1476		objset_phys_t *osphys = bc->bc_data;
1477		zilog_t zilog = { 0 };
1478		zilog.zl_header = &osphys->os_zil_header;
1479		zilog.zl_spa = spa;
1480
1481		zcb->zcb_cache = bc;
1482
1483		zil_parse(&zilog, zdb_log_block_cb, NULL, zcb,
1484		    spa_first_txg(spa));
1485	}
1486
1487	return (0);
1488}
1489
1490static int
1491dump_block_stats(spa_t *spa)
1492{
1493	traverse_handle_t *th;
1494	zdb_cb_t zcb = { 0 };
1495	zdb_blkstats_t *zb, *tzb;
1496	uint64_t alloc, space;
1497	int leaks = 0;
1498	int advance = zdb_advance;
1499	int flags;
1500	int e;
1501
1502	if (dump_opt['c'])
1503		advance |= ADVANCE_DATA;
1504
1505	advance |= ADVANCE_PRUNE;
1506
1507	(void) printf("\nTraversing all blocks to %sverify"
1508	    " nothing leaked ...\n",
1509	    dump_opt['c'] ? "verify checksums and " : "");
1510
1511	/*
1512	 * Load all space maps.  As we traverse the pool, if we find a block
1513	 * that's not in its space map, that indicates a double-allocation,
1514	 * reference to a freed block, or an unclaimed block.  Otherwise we
1515	 * remove the block from the space map.  If the space maps are not
1516	 * empty when we're done, that indicates leaked blocks.
1517	 */
1518	if (!dump_opt['L'])
1519		zdb_space_map_load(spa);
1520
1521	/*
1522	 * If there's a deferred-free bplist, process that first.
1523	 */
1524	if (spa->spa_sync_bplist_obj != 0) {
1525		bplist_t *bpl = &spa->spa_sync_bplist;
1526		blkptr_t blk;
1527		uint64_t itor = 0;
1528
1529		bplist_open(bpl, spa->spa_meta_objset,
1530		    spa->spa_sync_bplist_obj);
1531
1532		while (bplist_iterate(bpl, &itor, &blk) == 0) {
1533			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
1534			if (dump_opt['b'] >= 4) {
1535				char blkbuf[BP_SPRINTF_LEN];
1536				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
1537				(void) printf("[%s] %s\n",
1538				    "deferred free", blkbuf);
1539			}
1540		}
1541
1542		bplist_close(bpl);
1543	}
1544
1545	/*
1546	 * Now traverse the pool.  If we're read all data to verify checksums,
1547	 * do a scrubbing read so that we validate all copies.
1548	 */
1549	flags = ZIO_FLAG_CANFAIL;
1550	if (advance & ADVANCE_DATA)
1551		flags |= ZIO_FLAG_SCRUB;
1552	th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags);
1553	th->th_noread = zdb_noread;
1554
1555	traverse_add_pool(th, 0, -1ULL);
1556
1557	while (traverse_more(th) == EAGAIN)
1558		continue;
1559
1560	traverse_fini(th);
1561
1562	if (zcb.zcb_haderrors) {
1563		(void) printf("\nError counts:\n\n");
1564		(void) printf("\t%5s  %s\n", "errno", "count");
1565		for (e = 0; e < 256; e++) {
1566			if (zcb.zcb_errors[e] != 0) {
1567				(void) printf("\t%5d  %llu\n",
1568				    e, (u_longlong_t)zcb.zcb_errors[e]);
1569			}
1570		}
1571	}
1572
1573	/*
1574	 * Report any leaked segments.
1575	 */
1576	if (!dump_opt['L'])
1577		zdb_space_map_vacate(spa);
1578
1579	if (dump_opt['L'])
1580		(void) printf("\n\n *** Live pool traversal; "
1581		    "block counts are only approximate ***\n\n");
1582
1583	alloc = spa_get_alloc(spa);
1584	space = spa_get_space(spa);
1585
1586	tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
1587
1588	if (tzb->zb_asize == alloc) {
1589		(void) printf("\n\tNo leaks (block sum matches space"
1590		    " maps exactly)\n");
1591	} else {
1592		(void) printf("block traversal size %llu != alloc %llu "
1593		    "(leaked %lld)\n",
1594		    (u_longlong_t)tzb->zb_asize,
1595		    (u_longlong_t)alloc,
1596		    (u_longlong_t)(alloc - tzb->zb_asize));
1597		leaks = 1;
1598	}
1599
1600	if (tzb->zb_count == 0)
1601		return (2);
1602
1603	(void) printf("\n");
1604	(void) printf("\tbp count:      %10llu\n",
1605	    (u_longlong_t)tzb->zb_count);
1606	(void) printf("\tbp logical:    %10llu\t avg: %6llu\n",
1607	    (u_longlong_t)tzb->zb_lsize,
1608	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
1609	(void) printf("\tbp physical:   %10llu\t avg:"
1610	    " %6llu\tcompression: %6.2f\n",
1611	    (u_longlong_t)tzb->zb_psize,
1612	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
1613	    (double)tzb->zb_lsize / tzb->zb_psize);
1614	(void) printf("\tbp allocated:  %10llu\t avg:"
1615	    " %6llu\tcompression: %6.2f\n",
1616	    (u_longlong_t)tzb->zb_asize,
1617	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
1618	    (double)tzb->zb_lsize / tzb->zb_asize);
1619	(void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
1620	    (u_longlong_t)alloc, 100.0 * alloc / space);
1621
1622	if (dump_opt['b'] >= 2) {
1623		int l, t, level;
1624		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
1625		    "\t  avg\t comp\t%%Total\tType\n");
1626
1627		for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
1628			char csize[6], lsize[6], psize[6], asize[6], avg[6];
1629			char *typename;
1630
1631			typename = t == DMU_OT_DEFERRED ? "deferred free" :
1632			    t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
1633
1634			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
1635				(void) printf("%6s\t%5s\t%5s\t%5s"
1636				    "\t%5s\t%5s\t%6s\t%s\n",
1637				    "-",
1638				    "-",
1639				    "-",
1640				    "-",
1641				    "-",
1642				    "-",
1643				    "-",
1644				    typename);
1645				continue;
1646			}
1647
1648			for (l = ZB_TOTAL - 1; l >= -1; l--) {
1649				level = (l == -1 ? ZB_TOTAL : l);
1650				zb = &zcb.zcb_type[level][t];
1651
1652				if (zb->zb_asize == 0)
1653					continue;
1654
1655				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
1656					continue;
1657
1658				if (level == 0 && zb->zb_asize ==
1659				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
1660					continue;
1661
1662				nicenum(zb->zb_count, csize);
1663				nicenum(zb->zb_lsize, lsize);
1664				nicenum(zb->zb_psize, psize);
1665				nicenum(zb->zb_asize, asize);
1666				nicenum(zb->zb_asize / zb->zb_count, avg);
1667
1668				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
1669				    "\t%5.2f\t%6.2f\t",
1670				    csize, lsize, psize, asize, avg,
1671				    (double)zb->zb_lsize / zb->zb_psize,
1672				    100.0 * zb->zb_asize / tzb->zb_asize);
1673
1674				if (level == ZB_TOTAL)
1675					(void) printf("%s\n", typename);
1676				else
1677					(void) printf("    L%d %s\n",
1678					    level, typename);
1679			}
1680		}
1681	}
1682
1683	(void) printf("\n");
1684
1685	if (leaks)
1686		return (2);
1687
1688	if (zcb.zcb_haderrors)
1689		return (3);
1690
1691	return (0);
1692}
1693
1694static void
1695dump_zpool(spa_t *spa)
1696{
1697	dsl_pool_t *dp = spa_get_dsl(spa);
1698	int rc = 0;
1699
1700	if (dump_opt['u'])
1701		dump_uberblock(&spa->spa_uberblock);
1702
1703	if (dump_opt['d'] || dump_opt['i']) {
1704		dump_dir(dp->dp_meta_objset);
1705		if (dump_opt['d'] >= 3) {
1706			dump_bplist(dp->dp_meta_objset,
1707			    spa->spa_sync_bplist_obj, "Deferred frees");
1708			dump_dtl(spa->spa_root_vdev, 0);
1709			dump_metaslabs(spa);
1710		}
1711		dmu_objset_find(spa->spa_name, dump_one_dir, NULL,
1712		    DS_FIND_SNAPSHOTS);
1713	}
1714
1715	if (dump_opt['b'] || dump_opt['c'])
1716		rc = dump_block_stats(spa);
1717
1718	if (dump_opt['s'])
1719		show_pool_stats(spa);
1720
1721	if (rc != 0)
1722		exit(rc);
1723}
1724
1725int
1726main(int argc, char **argv)
1727{
1728	int i, c;
1729	struct rlimit rl = { 1024, 1024 };
1730	spa_t *spa;
1731	objset_t *os = NULL;
1732	char *endstr;
1733	int dump_all = 1;
1734	int verbose = 0;
1735	int error;
1736	int flag, set;
1737
1738	(void) setrlimit(RLIMIT_NOFILE, &rl);
1739
1740	dprintf_setup(&argc, argv);
1741
1742	while ((c = getopt(argc, argv, "udibcsvCLO:B:Ul")) != -1) {
1743		switch (c) {
1744		case 'u':
1745		case 'd':
1746		case 'i':
1747		case 'b':
1748		case 'c':
1749		case 's':
1750		case 'C':
1751		case 'l':
1752			dump_opt[c]++;
1753			dump_all = 0;
1754			break;
1755		case 'L':
1756			dump_opt[c]++;
1757			break;
1758		case 'O':
1759			endstr = optarg;
1760			if (endstr[0] == '!') {
1761				endstr++;
1762				set = 0;
1763			} else {
1764				set = 1;
1765			}
1766			if (strcmp(endstr, "post") == 0) {
1767				flag = ADVANCE_PRE;
1768				set = !set;
1769			} else if (strcmp(endstr, "pre") == 0) {
1770				flag = ADVANCE_PRE;
1771			} else if (strcmp(endstr, "prune") == 0) {
1772				flag = ADVANCE_PRUNE;
1773			} else if (strcmp(endstr, "data") == 0) {
1774				flag = ADVANCE_DATA;
1775			} else if (strcmp(endstr, "holes") == 0) {
1776				flag = ADVANCE_HOLES;
1777			} else {
1778				usage();
1779			}
1780			if (set)
1781				zdb_advance |= flag;
1782			else
1783				zdb_advance &= ~flag;
1784			break;
1785		case 'B':
1786			endstr = optarg - 1;
1787			zdb_noread.zb_objset = strtoull(endstr + 1, &endstr, 0);
1788			zdb_noread.zb_object = strtoull(endstr + 1, &endstr, 0);
1789			zdb_noread.zb_level = strtol(endstr + 1, &endstr, 0);
1790			zdb_noread.zb_blkid = strtoull(endstr + 1, &endstr, 16);
1791			(void) printf("simulating bad block "
1792			    "<%llu, %llu, %d, %llx>\n",
1793			    (u_longlong_t)zdb_noread.zb_objset,
1794			    (u_longlong_t)zdb_noread.zb_object,
1795			    zdb_noread.zb_level,
1796			    (u_longlong_t)zdb_noread.zb_blkid);
1797			break;
1798		case 'v':
1799			verbose++;
1800			break;
1801		case 'U':
1802			spa_config_dir = "/tmp";
1803			break;
1804		default:
1805			usage();
1806			break;
1807		}
1808	}
1809
1810	kernel_init(FREAD);
1811
1812	for (c = 0; c < 256; c++) {
1813		if (dump_all && c != 'L' && c != 'l')
1814			dump_opt[c] = 1;
1815		if (dump_opt[c])
1816			dump_opt[c] += verbose;
1817	}
1818
1819	argc -= optind;
1820	argv += optind;
1821
1822	if (argc < 1) {
1823		if (dump_opt['C']) {
1824			dump_config(NULL);
1825			return (0);
1826		}
1827		usage();
1828	}
1829
1830	if (dump_opt['l']) {
1831		dump_label(argv[0]);
1832		return (0);
1833	}
1834
1835	if (dump_opt['C'])
1836		dump_config(argv[0]);
1837
1838	if (strchr(argv[0], '/') != NULL) {
1839		error = dmu_objset_open(argv[0], DMU_OST_ANY,
1840		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
1841	} else {
1842		error = spa_open(argv[0], &spa, FTAG);
1843	}
1844
1845	if (error)
1846		fatal("can't open %s: error %d", argv[0], error);
1847
1848	argv++;
1849	if (--argc > 0) {
1850		zopt_objects = argc;
1851		zopt_object = calloc(zopt_objects, sizeof (uint64_t));
1852		for (i = 0; i < zopt_objects; i++) {
1853			errno = 0;
1854			zopt_object[i] = strtoull(argv[i], NULL, 0);
1855			if (zopt_object[i] == 0 && errno != 0)
1856				fatal("bad object number %s: %s",
1857				    argv[i], strerror(errno));
1858		}
1859	}
1860
1861	if (os != NULL) {
1862		dump_dir(os);
1863		dmu_objset_close(os);
1864	} else {
1865		dump_zpool(spa);
1866		spa_close(spa, FTAG);
1867	}
1868
1869	kernel_fini();
1870
1871	return (0);
1872}
1873