zdb.c revision ea8dc4b6d2251b437950c0056bc626b311c73c27
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <stdio.h>
29#include <stdlib.h>
30#include <sys/zfs_context.h>
31#include <sys/spa.h>
32#include <sys/spa_impl.h>
33#include <sys/dmu.h>
34#include <sys/zap.h>
35#include <sys/fs/zfs.h>
36#include <sys/zfs_znode.h>
37#include <sys/vdev.h>
38#include <sys/vdev_impl.h>
39#include <sys/metaslab_impl.h>
40#include <sys/dmu_objset.h>
41#include <sys/dsl_dir.h>
42#include <sys/dsl_dataset.h>
43#include <sys/dsl_pool.h>
44#include <sys/dbuf.h>
45#include <sys/zil.h>
46#include <sys/zil_impl.h>
47#include <sys/stat.h>
48#include <sys/resource.h>
49#include <sys/dmu_traverse.h>
50#include <sys/zio_checksum.h>
51#include <sys/zio_compress.h>
52
/* Program name; used as the prefix of usage and fatal-error messages. */
const char cmdname[] = "zdb";
/*
 * Per-option counters indexed by option character (e.g. dump_opt['d']);
 * the dump routines compare these against verbosity thresholds.
 */
uint8_t dump_opt[256];

/*
 * Signature shared by every entry of the object_viewer[] table:
 * (objset, object number, data buffer, size of that buffer).
 */
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

/* Defined outside this file; called from dump_dir(). */
extern void dump_intent_log(zilog_t *);
/* Explicit list of objects to dump; dump_dir() walks it when non-empty. */
uint64_t *zopt_object = NULL;
/* Number of entries in zopt_object. */
int zopt_objects = 0;
/* Advance flags handed to traverse_init() by dump_indirect(). */
int zdb_advance = ADVANCE_PRE;
/*
 * Bookmark copied into th_noread by dump_indirect().
 * NOTE(review): presumably set by the -B "simulate bad block" option - confirm.
 */
zbookmark_t zdb_noread = { 0, 0, ZB_NO_LEVEL, 0 };
63
64/*
65 * These libumem hooks provide a reasonable set of defaults for the allocator's
66 * debugging facilities.
67 */
/*
 * Returns the string used as the allocator's $UMEM_DEBUG setting.
 * The parameter list is spelled (void) so the definition is a real
 * prototype, consistent with _umem_logging_init().
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}
73
/*
 * Returns the string used as the allocator's $UMEM_LOGGING setting.
 */
const char *
_umem_logging_init(void)
{
	static const char umem_logging[] = "fail,contents";

	return (umem_logging);
}
79
80static void
81usage(void)
82{
83	(void) fprintf(stderr,
84	    "Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] "
85	    "dataset [object...]\n"
86	    "       %s -C [pool]\n"
87	    "       %s -l dev\n",
88	    cmdname, cmdname, cmdname);
89
90	(void) fprintf(stderr, "	-u uberblock\n");
91	(void) fprintf(stderr, "	-d datasets\n");
92	(void) fprintf(stderr, "        -C cached pool configuration\n");
93	(void) fprintf(stderr, "	-i intent logs\n");
94	(void) fprintf(stderr, "	-b block statistics\n");
95	(void) fprintf(stderr, "	-c checksum all data blocks\n");
96	(void) fprintf(stderr, "	-s report stats on zdb's I/O\n");
97	(void) fprintf(stderr, "	-v verbose (applies to all others)\n");
98	(void) fprintf(stderr, "        -l dump label contents\n");
99	(void) fprintf(stderr, "	-L live pool (allows some errors)\n");
100	(void) fprintf(stderr, "	-O [!]<pre|post|prune|data|holes> "
101	    "visitation order\n");
102	(void) fprintf(stderr, "	-U use zpool.cache in /tmp\n");
103	(void) fprintf(stderr, "	-B objset:object:level:blkid -- "
104	    "simulate bad block\n");
105	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
106	    "to make only that option verbose\n");
107	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
108	exit(1);
109}
110
111static void
112fatal(const char *fmt, ...)
113{
114	va_list ap;
115
116	va_start(ap, fmt);
117	(void) fprintf(stderr, "%s: ", cmdname);
118	(void) vfprintf(stderr, fmt, ap);
119	va_end(ap);
120	(void) fprintf(stderr, "\n");
121
122	exit(1);
123}
124
125static void
126dump_nvlist(nvlist_t *list, int indent)
127{
128	nvpair_t *elem = NULL;
129
130	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
131		switch (nvpair_type(elem)) {
132		case DATA_TYPE_STRING:
133			{
134				char *value;
135
136				VERIFY(nvpair_value_string(elem, &value) == 0);
137				(void) printf("%*s%s='%s'\n", indent, "",
138				    nvpair_name(elem), value);
139			}
140			break;
141
142		case DATA_TYPE_UINT64:
143			{
144				uint64_t value;
145
146				VERIFY(nvpair_value_uint64(elem, &value) == 0);
147				(void) printf("%*s%s=%llu\n", indent, "",
148				    nvpair_name(elem), (u_longlong_t)value);
149			}
150			break;
151
152		case DATA_TYPE_NVLIST:
153			{
154				nvlist_t *value;
155
156				VERIFY(nvpair_value_nvlist(elem, &value) == 0);
157				(void) printf("%*s%s\n", indent, "",
158				    nvpair_name(elem));
159				dump_nvlist(value, indent + 4);
160			}
161			break;
162
163		case DATA_TYPE_NVLIST_ARRAY:
164			{
165				nvlist_t **value;
166				uint_t c, count;
167
168				VERIFY(nvpair_value_nvlist_array(elem, &value,
169				    &count) == 0);
170
171				for (c = 0; c < count; c++) {
172					(void) printf("%*s%s[%u]\n", indent, "",
173					    nvpair_name(elem), c);
174					dump_nvlist(value[c], indent + 8);
175				}
176			}
177			break;
178
179		default:
180
181			(void) printf("bad config type %d for %s\n",
182			    nvpair_type(elem), nvpair_name(elem));
183		}
184	}
185}
186
187/* ARGSUSED */
188static void
189dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
190{
191	nvlist_t *nv;
192	size_t nvsize = *(uint64_t *)data;
193	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
194
195	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed));
196
197	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
198
199	umem_free(packed, nvsize);
200
201	dump_nvlist(nv, 8);
202
203	nvlist_free(nv);
204}
205
/*
 * Bar used by dump_zap_histogram(): it prints a suffix of this string, so a
 * larger count starts closer to the beginning and shows more stars.
 */
const char dump_zap_stars[] = "****************************************";
/* Number of stars in dump_zap_stars (the terminating NUL is excluded). */
const int dump_zap_width = sizeof (dump_zap_stars) - 1;
208
209static void
210dump_zap_histogram(uint64_t histo[ZAP_HISTOGRAM_SIZE])
211{
212	int i;
213	int minidx = ZAP_HISTOGRAM_SIZE - 1;
214	int maxidx = 0;
215	uint64_t max = 0;
216
217	for (i = 0; i < ZAP_HISTOGRAM_SIZE; i++) {
218		if (histo[i] > max)
219			max = histo[i];
220		if (histo[i] > 0 && i > maxidx)
221			maxidx = i;
222		if (histo[i] > 0 && i < minidx)
223			minidx = i;
224	}
225
226	if (max < dump_zap_width)
227		max = dump_zap_width;
228
229	for (i = minidx; i <= maxidx; i++)
230		(void) printf("\t\t\t%u: %6llu %s\n", i, (u_longlong_t)histo[i],
231		    &dump_zap_stars[(max - histo[i]) * dump_zap_width / max]);
232}
233
234static void
235dump_zap_stats(objset_t *os, uint64_t object)
236{
237	int error;
238	zap_stats_t zs;
239
240	error = zap_get_stats(os, object, &zs);
241	if (error)
242		return;
243
244	if (zs.zs_ptrtbl_len == 0) {
245		ASSERT(zs.zs_num_blocks == 1);
246		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
247		    (u_longlong_t)zs.zs_blocksize,
248		    (u_longlong_t)zs.zs_num_entries);
249		return;
250	}
251
252	(void) printf("\tFat ZAP stats:\n");
253	(void) printf("\t\tPointer table: %llu elements\n",
254	    (u_longlong_t)zs.zs_ptrtbl_len);
255	(void) printf("\t\tZAP entries: %llu\n",
256	    (u_longlong_t)zs.zs_num_entries);
257	(void) printf("\t\tLeaf blocks: %llu\n",
258	    (u_longlong_t)zs.zs_num_leafs);
259	(void) printf("\t\tTotal blocks: %llu\n",
260	    (u_longlong_t)zs.zs_num_blocks);
261	(void) printf("\t\tOversize blocks: %llu\n",
262	    (u_longlong_t)zs.zs_num_blocks_large);
263
264	(void) printf("\t\tLeafs with 2^n pointers:\n");
265	dump_zap_histogram(zs.zs_leafs_with_2n_pointers);
266
267	(void) printf("\t\tLeafs with n chained:\n");
268	dump_zap_histogram(zs.zs_leafs_with_n_chained);
269
270	(void) printf("\t\tBlocks with n*5 entries:\n");
271	dump_zap_histogram(zs.zs_blocks_with_n5_entries);
272
273	(void) printf("\t\tBlocks n/10 full:\n");
274	dump_zap_histogram(zs.zs_blocks_n_tenths_full);
275
276	(void) printf("\t\tEntries with n chunks:\n");
277	dump_zap_histogram(zs.zs_entries_using_n_chunks);
278
279	(void) printf("\t\tBuckets with n entries:\n");
280	dump_zap_histogram(zs.zs_buckets_with_n_entries);
281}
282
283/*ARGSUSED*/
284static void
285dump_none(objset_t *os, uint64_t object, void *data, size_t size)
286{
287}
288
289/*ARGSUSED*/
290void
291dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
292{
293}
294
295/*ARGSUSED*/
296static void
297dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
298{
299}
300
301/*ARGSUSED*/
302static void
303dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
304{
305	zap_cursor_t zc;
306	zap_attribute_t attr;
307	void *prop;
308	int i;
309
310	dump_zap_stats(os, object);
311	(void) printf("\n");
312
313	for (zap_cursor_init(&zc, os, object);
314	    zap_cursor_retrieve(&zc, &attr) == 0;
315	    zap_cursor_advance(&zc)) {
316		(void) printf("\t\t%s = ", attr.za_name);
317		if (attr.za_num_integers == 0) {
318			(void) printf("\n");
319			continue;
320		}
321		prop = umem_zalloc(attr.za_num_integers *
322		    attr.za_integer_length, UMEM_NOFAIL);
323		(void) zap_lookup(os, object, attr.za_name,
324		    attr.za_integer_length, attr.za_num_integers, prop);
325		if (attr.za_integer_length == 1) {
326			(void) printf("%s", (char *)prop);
327		} else {
328			for (i = 0; i < attr.za_num_integers; i++) {
329				switch (attr.za_integer_length) {
330				case 2:
331					(void) printf("%u ",
332					    ((uint16_t *)prop)[i]);
333					break;
334				case 4:
335					(void) printf("%u ",
336					    ((uint32_t *)prop)[i]);
337					break;
338				case 8:
339					(void) printf("%lld ",
340					    (u_longlong_t)((int64_t *)prop)[i]);
341					break;
342				}
343			}
344		}
345		(void) printf("\n");
346		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
347	}
348	zap_cursor_fini(&zc);
349}
350
351static void
352dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
353{
354	uint64_t alloc, offset, entry;
355	int mapshift = sm->sm_shift;
356	uint64_t mapstart = sm->sm_start;
357	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID" };
358
359	if (smo->smo_object == 0)
360		return;
361
362	/*
363	 * Print out the freelist entries in both encoded and decoded form.
364	 */
365	alloc = 0;
366	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
367		VERIFY(0 == dmu_read(os, smo->smo_object, offset,
368		    sizeof (entry), &entry));
369		if (SM_DEBUG_DECODE(entry)) {
370			(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
371			    (u_longlong_t)(offset / sizeof (entry)),
372			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
373			    SM_DEBUG_TXG_DECODE(entry),
374			    SM_DEBUG_SYNCPASS_DECODE(entry));
375		} else {
376			(void) printf("\t\t[%4llu]    %c  range:"
377			    " %08llx-%08llx  size: %06llx\n",
378			    (u_longlong_t)(offset / sizeof (entry)),
379			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
380			    (SM_OFFSET_DECODE(entry) << mapshift) + mapstart,
381			    (SM_OFFSET_DECODE(entry) << mapshift) + mapstart +
382			    (SM_RUN_DECODE(entry) << mapshift),
383			    (SM_RUN_DECODE(entry) << mapshift));
384			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
385				alloc += SM_RUN_DECODE(entry) << mapshift;
386			else
387				alloc -= SM_RUN_DECODE(entry) << mapshift;
388		}
389	}
390	if (alloc != smo->smo_alloc) {
391		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
392		    "with space map summary (%llu)\n",
393		    (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc);
394	}
395}
396
397static void
398dump_metaslab(metaslab_t *msp)
399{
400	char freebuf[5];
401	space_map_obj_t *smo = msp->ms_smo;
402	vdev_t *vd = msp->ms_group->mg_vd;
403	spa_t *spa = vd->vdev_spa;
404
405	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
406
407	if (dump_opt['d'] <= 5) {
408		(void) printf("\t%10llx   %10llu   %5s\n",
409		    (u_longlong_t)msp->ms_map.sm_start,
410		    (u_longlong_t)smo->smo_object,
411		    freebuf);
412		return;
413	}
414
415	(void) printf(
416	    "\tvdev %llu   offset %08llx   spacemap %4llu   free %5s\n",
417	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
418	    (u_longlong_t)smo->smo_object, freebuf);
419
420	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
421
422	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
423}
424
425static void
426dump_metaslabs(spa_t *spa)
427{
428	vdev_t *rvd = spa->spa_root_vdev;
429	vdev_t *vd;
430	int c, m;
431
432	(void) printf("\nMetaslabs:\n");
433
434	for (c = 0; c < rvd->vdev_children; c++) {
435		vd = rvd->vdev_child[c];
436
437		spa_config_enter(spa, RW_READER, FTAG);
438		(void) printf("\n    vdev %llu = %s\n\n",
439		    (u_longlong_t)vd->vdev_id, vdev_description(vd));
440		spa_config_exit(spa, FTAG);
441
442		if (dump_opt['d'] <= 5) {
443			(void) printf("\t%10s   %10s   %5s\n",
444			    "offset", "spacemap", "free");
445			(void) printf("\t%10s   %10s   %5s\n",
446			    "------", "--------", "----");
447		}
448		for (m = 0; m < vd->vdev_ms_count; m++)
449			dump_metaslab(vd->vdev_ms[m]);
450		(void) printf("\n");
451	}
452}
453
454static void
455dump_dtl(vdev_t *vd, int indent)
456{
457	avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
458	spa_t *spa = vd->vdev_spa;
459	space_seg_t *ss;
460	vdev_t *pvd;
461	int c;
462
463	if (indent == 0)
464		(void) printf("\nDirty time logs:\n\n");
465
466	spa_config_enter(spa, RW_READER, FTAG);
467	(void) printf("\t%*s%s\n", indent, "", vdev_description(vd));
468	spa_config_exit(spa, FTAG);
469
470	for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
471		/*
472		 * Everything in this DTL must appear in all parent DTL unions.
473		 */
474		for (pvd = vd; pvd; pvd = pvd->vdev_parent)
475			ASSERT(vdev_dtl_contains(&pvd->vdev_dtl_map,
476			    ss->ss_start, ss->ss_end - ss->ss_start));
477		(void) printf("\t%*soutage [%llu,%llu] length %llu\n",
478		    indent, "",
479		    (u_longlong_t)ss->ss_start,
480		    (u_longlong_t)ss->ss_end - 1,
481		    (u_longlong_t)ss->ss_end - ss->ss_start);
482	}
483
484	(void) printf("\n");
485
486	if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
487		dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl,
488		    &vd->vdev_dtl_map);
489		(void) printf("\n");
490	}
491
492	for (c = 0; c < vd->vdev_children; c++)
493		dump_dtl(vd->vdev_child[c], indent + 4);
494}
495
496/*ARGSUSED*/
497static void
498dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
499{
500}
501
502static uint64_t
503blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid)
504{
505	if (level < 0)
506		return (blkid);
507
508	return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
509	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
510}
511
512/* ARGSUSED */
513static int
514zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
515{
516	zbookmark_t *zb = &bc->bc_bookmark;
517	blkptr_t *bp = &bc->bc_blkptr;
518	dva_t *dva = &bp->blk_dva[0];
519	void *data = bc->bc_data;
520	dnode_phys_t *dnp = bc->bc_dnode;
521	char buffer[300];
522	int l;
523
524	if (bc->bc_errno) {
525		(void) sprintf(buffer,
526		    "Error %d reading <%llu, %llu, %lld, %llu>: ",
527		    bc->bc_errno,
528		    (u_longlong_t)zb->zb_objset,
529		    (u_longlong_t)zb->zb_object,
530		    (u_longlong_t)zb->zb_level,
531		    (u_longlong_t)zb->zb_blkid);
532		goto out;
533	}
534
535	if (zb->zb_level == -1) {
536		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
537		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
538	} else {
539		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
540		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
541	}
542
543	if (zb->zb_level > 0) {
544		uint64_t fill = 0;
545		blkptr_t *bpx, *bpend;
546
547		for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
548		    bpx < bpend; bpx++) {
549			if (bpx->blk_birth != 0) {
550				fill += bpx->blk_fill;
551			} else {
552				ASSERT(bpx->blk_fill == 0);
553			}
554		}
555		ASSERT3U(fill, ==, bp->blk_fill);
556	}
557
558	if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
559		uint64_t fill = 0;
560		dnode_phys_t *dnx, *dnend;
561
562		for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
563		    dnx < dnend; dnx++) {
564			if (dnx->dn_type != DMU_OT_NONE)
565				fill++;
566		}
567		ASSERT3U(fill, ==, bp->blk_fill);
568	}
569
570	(void) sprintf(buffer, "%16llx ",
571	    (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
572
573	ASSERT(zb->zb_level >= 0);
574
575	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
576		if (l == zb->zb_level) {
577			(void) sprintf(buffer + strlen(buffer), "L%llx",
578			    (u_longlong_t)zb->zb_level);
579		} else {
580			(void) sprintf(buffer + strlen(buffer), " ");
581		}
582	}
583
584out:
585	if (bp->blk_birth == 0) {
586		(void) sprintf(buffer + strlen(buffer), "<hole>");
587		(void) printf("%s\n", buffer);
588	} else {
589		// XXBP - Need to print number of active BPs here
590		(void) sprintf(buffer + strlen(buffer),
591		    "vdev=%llu off=%llx %llxL/%llxP/%llxA F=%llu B=%llu",
592		    (u_longlong_t)DVA_GET_VDEV(dva),
593		    (u_longlong_t)DVA_GET_OFFSET(dva),
594		    (u_longlong_t)BP_GET_LSIZE(bp),
595		    (u_longlong_t)BP_GET_PSIZE(bp),
596		    (u_longlong_t)DVA_GET_ASIZE(dva),
597		    (u_longlong_t)bp->blk_fill,
598		    (u_longlong_t)bp->blk_birth);
599
600		(void) printf("%s\n", buffer);
601	}
602
603	return (bc->bc_errno ? ERESTART : 0);
604}
605
606/*ARGSUSED*/
607static void
608dump_indirect(objset_t *os, uint64_t object, void *data, size_t size)
609{
610	traverse_handle_t *th;
611	uint64_t objset = dmu_objset_id(os);
612	int advance = zdb_advance;
613
614	(void) printf("Indirect blocks:\n");
615
616	if (object == 0)
617		advance |= ADVANCE_DATA;
618
619	th = traverse_init(dmu_objset_spa(os), zdb_indirect_cb, NULL, advance,
620	    ZIO_FLAG_CANFAIL);
621	th->th_noread = zdb_noread;
622
623	traverse_add_dnode(th, 0, -1ULL, objset, object);
624
625	while (traverse_more(th) == EAGAIN)
626		continue;
627
628	(void) printf("\n");
629
630	traverse_fini(th);
631}
632
633/*ARGSUSED*/
634static void
635dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
636{
637	dsl_dir_phys_t *dd = data;
638	time_t crtime;
639	char used[6], compressed[6], uncompressed[6], quota[6], resv[6];
640
641	if (dd == NULL)
642		return;
643
644	ASSERT(size == sizeof (*dd));
645
646	crtime = dd->dd_creation_time;
647	nicenum(dd->dd_used_bytes, used);
648	nicenum(dd->dd_compressed_bytes, compressed);
649	nicenum(dd->dd_uncompressed_bytes, uncompressed);
650	nicenum(dd->dd_quota, quota);
651	nicenum(dd->dd_reserved, resv);
652
653	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
654	(void) printf("\t\thead_dataset_obj = %llu\n",
655	    (u_longlong_t)dd->dd_head_dataset_obj);
656	(void) printf("\t\tparent_dir_obj = %llu\n",
657	    (u_longlong_t)dd->dd_parent_obj);
658	(void) printf("\t\tclone_parent_obj = %llu\n",
659	    (u_longlong_t)dd->dd_clone_parent_obj);
660	(void) printf("\t\tchild_dir_zapobj = %llu\n",
661	    (u_longlong_t)dd->dd_child_dir_zapobj);
662	(void) printf("\t\tused_bytes = %s\n", used);
663	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
664	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
665	(void) printf("\t\tquota = %s\n", quota);
666	(void) printf("\t\treserved = %s\n", resv);
667	(void) printf("\t\tprops_zapobj = %llu\n",
668	    (u_longlong_t)dd->dd_props_zapobj);
669}
670
671/*ARGSUSED*/
672static void
673dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
674{
675	dsl_dataset_phys_t *ds = data;
676	time_t crtime;
677	char used[6], compressed[6], uncompressed[6], unique[6];
678	char blkbuf[BP_SPRINTF_LEN];
679
680	if (ds == NULL)
681		return;
682
683	ASSERT(size == sizeof (*ds));
684	crtime = ds->ds_creation_time;
685	nicenum(ds->ds_used_bytes, used);
686	nicenum(ds->ds_compressed_bytes, compressed);
687	nicenum(ds->ds_uncompressed_bytes, uncompressed);
688	nicenum(ds->ds_unique_bytes, unique);
689	sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp);
690
691	(void) printf("\t\tdataset_obj = %llu\n",
692	    (u_longlong_t)ds->ds_dir_obj);
693	(void) printf("\t\tprev_snap_obj = %llu\n",
694	    (u_longlong_t)ds->ds_prev_snap_obj);
695	(void) printf("\t\tprev_snap_txg = %llu\n",
696	    (u_longlong_t)ds->ds_prev_snap_txg);
697	(void) printf("\t\tnext_snap_obj = %llu\n",
698	    (u_longlong_t)ds->ds_next_snap_obj);
699	(void) printf("\t\tsnapnames_zapobj = %llu\n",
700	    (u_longlong_t)ds->ds_snapnames_zapobj);
701	(void) printf("\t\tnum_children = %llu\n",
702	    (u_longlong_t)ds->ds_num_children);
703	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
704	(void) printf("\t\tcreation_txg = %llu\n",
705	    (u_longlong_t)ds->ds_creation_txg);
706	(void) printf("\t\tdeadlist_obj = %llu\n",
707	    (u_longlong_t)ds->ds_deadlist_obj);
708	(void) printf("\t\tused_bytes = %s\n", used);
709	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
710	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
711	(void) printf("\t\tunique = %s\n", unique);
712	(void) printf("\t\tfsid_guid = %llu\n",
713	    (u_longlong_t)ds->ds_fsid_guid);
714	(void) printf("\t\tguid = %llu\n",
715	    (u_longlong_t)ds->ds_guid);
716	(void) printf("\t\trestoring = %llu\n",
717	    (u_longlong_t)ds->ds_restoring);
718	(void) printf("\t\tbp = %s\n", blkbuf);
719}
720
721static void
722dump_bplist(objset_t *mos, uint64_t object, char *name)
723{
724	bplist_t bpl = { 0 };
725	blkptr_t blk, *bp = &blk;
726	uint64_t itor = 0;
727	char numbuf[6];
728
729	if (dump_opt['d'] < 3)
730		return;
731
732	VERIFY(0 == bplist_open(&bpl, mos, object));
733	if (bplist_empty(&bpl)) {
734		bplist_close(&bpl);
735		return;
736	}
737
738	nicenum(bpl.bpl_phys->bpl_bytes, numbuf);
739
740	(void) printf("\n    %s: %llu entries, %s\n",
741	    name, (u_longlong_t)bpl.bpl_phys->bpl_entries, numbuf);
742
743	if (dump_opt['d'] < 5) {
744		bplist_close(&bpl);
745		return;
746	}
747
748	(void) printf("\n");
749
750	while (bplist_iterate(&bpl, &itor, bp) == 0) {
751		ASSERT(bp->blk_birth != 0);
752		// XXBP - Do we want to see all DVAs, or just one?
753		(void) printf("\tItem %3llu: vdev=%llu off=%llx "
754		    "%llxL/%llxP/%llxA F=%llu B=%llu\n",
755		    (u_longlong_t)itor - 1,
756		    (u_longlong_t)DVA_GET_VDEV(&bp->blk_dva[0]),
757		    (u_longlong_t)DVA_GET_OFFSET(&bp->blk_dva[0]),
758		    (u_longlong_t)BP_GET_LSIZE(bp),
759		    (u_longlong_t)BP_GET_PSIZE(bp),
760		    (u_longlong_t)DVA_GET_ASIZE(&bp->blk_dva[0]),
761		    (u_longlong_t)bp->blk_fill,
762		    (u_longlong_t)bp->blk_birth);
763	}
764
765	bplist_close(&bpl);
766}
767
768static char *
769znode_path(objset_t *os, uint64_t object, char *pathbuf, size_t size)
770{
771	dmu_buf_t *db;
772	dmu_object_info_t doi;
773	znode_phys_t *zp;
774	uint64_t parent = 0;
775	size_t complen;
776	char component[MAXNAMELEN + 1];
777	char *path;
778	int error;
779
780	path = pathbuf + size;
781	*--path = '\0';
782
783	for (;;) {
784		error = dmu_bonus_hold(os, object, FTAG, &db);
785		if (error)
786			break;
787
788		dmu_object_info_from_db(db, &doi);
789		zp = db->db_data;
790		parent = zp->zp_parent;
791		dmu_buf_rele(db, FTAG);
792
793		if (doi.doi_bonus_type != DMU_OT_ZNODE)
794			break;
795
796		if (parent == object) {
797			if (path[0] != '/')
798				*--path = '/';
799			return (path);
800		}
801
802		if (zap_value_search(os, parent, object, component) != 0)
803			break;
804
805		complen = strlen(component);
806		path -= complen;
807		bcopy(component, path, complen);
808		*--path = '/';
809
810		object = parent;
811	}
812
813	(void) sprintf(component, "???<object#%llu>", (u_longlong_t)object);
814
815	complen = strlen(component);
816	path -= complen;
817	bcopy(component, path, complen);
818
819	return (path);
820}
821
822/*ARGSUSED*/
823static void
824dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
825{
826	znode_phys_t *zp = data;
827	time_t z_crtime, z_atime, z_mtime, z_ctime;
828	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
829
830	ASSERT(size >= sizeof (znode_phys_t));
831
832	if (dump_opt['d'] < 3) {
833		(void) printf("\t%s\n",
834		    znode_path(os, object, path, sizeof (path)));
835		return;
836	}
837
838	z_crtime = (time_t)zp->zp_crtime[0];
839	z_atime = (time_t)zp->zp_atime[0];
840	z_mtime = (time_t)zp->zp_mtime[0];
841	z_ctime = (time_t)zp->zp_ctime[0];
842
843	(void) printf("\tpath	%s\n",
844	    znode_path(os, object, path, sizeof (path)));
845	(void) printf("\tatime	%s", ctime(&z_atime));
846	(void) printf("\tmtime	%s", ctime(&z_mtime));
847	(void) printf("\tctime	%s", ctime(&z_ctime));
848	(void) printf("\tcrtime	%s", ctime(&z_crtime));
849	(void) printf("\tgen	%llu\n", (u_longlong_t)zp->zp_gen);
850	(void) printf("\tmode	%llo\n", (u_longlong_t)zp->zp_mode);
851	(void) printf("\tsize	%llu\n", (u_longlong_t)zp->zp_size);
852	(void) printf("\tparent	%llu\n", (u_longlong_t)zp->zp_parent);
853	(void) printf("\tlinks	%llu\n", (u_longlong_t)zp->zp_links);
854	(void) printf("\txattr	%llu\n", (u_longlong_t)zp->zp_xattr);
855	(void) printf("\trdev	0x%016llx\n", (u_longlong_t)zp->zp_rdev);
856}
857
858/*ARGSUSED*/
859static void
860dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
861{
862}
863
864/*ARGSUSED*/
865static void
866dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
867{
868}
869
/*
 * Viewer dispatch table, indexed by DMU object type.  dump_object() calls
 * the entry for an object's bonus type with the bonus buffer, and the
 * entry for the object's own type with a NULL buffer, so every viewer
 * listed here may be handed NULL data.  The entries are positional: their
 * order must match the numeric order of the object types named in the
 * per-line comments.
 */
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
	dump_none,		/* unallocated			*/
	dump_zap,		/* object directory		*/
	dump_uint64,		/* object array			*/
	dump_none,		/* packed nvlist		*/
	dump_packed_nvlist,	/* packed nvlist size		*/
	dump_none,		/* bplist			*/
	dump_none,		/* bplist header		*/
	dump_none,		/* SPA space map header		*/
	dump_none,		/* SPA space map		*/
	dump_none,		/* ZIL intent log		*/
	dump_dnode,		/* DMU dnode			*/
	dump_dmu_objset,	/* DMU objset			*/
	dump_dsl_dir,		/* DSL directory		*/
	dump_zap,		/* DSL directory child map	*/
	dump_zap,		/* DSL dataset snap map		*/
	dump_zap,		/* DSL props			*/
	dump_dsl_dataset,	/* DSL dataset			*/
	dump_znode,		/* ZFS znode			*/
	dump_acl,		/* ZFS ACL			*/
	dump_uint8,		/* ZFS plain file		*/
	dump_zap,		/* ZFS directory		*/
	dump_zap,		/* ZFS master node		*/
	dump_zap,		/* ZFS delete queue		*/
	dump_uint8,		/* zvol object			*/
	dump_zap,		/* zvol prop			*/
	dump_uint8,		/* other uint8[]		*/
	dump_uint64,		/* other uint64[]		*/
	dump_zap,		/* other ZAP			*/
	dump_zap,		/* persistent error log		*/
};
901
902static void
903dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
904{
905	dmu_buf_t *db = NULL;
906	dmu_object_info_t doi;
907	dnode_t *dn;
908	void *bonus = NULL;
909	size_t bsize = 0;
910	char iblk[6], dblk[6], lsize[6], psize[6], bonus_size[6], segsize[6];
911	char aux[50];
912	int error;
913
914	if (*print_header) {
915		(void) printf("\n    Object  lvl   iblk   dblk  lsize"
916		    "  psize  type\n");
917		*print_header = 0;
918	}
919
920	if (object == 0) {
921		dn = os->os->os_meta_dnode;
922	} else {
923		error = dmu_bonus_hold(os, object, FTAG, &db);
924		if (error)
925			fatal("dmu_bonus_hold(%llu) failed, errno %u",
926			    object, error);
927		bonus = db->db_data;
928		bsize = db->db_size;
929		dn = ((dmu_buf_impl_t *)db)->db_dnode;
930	}
931	dmu_object_info_from_dnode(dn, &doi);
932
933	nicenum(doi.doi_metadata_block_size, iblk);
934	nicenum(doi.doi_data_block_size, dblk);
935	nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
936	    lsize);
937	nicenum(doi.doi_physical_blks << 9, psize);
938	nicenum(doi.doi_bonus_size, bonus_size);
939
940	aux[0] = '\0';
941
942	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6)
943		(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
944		zio_checksum_table[doi.doi_checksum].ci_name);
945
946	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6)
947		(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
948		zio_compress_table[doi.doi_compress].ci_name);
949
950	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %s%s\n",
951	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
952	    psize, dmu_ot[doi.doi_type].ot_name, aux);
953
954	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
955		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %s\n",
956		    "", "", "", "", bonus_size, "bonus",
957		    dmu_ot[doi.doi_bonus_type].ot_name);
958	}
959
960	if (verbosity >= 4) {
961		object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
962		object_viewer[doi.doi_type](os, object, NULL, 0);
963		*print_header = 1;
964	}
965
966	if (verbosity >= 5)
967		dump_indirect(os, object, NULL, 0);
968
969	if (verbosity >= 5) {
970		/*
971		 * Report the list of segments that comprise the object.
972		 */
973		uint64_t start = 0;
974		uint64_t end;
975		uint64_t blkfill = 1;
976		int minlvl = 1;
977
978		if (dn->dn_type == DMU_OT_DNODE) {
979			minlvl = 0;
980			blkfill = DNODES_PER_BLOCK;
981		}
982
983		for (;;) {
984			error = dnode_next_offset(dn, B_FALSE, &start, minlvl,
985			    blkfill);
986			if (error)
987				break;
988			end = start;
989			error = dnode_next_offset(dn, B_TRUE, &end, minlvl,
990			    blkfill);
991			nicenum(end - start, segsize);
992			(void) printf("\t\tsegment [%016llx, %016llx)"
993			    " size %5s\n", (u_longlong_t)start,
994			    (u_longlong_t)end, segsize);
995			if (error)
996				break;
997			start = end;
998		}
999	}
1000
1001	if (db != NULL)
1002		dmu_buf_rele(db, FTAG);
1003}
1004
/*
 * Printable names of the objset types; dump_dir() indexes this with
 * dds_type and falls back to "UNKNOWN" for values outside the table.
 */
static char *objset_types[DMU_OST_NUMTYPES] = {
	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
1007
1008/*ARGSUSED*/
1009static void
1010dump_dir(objset_t *os)
1011{
1012	dmu_objset_stats_t dds;
1013	uint64_t object, object_count;
1014	char numbuf[8];
1015	char blkbuf[BP_SPRINTF_LEN];
1016	char osname[MAXNAMELEN];
1017	char *type = "UNKNOWN";
1018	int verbosity = dump_opt['d'];
1019	int print_header = 1;
1020	int i, error;
1021
1022	dmu_objset_stats(os, &dds);
1023
1024	if (dds.dds_type < DMU_OST_NUMTYPES)
1025		type = objset_types[dds.dds_type];
1026
1027	if (dds.dds_type == DMU_OST_META) {
1028		dds.dds_creation_txg = TXG_INITIAL;
1029		dds.dds_last_txg = os->os->os_rootbp.blk_birth;
1030		dds.dds_objects_used = os->os->os_rootbp.blk_fill;
1031		dds.dds_space_refd =
1032		    os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes;
1033	}
1034
1035	ASSERT3U(dds.dds_objects_used, ==, os->os->os_rootbp.blk_fill);
1036
1037	nicenum(dds.dds_space_refd, numbuf);
1038
1039	if (verbosity >= 4) {
1040		(void) strcpy(blkbuf, ", rootbp ");
1041		sprintf_blkptr(blkbuf + strlen(blkbuf),
1042		    BP_SPRINTF_LEN - strlen(blkbuf), &os->os->os_rootbp);
1043	} else {
1044		blkbuf[0] = '\0';
1045	}
1046
1047	dmu_objset_name(os, osname);
1048
1049	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, last_txg %llu, "
1050	    "%s, %llu objects%s\n",
1051	    osname, type, (u_longlong_t)dmu_objset_id(os),
1052	    (u_longlong_t)dds.dds_creation_txg,
1053	    (u_longlong_t)dds.dds_last_txg,
1054	    numbuf,
1055	    (u_longlong_t)dds.dds_objects_used,
1056	    blkbuf);
1057
1058	dump_intent_log(dmu_objset_zil(os));
1059
1060	if (dmu_objset_ds(os) != NULL)
1061		dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
1062		    dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
1063
1064	if (verbosity < 2)
1065		return;
1066
1067	if (zopt_objects != 0) {
1068		for (i = 0; i < zopt_objects; i++)
1069			dump_object(os, zopt_object[i], verbosity,
1070			    &print_header);
1071		(void) printf("\n");
1072		return;
1073	}
1074
1075	dump_object(os, 0, verbosity, &print_header);
1076	object_count = 1;
1077
1078	object = 0;
1079	while ((error = dmu_object_next(os, &object, B_FALSE)) == 0) {
1080		dump_object(os, object, verbosity, &print_header);
1081		object_count++;
1082	}
1083
1084	ASSERT3U(object_count, ==, dds.dds_objects_used);
1085
1086	(void) printf("\n");
1087
1088	if (error != ESRCH)
1089		fatal("dmu_object_next() = %d", error);
1090}
1091
1092static void
1093dump_uberblock(uberblock_t *ub)
1094{
1095	time_t timestamp = ub->ub_timestamp;
1096
1097	(void) printf("Uberblock\n\n");
1098	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
1099	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
1100	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
1101	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
1102	(void) printf("\ttimestamp = %llu UTC = %s",
1103	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
1104	if (dump_opt['u'] >= 3) {
1105		char blkbuf[BP_SPRINTF_LEN];
1106		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ub->ub_rootbp);
1107		(void) printf("\trootbp = %s\n", blkbuf);
1108	}
1109	(void) printf("\n");
1110}
1111
1112static void
1113dump_config(const char *pool)
1114{
1115	spa_t *spa = NULL;
1116
1117	mutex_enter(&spa_namespace_lock);
1118	while ((spa = spa_next(spa)) != NULL) {
1119		if (pool == NULL)
1120			(void) printf("%s\n", spa_name(spa));
1121		if (pool == NULL || strcmp(pool, spa_name(spa)) == 0)
1122			dump_nvlist(spa->spa_config, 4);
1123	}
1124	mutex_exit(&spa_namespace_lock);
1125}
1126
1127static void
1128dump_label(const char *dev)
1129{
1130	int fd;
1131	vdev_label_t label;
1132	char *buf = label.vl_vdev_phys.vp_nvlist;
1133	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
1134	struct stat64 statbuf;
1135	uint64_t psize;
1136	int l;
1137
1138	if ((fd = open64(dev, O_RDONLY)) < 0) {
1139		(void) printf("cannot open '%s': %s\n", dev, strerror(errno));
1140		exit(1);
1141	}
1142
1143	if (fstat64(fd, &statbuf) != 0) {
1144		(void) printf("failed to stat '%s': %s\n", dev,
1145		    strerror(errno));
1146		exit(1);
1147	}
1148
1149	psize = statbuf.st_size;
1150	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
1151
1152	for (l = 0; l < VDEV_LABELS; l++) {
1153
1154		nvlist_t *config = NULL;
1155
1156		(void) printf("--------------------------------------------\n");
1157		(void) printf("LABEL %d\n", l);
1158		(void) printf("--------------------------------------------\n");
1159
1160		if (pread64(fd, &label, sizeof (label),
1161		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
1162			(void) printf("failed to read label %d\n", l);
1163			continue;
1164		}
1165
1166		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
1167			(void) printf("failed to unpack label %d\n", l);
1168			continue;
1169		}
1170		dump_nvlist(config, 4);
1171		nvlist_free(config);
1172	}
1173}
1174
1175/*ARGSUSED*/
1176static void
1177dump_one_dir(char *dsname, void *arg)
1178{
1179	int error;
1180	objset_t *os;
1181
1182	error = dmu_objset_open(dsname, DMU_OST_ANY,
1183	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
1184	if (error) {
1185		(void) printf("Could not open %s\n", dsname);
1186		return;
1187	}
1188	dump_dir(os);
1189	dmu_objset_close(os);
1190}
1191
1192static void
1193zdb_space_map_load(spa_t *spa)
1194{
1195	vdev_t *rvd = spa->spa_root_vdev;
1196	vdev_t *vd;
1197	int c, m, error;
1198
1199	for (c = 0; c < rvd->vdev_children; c++) {
1200		vd = rvd->vdev_child[c];
1201		for (m = 0; m < vd->vdev_ms_count; m++) {
1202			metaslab_t *msp = vd->vdev_ms[m];
1203			space_map_t *sm = &msp->ms_allocmap[0];
1204			mutex_enter(&msp->ms_lock);
1205			error = space_map_load(sm, msp->ms_smo, SM_ALLOC,
1206			    spa->spa_meta_objset, msp->ms_usable_end,
1207			    sm->sm_size - msp->ms_usable_space);
1208			mutex_exit(&msp->ms_lock);
1209			if (error)
1210				fatal("%s bad space map #%d, error %d",
1211				    spa->spa_name, c, error);
1212		}
1213	}
1214}
1215
1216static int
1217zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb)
1218{
1219	dva_t *dva = &bp->blk_dva[0];
1220	uint64_t vdev = DVA_GET_VDEV(dva);
1221	uint64_t offset = DVA_GET_OFFSET(dva);
1222	uint64_t size = DVA_GET_ASIZE(dva);
1223	vdev_t *vd;
1224	metaslab_t *msp;
1225	space_map_t *allocmap, *freemap;
1226	int error;
1227
1228	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
1229		return (ENXIO);
1230
1231	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1232		return (ENXIO);
1233
1234	if (DVA_GET_GANG(dva)) {
1235		zio_gbh_phys_t gbh;
1236		blkptr_t blk = *bp;
1237		int g;
1238
1239		/* LINTED - compile time assert */
1240		ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
1241		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1242		DVA_SET_GANG(&blk.blk_dva[0], 0);
1243		DVA_SET_ASIZE(&blk.blk_dva[0], size);
1244		BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER);
1245		BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE);
1246		BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE);
1247		BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF);
1248		error = zio_wait(zio_read(NULL, spa, &blk,
1249		    &gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1250		    ZIO_PRIORITY_SYNC_READ,
1251		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD, zb));
1252		if (error)
1253			return (error);
1254		if (BP_SHOULD_BYTESWAP(&blk))
1255			byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE);
1256		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1257			if (gbh.zg_blkptr[g].blk_birth == 0)
1258				break;
1259			error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g], zb);
1260			if (error)
1261				return (error);
1262		}
1263	}
1264
1265	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1266	allocmap = &msp->ms_allocmap[0];
1267	freemap = &msp->ms_freemap[0];
1268
1269	mutex_enter(&msp->ms_lock);
1270	if (space_map_contains(freemap, offset, size)) {
1271		mutex_exit(&msp->ms_lock);
1272		return (EAGAIN);	/* allocated more than once */
1273	}
1274
1275	if (!space_map_contains(allocmap, offset, size)) {
1276		mutex_exit(&msp->ms_lock);
1277		return (ESTALE);	/* not allocated at all */
1278	}
1279
1280	space_map_remove(allocmap, offset, size);
1281	space_map_add(freemap, offset, size);
1282
1283	mutex_exit(&msp->ms_lock);
1284
1285	return (0);
1286}
1287
1288static void
1289zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
1290{
1291	metaslab_t *msp;
1292
1293	/* LINTED */
1294	msp = (metaslab_t *)((char *)sm - offsetof(metaslab_t, ms_allocmap[0]));
1295
1296	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
1297	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
1298	    (u_longlong_t)start,
1299	    (u_longlong_t)size);
1300}
1301
1302static void
1303zdb_space_map_vacate(spa_t *spa)
1304{
1305	vdev_t *rvd = spa->spa_root_vdev;
1306	vdev_t *vd;
1307	int c, m;
1308
1309	for (c = 0; c < rvd->vdev_children; c++) {
1310		vd = rvd->vdev_child[c];
1311		for (m = 0; m < vd->vdev_ms_count; m++) {
1312			metaslab_t *msp = vd->vdev_ms[m];
1313			mutex_enter(&msp->ms_lock);
1314			space_map_vacate(&msp->ms_allocmap[0], zdb_leak,
1315			    &msp->ms_allocmap[0]);
1316			space_map_vacate(&msp->ms_freemap[0], NULL, NULL);
1317			mutex_exit(&msp->ms_lock);
1318		}
1319	}
1320}
1321
1322static void
1323zdb_refresh_ubsync(spa_t *spa)
1324{
1325	uberblock_t ub = { 0 };
1326	vdev_t *rvd = spa->spa_root_vdev;
1327	zio_t *zio;
1328
1329	/*
1330	 * Reload the uberblock.
1331	 */
1332	zio = zio_root(spa, NULL, NULL,
1333	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1334	vdev_uberblock_load(zio, rvd, &ub);
1335	(void) zio_wait(zio);
1336
1337	if (ub.ub_txg != 0)
1338		spa->spa_ubsync = ub;
1339}
1340
/*
 * Block accounting, used to verify that the sizes of all blocks in the
 * pool add up to the SPA's allocated total.  Each zdb_blkstats_t
 * accumulates the sizes and the number of the block pointers counted
 * into it.
 */
struct zdb_blkstats {
	uint64_t	zb_asize;	/* sum of allocated sizes */
	uint64_t	zb_lsize;	/* sum of logical sizes */
	uint64_t	zb_psize;	/* sum of physical sizes */
	uint64_t	zb_count;	/* number of block pointers */
};
typedef struct zdb_blkstats zdb_blkstats_t;
1351
1352#define	DMU_OT_DEFERRED	DMU_OT_NONE
1353#define	DMU_OT_TOTAL	DMU_OT_NUMTYPES
1354
1355#define	ZB_TOTAL	ZB_MAXLEVEL
1356
1357typedef struct zdb_cb {
1358	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
1359	uint64_t	zcb_errors[256];
1360	traverse_blk_cache_t *zcb_cache;
1361	int		zcb_readfails;
1362	int		zcb_haderrors;
1363} zdb_cb_t;
1364
1365static void
1366zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
1367{
1368	int i, error;
1369
1370	for (i = 0; i < 4; i++) {
1371		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
1372		int t = (i & 1) ? type : DMU_OT_TOTAL;
1373		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
1374
1375		zb->zb_asize += BP_GET_ASIZE(bp);
1376		zb->zb_lsize += BP_GET_LSIZE(bp);
1377		zb->zb_psize += BP_GET_PSIZE(bp);
1378		zb->zb_count++;
1379	}
1380
1381	if (dump_opt['L'])
1382		return;
1383
1384	error = zdb_space_map_claim(spa, bp, &zcb->zcb_cache->bc_bookmark);
1385
1386	if (error == 0)
1387		return;
1388
1389	if (error == EAGAIN)
1390		(void) fatal("double-allocation, bp=%p", bp);
1391
1392	if (error == ESTALE)
1393		(void) fatal("reference to freed block, bp=%p", bp);
1394
1395	(void) fatal("fatal error %d in bp %p", error, bp);
1396}
1397
1398static int
1399zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
1400{
1401	zbookmark_t *zb = &bc->bc_bookmark;
1402	zdb_cb_t *zcb = arg;
1403	blkptr_t *bp = &bc->bc_blkptr;
1404	dmu_object_type_t type = BP_GET_TYPE(bp);
1405	char blkbuf[BP_SPRINTF_LEN];
1406	int error = 0;
1407
1408	if (bc->bc_errno) {
1409		if (zcb->zcb_readfails++ < 10 && dump_opt['L']) {
1410			zdb_refresh_ubsync(spa);
1411			error = EAGAIN;
1412		} else {
1413			zcb->zcb_haderrors = 1;
1414			zcb->zcb_errors[bc->bc_errno]++;
1415			error = ERESTART;
1416		}
1417
1418		if (dump_opt['b'] >= 3 || (dump_opt['b'] >= 2 && bc->bc_errno))
1419			sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
1420		else
1421			blkbuf[0] = '\0';
1422
1423		(void) printf("zdb_blkptr_cb: Got error %d reading "
1424		    "<%llu, %llu, %lld, %llx> %s -- %s\n",
1425		    bc->bc_errno,
1426		    (u_longlong_t)zb->zb_objset,
1427		    (u_longlong_t)zb->zb_object,
1428		    (u_longlong_t)zb->zb_level,
1429		    (u_longlong_t)zb->zb_blkid,
1430		    blkbuf,
1431		    error == EAGAIN ? "retrying" : "skipping");
1432
1433		return (error);
1434	}
1435
1436	zcb->zcb_readfails = 0;
1437
1438	ASSERT(bp->blk_birth != 0);
1439
1440	zdb_count_block(spa, zcb, bp, type);
1441
1442	if (dump_opt['b'] >= 4) {
1443		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
1444		(void) printf("objset %llu object %llu offset 0x%llx %s\n",
1445		    (u_longlong_t)zb->zb_objset,
1446		    (u_longlong_t)zb->zb_object,
1447		    (u_longlong_t)blkid2offset(bc->bc_dnode,
1448			zb->zb_level, zb->zb_blkid),
1449		    blkbuf);
1450	}
1451
1452	return (0);
1453}
1454
1455static int
1456dump_block_stats(spa_t *spa)
1457{
1458	traverse_handle_t *th;
1459	zdb_cb_t zcb = { 0 };
1460	traverse_blk_cache_t dummy_cache = { 0 };
1461	zdb_blkstats_t *zb, *tzb;
1462	uint64_t alloc, space;
1463	int leaks = 0;
1464	int advance = zdb_advance;
1465	int flags;
1466	int e;
1467
1468	zcb.zcb_cache = &dummy_cache;
1469
1470	if (dump_opt['c'])
1471		advance |= ADVANCE_DATA;
1472
1473	advance |= ADVANCE_PRUNE | ADVANCE_ZIL;
1474
1475	(void) printf("\nTraversing all blocks to %sverify"
1476	    " nothing leaked ...\n",
1477	    dump_opt['c'] ? "verify checksums and " : "");
1478
1479	/*
1480	 * Load all space maps.  As we traverse the pool, if we find a block
1481	 * that's not in its space map, that indicates a double-allocation,
1482	 * reference to a freed block, or an unclaimed block.  Otherwise we
1483	 * remove the block from the space map.  If the space maps are not
1484	 * empty when we're done, that indicates leaked blocks.
1485	 */
1486	if (!dump_opt['L'])
1487		zdb_space_map_load(spa);
1488
1489	/*
1490	 * If there's a deferred-free bplist, process that first.
1491	 */
1492	if (spa->spa_sync_bplist_obj != 0) {
1493		bplist_t *bpl = &spa->spa_sync_bplist;
1494		blkptr_t blk;
1495		uint64_t itor = 0;
1496
1497		VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
1498		    spa->spa_sync_bplist_obj));
1499
1500		while (bplist_iterate(bpl, &itor, &blk) == 0) {
1501			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
1502			if (dump_opt['b'] >= 4) {
1503				char blkbuf[BP_SPRINTF_LEN];
1504				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
1505				(void) printf("[%s] %s\n",
1506				    "deferred free", blkbuf);
1507			}
1508		}
1509
1510		bplist_close(bpl);
1511	}
1512
1513	/*
1514	 * Now traverse the pool.  If we're reading all data to verify
1515	 * checksums, do a scrubbing read so that we validate all copies.
1516	 */
1517	flags = ZIO_FLAG_CANFAIL;
1518	if (advance & ADVANCE_DATA)
1519		flags |= ZIO_FLAG_SCRUB;
1520	th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags);
1521	th->th_noread = zdb_noread;
1522
1523	traverse_add_pool(th, 0, spa_first_txg(spa));
1524
1525	while (traverse_more(th) == EAGAIN)
1526		continue;
1527
1528	traverse_fini(th);
1529
1530	if (zcb.zcb_haderrors) {
1531		(void) printf("\nError counts:\n\n");
1532		(void) printf("\t%5s  %s\n", "errno", "count");
1533		for (e = 0; e < 256; e++) {
1534			if (zcb.zcb_errors[e] != 0) {
1535				(void) printf("\t%5d  %llu\n",
1536				    e, (u_longlong_t)zcb.zcb_errors[e]);
1537			}
1538		}
1539	}
1540
1541	/*
1542	 * Report any leaked segments.
1543	 */
1544	if (!dump_opt['L'])
1545		zdb_space_map_vacate(spa);
1546
1547	if (dump_opt['L'])
1548		(void) printf("\n\n *** Live pool traversal; "
1549		    "block counts are only approximate ***\n\n");
1550
1551	alloc = spa_get_alloc(spa);
1552	space = spa_get_space(spa);
1553
1554	tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
1555
1556	if (tzb->zb_asize == alloc) {
1557		(void) printf("\n\tNo leaks (block sum matches space"
1558		    " maps exactly)\n");
1559	} else {
1560		(void) printf("block traversal size %llu != alloc %llu "
1561		    "(leaked %lld)\n",
1562		    (u_longlong_t)tzb->zb_asize,
1563		    (u_longlong_t)alloc,
1564		    (u_longlong_t)(alloc - tzb->zb_asize));
1565		leaks = 1;
1566	}
1567
1568	if (tzb->zb_count == 0)
1569		return (2);
1570
1571	(void) printf("\n");
1572	(void) printf("\tbp count:      %10llu\n",
1573	    (u_longlong_t)tzb->zb_count);
1574	(void) printf("\tbp logical:    %10llu\t avg: %6llu\n",
1575	    (u_longlong_t)tzb->zb_lsize,
1576	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
1577	(void) printf("\tbp physical:   %10llu\t avg:"
1578	    " %6llu\tcompression: %6.2f\n",
1579	    (u_longlong_t)tzb->zb_psize,
1580	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
1581	    (double)tzb->zb_lsize / tzb->zb_psize);
1582	(void) printf("\tbp allocated:  %10llu\t avg:"
1583	    " %6llu\tcompression: %6.2f\n",
1584	    (u_longlong_t)tzb->zb_asize,
1585	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
1586	    (double)tzb->zb_lsize / tzb->zb_asize);
1587	(void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
1588	    (u_longlong_t)alloc, 100.0 * alloc / space);
1589
1590	if (dump_opt['b'] >= 2) {
1591		int l, t, level;
1592		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
1593		    "\t  avg\t comp\t%%Total\tType\n");
1594
1595		for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
1596			char csize[6], lsize[6], psize[6], asize[6], avg[6];
1597			char *typename;
1598
1599			typename = t == DMU_OT_DEFERRED ? "deferred free" :
1600			    t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
1601
1602			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
1603				(void) printf("%6s\t%5s\t%5s\t%5s"
1604				    "\t%5s\t%5s\t%6s\t%s\n",
1605				    "-",
1606				    "-",
1607				    "-",
1608				    "-",
1609				    "-",
1610				    "-",
1611				    "-",
1612				    typename);
1613				continue;
1614			}
1615
1616			for (l = ZB_TOTAL - 1; l >= -1; l--) {
1617				level = (l == -1 ? ZB_TOTAL : l);
1618				zb = &zcb.zcb_type[level][t];
1619
1620				if (zb->zb_asize == 0)
1621					continue;
1622
1623				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
1624					continue;
1625
1626				if (level == 0 && zb->zb_asize ==
1627				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
1628					continue;
1629
1630				nicenum(zb->zb_count, csize);
1631				nicenum(zb->zb_lsize, lsize);
1632				nicenum(zb->zb_psize, psize);
1633				nicenum(zb->zb_asize, asize);
1634				nicenum(zb->zb_asize / zb->zb_count, avg);
1635
1636				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
1637				    "\t%5.2f\t%6.2f\t",
1638				    csize, lsize, psize, asize, avg,
1639				    (double)zb->zb_lsize / zb->zb_psize,
1640				    100.0 * zb->zb_asize / tzb->zb_asize);
1641
1642				if (level == ZB_TOTAL)
1643					(void) printf("%s\n", typename);
1644				else
1645					(void) printf("    L%d %s\n",
1646					    level, typename);
1647			}
1648		}
1649	}
1650
1651	(void) printf("\n");
1652
1653	if (leaks)
1654		return (2);
1655
1656	if (zcb.zcb_haderrors)
1657		return (3);
1658
1659	return (0);
1660}
1661
1662static void
1663dump_zpool(spa_t *spa)
1664{
1665	dsl_pool_t *dp = spa_get_dsl(spa);
1666	int rc = 0;
1667
1668	if (dump_opt['u'])
1669		dump_uberblock(&spa->spa_uberblock);
1670
1671	if (dump_opt['d'] || dump_opt['i']) {
1672		dump_dir(dp->dp_meta_objset);
1673		if (dump_opt['d'] >= 3) {
1674			dump_bplist(dp->dp_meta_objset,
1675			    spa->spa_sync_bplist_obj, "Deferred frees");
1676			dump_dtl(spa->spa_root_vdev, 0);
1677			dump_metaslabs(spa);
1678		}
1679		dmu_objset_find(spa->spa_name, dump_one_dir, NULL,
1680		    DS_FIND_SNAPSHOTS);
1681	}
1682
1683	if (dump_opt['b'] || dump_opt['c'])
1684		rc = dump_block_stats(spa);
1685
1686	if (dump_opt['s'])
1687		show_pool_stats(spa);
1688
1689	if (rc != 0)
1690		exit(rc);
1691}
1692
1693int
1694main(int argc, char **argv)
1695{
1696	int i, c;
1697	struct rlimit rl = { 1024, 1024 };
1698	spa_t *spa;
1699	objset_t *os = NULL;
1700	char *endstr;
1701	int dump_all = 1;
1702	int verbose = 0;
1703	int error;
1704	int flag, set;
1705	vdev_knob_t *vk;
1706
1707	(void) setrlimit(RLIMIT_NOFILE, &rl);
1708
1709	dprintf_setup(&argc, argv);
1710
1711	while ((c = getopt(argc, argv, "udibcsvCLO:B:Ul")) != -1) {
1712		switch (c) {
1713		case 'u':
1714		case 'd':
1715		case 'i':
1716		case 'b':
1717		case 'c':
1718		case 's':
1719		case 'C':
1720		case 'l':
1721			dump_opt[c]++;
1722			dump_all = 0;
1723			break;
1724		case 'L':
1725			dump_opt[c]++;
1726			break;
1727		case 'O':
1728			endstr = optarg;
1729			if (endstr[0] == '!') {
1730				endstr++;
1731				set = 0;
1732			} else {
1733				set = 1;
1734			}
1735			if (strcmp(endstr, "post") == 0) {
1736				flag = ADVANCE_PRE;
1737				set = !set;
1738			} else if (strcmp(endstr, "pre") == 0) {
1739				flag = ADVANCE_PRE;
1740			} else if (strcmp(endstr, "prune") == 0) {
1741				flag = ADVANCE_PRUNE;
1742			} else if (strcmp(endstr, "data") == 0) {
1743				flag = ADVANCE_DATA;
1744			} else if (strcmp(endstr, "holes") == 0) {
1745				flag = ADVANCE_HOLES;
1746			} else {
1747				usage();
1748			}
1749			if (set)
1750				zdb_advance |= flag;
1751			else
1752				zdb_advance &= ~flag;
1753			break;
1754		case 'B':
1755			endstr = optarg - 1;
1756			zdb_noread.zb_objset = strtoull(endstr + 1, &endstr, 0);
1757			zdb_noread.zb_object = strtoull(endstr + 1, &endstr, 0);
1758			zdb_noread.zb_level = strtol(endstr + 1, &endstr, 0);
1759			zdb_noread.zb_blkid = strtoull(endstr + 1, &endstr, 16);
1760			(void) printf("simulating bad block "
1761			    "<%llu, %llu, %lld, %llx>\n",
1762			    (u_longlong_t)zdb_noread.zb_objset,
1763			    (u_longlong_t)zdb_noread.zb_object,
1764			    (u_longlong_t)zdb_noread.zb_level,
1765			    (u_longlong_t)zdb_noread.zb_blkid);
1766			break;
1767		case 'v':
1768			verbose++;
1769			break;
1770		case 'U':
1771			spa_config_dir = "/tmp";
1772			break;
1773		default:
1774			usage();
1775			break;
1776		}
1777	}
1778
1779	kernel_init(FREAD);
1780
1781	/*
1782	 * Disable vdev caching.  If we don't do this, live pool traversal
1783	 * won't make progress because it will never see disk updates.
1784	 */
1785	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
1786		if (strcmp(vk->vk_name, "cache_size") == 0)
1787			vk->vk_default = 0;
1788	}
1789
1790	for (c = 0; c < 256; c++) {
1791		if (dump_all && c != 'L' && c != 'l')
1792			dump_opt[c] = 1;
1793		if (dump_opt[c])
1794			dump_opt[c] += verbose;
1795	}
1796
1797	argc -= optind;
1798	argv += optind;
1799
1800	if (argc < 1) {
1801		if (dump_opt['C']) {
1802			dump_config(NULL);
1803			return (0);
1804		}
1805		usage();
1806	}
1807
1808	if (dump_opt['l']) {
1809		dump_label(argv[0]);
1810		return (0);
1811	}
1812
1813	if (dump_opt['C'])
1814		dump_config(argv[0]);
1815
1816	if (strchr(argv[0], '/') != NULL) {
1817		error = dmu_objset_open(argv[0], DMU_OST_ANY,
1818		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
1819	} else {
1820		error = spa_open(argv[0], &spa, FTAG);
1821	}
1822
1823	if (error)
1824		fatal("can't open %s: error %d", argv[0], error);
1825
1826	argv++;
1827	if (--argc > 0) {
1828		zopt_objects = argc;
1829		zopt_object = calloc(zopt_objects, sizeof (uint64_t));
1830		for (i = 0; i < zopt_objects; i++) {
1831			errno = 0;
1832			zopt_object[i] = strtoull(argv[i], NULL, 0);
1833			if (zopt_object[i] == 0 && errno != 0)
1834				fatal("bad object number %s: %s",
1835				    argv[i], strerror(errno));
1836		}
1837	}
1838
1839	if (os != NULL) {
1840		dump_dir(os);
1841		dmu_objset_close(os);
1842	} else {
1843		dump_zpool(spa);
1844		spa_close(spa, FTAG);
1845	}
1846
1847	kernel_fini();
1848
1849	return (0);
1850}
1851