zdb.c revision fa9e4066f08beec538e775443c5be79dd423fcab
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <stdio.h>
30#include <stdlib.h>
31#include <sys/zfs_context.h>
32#include <sys/spa.h>
33#include <sys/spa_impl.h>
34#include <sys/dmu.h>
35#include <sys/zap.h>
36#include <sys/fs/zfs.h>
37#include <sys/zfs_znode.h>
38#include <sys/vdev.h>
39#include <sys/vdev_impl.h>
40#include <sys/metaslab_impl.h>
41#include <sys/dmu_objset.h>
42#include <sys/dsl_dir.h>
43#include <sys/dsl_dataset.h>
44#include <sys/dsl_pool.h>
45#include <sys/dbuf.h>
46#include <sys/zil.h>
47#include <sys/zil_impl.h>
48#include <sys/stat.h>
49#include <sys/resource.h>
50#include <sys/dmu_traverse.h>
51#include <sys/zio_checksum.h>
52#include <sys/zio_compress.h>
53
/* Program name; prefixes the usage() and fatal() messages. */
const char cmdname[] = "zdb";
/* Per-option levels, indexed by option character (e.g. dump_opt['d']). */
uint8_t dump_opt[256];

/* Signature shared by the per-type dump routines held in object_viewer[]. */
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

/* Not defined in this part of the file; called from dump_dir(). */
extern void dump_intent_log(zilog_t *);
/* Explicit list of objects to dump, and its length; walked by dump_dir(). */
uint64_t *zopt_object = NULL;
int zopt_objects = 0;
/* Base traversal flags handed to traverse_init() by dump_indirect(). */
int zdb_advance = ADVANCE_PRE;
/* Bookmark copied into the traversal handle's th_noread by dump_indirect(). */
zbookmark_t zdb_noread = { 0, 0, ZB_NO_LEVEL, 0 };
64
65/*
66 * These libumem hooks provide a reasonable set of defaults for the allocator's
67 * debugging facilities.
68 */
/*
 * libumem hook: returns the string to use as the $UMEM_DEBUG setting.
 *
 * Declared with an explicit (void) parameter list so that the definition is
 * a real prototype, matching _umem_logging_init().
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}
74
/*
 * libumem hook: returns the string to use as the $UMEM_LOGGING setting.
 */
const char *
_umem_logging_init(void)
{
	static const char logging_setting[] = "fail,contents";

	return (logging_setting);
}
80
81static void
82usage(void)
83{
84	(void) fprintf(stderr,
85	    "Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] "
86	    "dataset [object...]\n"
87	    "       %s -C [pool]\n"
88	    "       %s -l dev\n",
89	    cmdname, cmdname, cmdname);
90
91	(void) fprintf(stderr, "	-u uberblock\n");
92	(void) fprintf(stderr, "	-d datasets\n");
93	(void) fprintf(stderr, "        -C cached pool configuration\n");
94	(void) fprintf(stderr, "	-i intent logs\n");
95	(void) fprintf(stderr, "	-b block statistics\n");
96	(void) fprintf(stderr, "	-c checksum all data blocks\n");
97	(void) fprintf(stderr, "	-s report stats on zdb's I/O\n");
98	(void) fprintf(stderr, "	-v verbose (applies to all others)\n");
99	(void) fprintf(stderr, "        -l dump label contents\n");
100	(void) fprintf(stderr, "	-L live pool (allows some errors)\n");
101	(void) fprintf(stderr, "	-O [!]<pre|post|prune|data|holes> "
102	    "visitation order\n");
103	(void) fprintf(stderr, "	-U use zpool.cache in /tmp\n");
104	(void) fprintf(stderr, "	-B objset:object:level:blkid -- "
105	    "simulate bad block\n");
106	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
107	    "to make only that option verbose\n");
108	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
109	exit(1);
110}
111
112static void
113fatal(const char *fmt, ...)
114{
115	va_list ap;
116
117	va_start(ap, fmt);
118	(void) fprintf(stderr, "%s: ", cmdname);
119	(void) vfprintf(stderr, fmt, ap);
120	va_end(ap);
121	(void) fprintf(stderr, "\n");
122
123	exit(1);
124}
125
126static void
127dump_nvlist(nvlist_t *list, int indent)
128{
129	nvpair_t *elem = NULL;
130
131	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
132		switch (nvpair_type(elem)) {
133		case DATA_TYPE_STRING:
134			{
135				char *value;
136
137				VERIFY(nvpair_value_string(elem, &value) == 0);
138				(void) printf("%*s%s='%s'\n", indent, "",
139				    nvpair_name(elem), value);
140			}
141			break;
142
143		case DATA_TYPE_UINT64:
144			{
145				uint64_t value;
146
147				VERIFY(nvpair_value_uint64(elem, &value) == 0);
148				(void) printf("%*s%s=%llu\n", indent, "",
149				    nvpair_name(elem), (u_longlong_t)value);
150			}
151			break;
152
153		case DATA_TYPE_NVLIST:
154			{
155				nvlist_t *value;
156
157				VERIFY(nvpair_value_nvlist(elem, &value) == 0);
158				(void) printf("%*s%s\n", indent, "",
159				    nvpair_name(elem));
160				dump_nvlist(value, indent + 4);
161			}
162			break;
163
164		case DATA_TYPE_NVLIST_ARRAY:
165			{
166				nvlist_t **value;
167				uint_t c, count;
168
169				VERIFY(nvpair_value_nvlist_array(elem, &value,
170				    &count) == 0);
171
172				for (c = 0; c < count; c++) {
173					(void) printf("%*s%s[%u]\n", indent, "",
174					    nvpair_name(elem), c);
175					dump_nvlist(value[c], indent + 8);
176				}
177			}
178			break;
179
180		default:
181
182			(void) printf("bad config type %d for %s\n",
183			    nvpair_type(elem), nvpair_name(elem));
184		}
185	}
186}
187
188/* ARGSUSED */
189static void
190dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
191{
192	nvlist_t *nv;
193	size_t nvsize = *(uint64_t *)data;
194	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
195
196	dmu_read(os, object, 0, nvsize, packed);
197
198	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
199
200	umem_free(packed, nvsize);
201
202	dump_nvlist(nv, 8);
203
204	nvlist_free(nv);
205}
206
/*
 * Bar of '*' characters used by dump_zap_histogram(); a shorter bar is
 * produced by printing from an offset into this string.  dump_zap_width is
 * the number of stars (the string length without its terminating NUL).
 */
const char dump_zap_stars[] = "****************************************";
const int dump_zap_width = sizeof (dump_zap_stars) - 1;
209
210static void
211dump_zap_histogram(uint64_t histo[ZAP_HISTOGRAM_SIZE])
212{
213	int i;
214	int minidx = ZAP_HISTOGRAM_SIZE - 1;
215	int maxidx = 0;
216	uint64_t max = 0;
217
218	for (i = 0; i < ZAP_HISTOGRAM_SIZE; i++) {
219		if (histo[i] > max)
220			max = histo[i];
221		if (histo[i] > 0 && i > maxidx)
222			maxidx = i;
223		if (histo[i] > 0 && i < minidx)
224			minidx = i;
225	}
226
227	if (max < dump_zap_width)
228		max = dump_zap_width;
229
230	for (i = minidx; i <= maxidx; i++)
231		(void) printf("\t\t\t%u: %6llu %s\n", i, (u_longlong_t)histo[i],
232		    &dump_zap_stars[(max - histo[i]) * dump_zap_width / max]);
233}
234
235static void
236dump_zap_stats(objset_t *os, uint64_t object)
237{
238	int error;
239	zap_stats_t zs;
240
241	error = zap_get_stats(os, object, &zs);
242	if (error)
243		return;
244
245	if (zs.zs_ptrtbl_len == 0) {
246		ASSERT(zs.zs_num_blocks == 1);
247		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
248		    (u_longlong_t)zs.zs_blocksize,
249		    (u_longlong_t)zs.zs_num_entries);
250		return;
251	}
252
253	(void) printf("\tFat ZAP stats:\n");
254	(void) printf("\t\tPointer table: %llu elements\n",
255	    (u_longlong_t)zs.zs_ptrtbl_len);
256	(void) printf("\t\tZAP entries: %llu\n",
257	    (u_longlong_t)zs.zs_num_entries);
258	(void) printf("\t\tLeaf blocks: %llu\n",
259	    (u_longlong_t)zs.zs_num_leafs);
260	(void) printf("\t\tTotal blocks: %llu\n",
261	    (u_longlong_t)zs.zs_num_blocks);
262	(void) printf("\t\tOversize blocks: %llu\n",
263	    (u_longlong_t)zs.zs_num_blocks_large);
264
265	(void) printf("\t\tLeafs with 2^n pointers:\n");
266	dump_zap_histogram(zs.zs_leafs_with_2n_pointers);
267
268	(void) printf("\t\tLeafs with n chained:\n");
269	dump_zap_histogram(zs.zs_leafs_with_n_chained);
270
271	(void) printf("\t\tBlocks with n*5 entries:\n");
272	dump_zap_histogram(zs.zs_blocks_with_n5_entries);
273
274	(void) printf("\t\tBlocks n/10 full:\n");
275	dump_zap_histogram(zs.zs_blocks_n_tenths_full);
276
277	(void) printf("\t\tEntries with n chunks:\n");
278	dump_zap_histogram(zs.zs_entries_using_n_chunks);
279
280	(void) printf("\t\tBuckets with n entries:\n");
281	dump_zap_histogram(zs.zs_buckets_with_n_entries);
282}
283
/*
 * Viewer that prints nothing.  object_viewer[] installs it for the
 * unallocated, packed nvlist, bplist, bplist header, space map header,
 * space map and ZIL intent log types.
 */
/*ARGSUSED*/
static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
}
289
/*
 * Viewer for byte-array types (ZFS plain file, zvol object, other uint8[]
 * in object_viewer[]).  Currently prints nothing.
 *
 * NOTE(review): unlike the sibling viewers this one is not static; confirm
 * whether the external linkage is intentional.
 */
/*ARGSUSED*/
void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
}
295
/*
 * Viewer for uint64-array types (object array, other uint64[] in
 * object_viewer[]).  Currently prints nothing.
 */
/*ARGSUSED*/
static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
}
301
302/*ARGSUSED*/
303static void
304dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
305{
306	zap_cursor_t zc;
307	zap_attribute_t attr;
308	void *prop;
309	int i;
310
311	dump_zap_stats(os, object);
312	(void) printf("\n");
313
314	for (zap_cursor_init(&zc, os, object);
315	    zap_cursor_retrieve(&zc, &attr) == 0;
316	    zap_cursor_advance(&zc)) {
317		(void) printf("\t\t%s = ", attr.za_name);
318		if (attr.za_num_integers == 0) {
319			(void) printf("\n");
320			continue;
321		}
322		prop = umem_zalloc(attr.za_num_integers *
323		    attr.za_integer_length, UMEM_NOFAIL);
324		(void) zap_lookup(os, object, attr.za_name,
325		    attr.za_integer_length, attr.za_num_integers, prop);
326		if (attr.za_integer_length == 1) {
327			(void) printf("%s", (char *)prop);
328		} else {
329			for (i = 0; i < attr.za_num_integers; i++) {
330				switch (attr.za_integer_length) {
331				case 2:
332					(void) printf("%u ",
333					    ((uint16_t *)prop)[i]);
334					break;
335				case 4:
336					(void) printf("%u ",
337					    ((uint32_t *)prop)[i]);
338					break;
339				case 8:
340					(void) printf("%lld ",
341					    (u_longlong_t)((int64_t *)prop)[i]);
342					break;
343				}
344			}
345		}
346		(void) printf("\n");
347		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
348	}
349}
350
351static void
352dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
353{
354	uint64_t alloc, offset, entry;
355	int mapshift = sm->sm_shift;
356	uint64_t mapstart = sm->sm_start;
357	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID" };
358
359	if (smo->smo_object == 0)
360		return;
361
362	/*
363	 * Print out the freelist entries in both encoded and decoded form.
364	 */
365	alloc = 0;
366	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
367		dmu_read(os, smo->smo_object, offset, sizeof (entry), &entry);
368		if (SM_DEBUG_DECODE(entry)) {
369			(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
370			    (u_longlong_t)(offset / sizeof (entry)),
371			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
372			    SM_DEBUG_TXG_DECODE(entry),
373			    SM_DEBUG_SYNCPASS_DECODE(entry));
374		} else {
375			(void) printf("\t\t[%4llu]    %c  range:"
376			    " %08llx-%08llx  size: %06llx\n",
377			    (u_longlong_t)(offset / sizeof (entry)),
378			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
379			    (SM_OFFSET_DECODE(entry) << mapshift) + mapstart,
380			    (SM_OFFSET_DECODE(entry) << mapshift) + mapstart +
381			    (SM_RUN_DECODE(entry) << mapshift),
382			    (SM_RUN_DECODE(entry) << mapshift));
383			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
384				alloc += SM_RUN_DECODE(entry) << mapshift;
385			else
386				alloc -= SM_RUN_DECODE(entry) << mapshift;
387		}
388	}
389	if (alloc != smo->smo_alloc) {
390		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
391		    "with space map summary (%llu)\n",
392		    (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc);
393	}
394}
395
396static void
397dump_metaslab(metaslab_t *msp)
398{
399	char freebuf[5];
400	space_map_obj_t *smo = msp->ms_smo;
401	vdev_t *vd = msp->ms_group->mg_vd;
402	spa_t *spa = vd->vdev_spa;
403
404	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
405
406	if (dump_opt['d'] <= 5) {
407		(void) printf("\t%10llx   %10llu   %5s\n",
408		    (u_longlong_t)msp->ms_map.sm_start,
409		    (u_longlong_t)smo->smo_object,
410		    freebuf);
411		return;
412	}
413
414	(void) printf(
415	    "\tvdev %llu   offset %08llx   spacemap %4llu   free %5s\n",
416	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
417	    (u_longlong_t)smo->smo_object, freebuf);
418
419	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
420
421	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
422}
423
424static void
425dump_metaslabs(spa_t *spa)
426{
427	vdev_t *rvd = spa->spa_root_vdev;
428	vdev_t *vd;
429	int c, m;
430
431	(void) printf("\nMetaslabs:\n");
432
433	for (c = 0; c < rvd->vdev_children; c++) {
434		vd = rvd->vdev_child[c];
435
436		spa_config_enter(spa, RW_READER);
437		(void) printf("\n    vdev %llu = %s\n\n",
438		    (u_longlong_t)vd->vdev_id, vdev_description(vd));
439		spa_config_exit(spa);
440
441		if (dump_opt['d'] <= 5) {
442			(void) printf("\t%10s   %10s   %5s\n",
443			    "offset", "spacemap", "free");
444			(void) printf("\t%10s   %10s   %5s\n",
445			    "------", "--------", "----");
446		}
447		for (m = 0; m < vd->vdev_ms_count; m++)
448			dump_metaslab(vd->vdev_ms[m]);
449		(void) printf("\n");
450	}
451}
452
453static void
454dump_dtl(vdev_t *vd, int indent)
455{
456	avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
457	spa_t *spa = vd->vdev_spa;
458	space_seg_t *ss;
459	vdev_t *pvd;
460	int c;
461
462	if (indent == 0)
463		(void) printf("\nDirty time logs:\n\n");
464
465	spa_config_enter(spa, RW_READER);
466	(void) printf("\t%*s%s\n", indent, "", vdev_description(vd));
467	spa_config_exit(spa);
468
469	for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
470		/*
471		 * Everything in this DTL must appear in all parent DTL unions.
472		 */
473		for (pvd = vd; pvd; pvd = pvd->vdev_parent)
474			ASSERT(vdev_dtl_contains(&pvd->vdev_dtl_map,
475			    ss->ss_start, ss->ss_end - ss->ss_start));
476		(void) printf("\t%*soutage [%llu,%llu] length %llu\n",
477		    indent, "",
478		    (u_longlong_t)ss->ss_start,
479		    (u_longlong_t)ss->ss_end - 1,
480		    (u_longlong_t)ss->ss_end - ss->ss_start);
481	}
482
483	(void) printf("\n");
484
485	if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
486		dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl,
487		    &vd->vdev_dtl_map);
488		(void) printf("\n");
489	}
490
491	for (c = 0; c < vd->vdev_children; c++)
492		dump_dtl(vd->vdev_child[c], indent + 4);
493}
494
/*
 * Viewer for the "DMU dnode" type in object_viewer[].  Currently prints
 * nothing.
 */
/*ARGSUSED*/
static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
{
}
500
501static uint64_t
502blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid)
503{
504	if (level < 0)
505		return (blkid);
506
507	return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
508	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
509}
510
511/* ARGSUSED */
512static int
513zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
514{
515	zbookmark_t *zb = &bc->bc_bookmark;
516	blkptr_t *bp = &bc->bc_blkptr;
517	dva_t *dva = &bp->blk_dva[0];
518	void *data = bc->bc_data;
519	dnode_phys_t *dnp = bc->bc_dnode;
520	char buffer[300];
521	int l;
522
523	if (bc->bc_errno) {
524		(void) sprintf(buffer,
525		    "Error %d reading <%llu, %llu, %d, %llu>: ",
526		    bc->bc_errno,
527		    (u_longlong_t)zb->zb_objset,
528		    (u_longlong_t)zb->zb_object,
529		    zb->zb_level,
530		    (u_longlong_t)zb->zb_blkid);
531		goto out;
532	}
533
534	if (zb->zb_level == -1) {
535		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
536		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
537	} else {
538		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
539		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
540	}
541
542	if (zb->zb_level > 0) {
543		uint64_t fill = 0;
544		blkptr_t *bpx, *bpend;
545
546		for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
547		    bpx < bpend; bpx++) {
548			if (bpx->blk_birth != 0) {
549				ASSERT(bpx->blk_fill > 0);
550				fill += bpx->blk_fill;
551			} else {
552				ASSERT(bpx->blk_fill == 0);
553			}
554		}
555		ASSERT3U(fill, ==, bp->blk_fill);
556	}
557
558	if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
559		uint64_t fill = 0;
560		dnode_phys_t *dnx, *dnend;
561
562		for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
563		    dnx < dnend; dnx++) {
564			if (dnx->dn_type != DMU_OT_NONE)
565				fill++;
566		}
567		ASSERT3U(fill, ==, bp->blk_fill);
568	}
569
570	(void) sprintf(buffer, "%16llx ",
571	    (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
572
573	ASSERT(zb->zb_level >= 0);
574
575	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
576		if (l == zb->zb_level) {
577			(void) sprintf(buffer + strlen(buffer), "L%x",
578			    zb->zb_level);
579		} else {
580			(void) sprintf(buffer + strlen(buffer), " ");
581		}
582	}
583
584out:
585	if (bp->blk_birth == 0) {
586		(void) sprintf(buffer + strlen(buffer), "<hole>");
587		(void) printf("%s\n", buffer);
588	} else {
589		// XXBP - Need to print number of active BPs here
590		(void) sprintf(buffer + strlen(buffer),
591		    "vdev=%llu off=%llx %llxL/%llxP/%llxA F=%llu B=%llu",
592		    (u_longlong_t)DVA_GET_VDEV(dva),
593		    (u_longlong_t)DVA_GET_OFFSET(dva),
594		    (u_longlong_t)BP_GET_LSIZE(bp),
595		    (u_longlong_t)BP_GET_PSIZE(bp),
596		    (u_longlong_t)DVA_GET_ASIZE(dva),
597		    (u_longlong_t)bp->blk_fill,
598		    (u_longlong_t)bp->blk_birth);
599
600		(void) printf("%s\n", buffer);
601	}
602
603	return (bc->bc_errno ? ERESTART : 0);
604}
605
606/*ARGSUSED*/
607static void
608dump_indirect(objset_t *os, uint64_t object, void *data, size_t size)
609{
610	traverse_handle_t *th;
611	uint64_t objset = dmu_objset_id(os);
612	int advance = zdb_advance;
613
614	(void) printf("Indirect blocks:\n");
615
616	if (object == 0)
617		advance |= ADVANCE_DATA;
618
619	th = traverse_init(dmu_objset_spa(os), zdb_indirect_cb, NULL, advance,
620	    ZIO_FLAG_CANFAIL);
621	th->th_noread = zdb_noread;
622
623	traverse_add_dnode(th, 0, -1ULL, objset, object);
624
625	while (traverse_more(th) == EAGAIN)
626		continue;
627
628	(void) printf("\n");
629
630	traverse_fini(th);
631}
632
633/*ARGSUSED*/
634static void
635dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
636{
637	dsl_dir_phys_t *dd = data;
638	time_t crtime;
639	char used[6], compressed[6], uncompressed[6], quota[6], resv[6];
640
641	if (dd == NULL)
642		return;
643
644	ASSERT(size == sizeof (*dd));
645
646	crtime = dd->dd_creation_time;
647	nicenum(dd->dd_used_bytes, used);
648	nicenum(dd->dd_compressed_bytes, compressed);
649	nicenum(dd->dd_uncompressed_bytes, uncompressed);
650	nicenum(dd->dd_quota, quota);
651	nicenum(dd->dd_reserved, resv);
652
653	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
654	(void) printf("\t\thead_dataset_obj = %llu\n",
655	    (u_longlong_t)dd->dd_head_dataset_obj);
656	(void) printf("\t\tparent_dir_obj = %llu\n",
657	    (u_longlong_t)dd->dd_parent_obj);
658	(void) printf("\t\tclone_parent_obj = %llu\n",
659	    (u_longlong_t)dd->dd_clone_parent_obj);
660	(void) printf("\t\tchild_dir_zapobj = %llu\n",
661	    (u_longlong_t)dd->dd_child_dir_zapobj);
662	(void) printf("\t\tused_bytes = %s\n", used);
663	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
664	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
665	(void) printf("\t\tquota = %s\n", quota);
666	(void) printf("\t\treserved = %s\n", resv);
667	(void) printf("\t\tprops_zapobj = %llu\n",
668	    (u_longlong_t)dd->dd_props_zapobj);
669}
670
671/*ARGSUSED*/
672static void
673dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
674{
675	dsl_dataset_phys_t *ds = data;
676	time_t crtime;
677	char used[6], compressed[6], uncompressed[6], unique[6], blkbuf[300];
678
679	if (ds == NULL)
680		return;
681
682	ASSERT(size == sizeof (*ds));
683	crtime = ds->ds_creation_time;
684	nicenum(ds->ds_used_bytes, used);
685	nicenum(ds->ds_compressed_bytes, compressed);
686	nicenum(ds->ds_uncompressed_bytes, uncompressed);
687	nicenum(ds->ds_unique_bytes, unique);
688	sprintf_blkptr(blkbuf, &ds->ds_bp);
689
690	(void) printf("\t\tdataset_obj = %llu\n",
691	    (u_longlong_t)ds->ds_dir_obj);
692	(void) printf("\t\tprev_snap_obj = %llu\n",
693	    (u_longlong_t)ds->ds_prev_snap_obj);
694	(void) printf("\t\tprev_snap_txg = %llu\n",
695	    (u_longlong_t)ds->ds_prev_snap_txg);
696	(void) printf("\t\tnext_snap_obj = %llu\n",
697	    (u_longlong_t)ds->ds_next_snap_obj);
698	(void) printf("\t\tsnapnames_zapobj = %llu\n",
699	    (u_longlong_t)ds->ds_snapnames_zapobj);
700	(void) printf("\t\tnum_children = %llu\n",
701	    (u_longlong_t)ds->ds_num_children);
702	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
703	(void) printf("\t\tcreation_txg = %llu\n",
704	    (u_longlong_t)ds->ds_creation_txg);
705	(void) printf("\t\tdeadlist_obj = %llu\n",
706	    (u_longlong_t)ds->ds_deadlist_obj);
707	(void) printf("\t\tused_bytes = %s\n", used);
708	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
709	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
710	(void) printf("\t\tunique = %s\n", unique);
711	(void) printf("\t\tfsid_guid = %llu\n",
712	    (u_longlong_t)ds->ds_fsid_guid);
713	(void) printf("\t\tguid = %llu\n",
714	    (u_longlong_t)ds->ds_guid);
715	(void) printf("\t\trestoring = %llu\n",
716	    (u_longlong_t)ds->ds_restoring);
717	(void) printf("\t\tbp = %s\n", blkbuf);
718}
719
720static void
721dump_bplist(objset_t *mos, uint64_t object, char *name)
722{
723	bplist_t bpl = { 0 };
724	blkptr_t blk, *bp = &blk;
725	uint64_t itor = 0;
726	char numbuf[6];
727
728	if (dump_opt['d'] < 3)
729		return;
730
731	bplist_open(&bpl, mos, object);
732	if (bplist_empty(&bpl)) {
733		bplist_close(&bpl);
734		return;
735	}
736
737	nicenum(bpl.bpl_phys->bpl_bytes, numbuf);
738
739	(void) printf("\n    %s: %llu entries, %s\n",
740	    name, (u_longlong_t)bpl.bpl_phys->bpl_entries, numbuf);
741
742	if (dump_opt['d'] < 5) {
743		bplist_close(&bpl);
744		return;
745	}
746
747	(void) printf("\n");
748
749	while (bplist_iterate(&bpl, &itor, bp) == 0) {
750		ASSERT(bp->blk_birth != 0);
751		// XXBP - Do we want to see all DVAs, or just one?
752		(void) printf("\tItem %3llu: vdev=%llu off=%llx "
753		    "%llxL/%llxP/%llxA F=%llu B=%llu\n",
754		    (u_longlong_t)itor - 1,
755		    (u_longlong_t)DVA_GET_VDEV(&bp->blk_dva[0]),
756		    (u_longlong_t)DVA_GET_OFFSET(&bp->blk_dva[0]),
757		    (u_longlong_t)BP_GET_LSIZE(bp),
758		    (u_longlong_t)BP_GET_PSIZE(bp),
759		    (u_longlong_t)DVA_GET_ASIZE(&bp->blk_dva[0]),
760		    (u_longlong_t)bp->blk_fill,
761		    (u_longlong_t)bp->blk_birth);
762	}
763
764	bplist_close(&bpl);
765}
766
767static char *
768znode_path(objset_t *os, uint64_t object, char *pathbuf, size_t size)
769{
770	dmu_buf_t *db;
771	dmu_object_info_t doi;
772	znode_phys_t *zp;
773	uint64_t parent = 0;
774	size_t complen;
775	char component[MAXNAMELEN + 1];
776	char *path;
777
778	path = pathbuf + size;
779	*--path = '\0';
780
781	for (;;) {
782		db = dmu_bonus_hold(os, object);
783		if (db == NULL)
784			break;
785
786		dmu_buf_read(db);
787		dmu_object_info_from_db(db, &doi);
788		zp = db->db_data;
789		parent = zp->zp_parent;
790		dmu_buf_rele(db);
791
792		if (doi.doi_bonus_type != DMU_OT_ZNODE)
793			break;
794
795		if (parent == object) {
796			if (path[0] != '/')
797				*--path = '/';
798			return (path);
799		}
800
801		if (zap_value_search(os, parent, object, component) != 0)
802			break;
803
804		complen = strlen(component);
805		path -= complen;
806		bcopy(component, path, complen);
807		*--path = '/';
808
809		object = parent;
810	}
811
812	(void) sprintf(component, "???<object#%llu>", (u_longlong_t)object);
813
814	complen = strlen(component);
815	path -= complen;
816	bcopy(component, path, complen);
817
818	return (path);
819}
820
821/*ARGSUSED*/
822static void
823dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
824{
825	znode_phys_t *zp = data;
826	time_t z_crtime, z_atime, z_mtime, z_ctime;
827	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
828
829	ASSERT(size >= sizeof (znode_phys_t));
830
831	if (dump_opt['d'] < 3) {
832		(void) printf("\t%s\n",
833		    znode_path(os, object, path, sizeof (path)));
834		return;
835	}
836
837	z_crtime = (time_t)zp->zp_crtime[0];
838	z_atime = (time_t)zp->zp_atime[0];
839	z_mtime = (time_t)zp->zp_mtime[0];
840	z_ctime = (time_t)zp->zp_ctime[0];
841
842	(void) printf("\tpath	%s\n",
843	    znode_path(os, object, path, sizeof (path)));
844	(void) printf("\tatime	%s", ctime(&z_atime));
845	(void) printf("\tmtime	%s", ctime(&z_mtime));
846	(void) printf("\tctime	%s", ctime(&z_ctime));
847	(void) printf("\tcrtime	%s", ctime(&z_crtime));
848	(void) printf("\tgen	%llu\n", (u_longlong_t)zp->zp_gen);
849	(void) printf("\tmode	%llo\n", (u_longlong_t)zp->zp_mode);
850	(void) printf("\tsize	%llu\n", (u_longlong_t)zp->zp_size);
851	(void) printf("\tparent	%llu\n", (u_longlong_t)zp->zp_parent);
852	(void) printf("\tlinks	%llu\n", (u_longlong_t)zp->zp_links);
853	(void) printf("\txattr	%llu\n", (u_longlong_t)zp->zp_xattr);
854	(void) printf("\trdev	0x%016llx\n", (u_longlong_t)zp->zp_rdev);
855}
856
/*
 * Viewer for the "ZFS ACL" type in object_viewer[].  Currently prints
 * nothing.
 */
/*ARGSUSED*/
static void
dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
{
}
862
/*
 * Viewer for the "DMU objset" type in object_viewer[].  Currently prints
 * nothing.
 */
/*ARGSUSED*/
static void
dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
{
}
868
/*
 * Dump routine for each object type.  dump_object() indexes this table
 * with both the object's bonus type and its data type, so entries must
 * stay in object-type order.
 */
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
	dump_none,		/* unallocated			*/
	dump_zap,		/* object directory		*/
	dump_uint64,		/* object array			*/
	dump_none,		/* packed nvlist		*/
	dump_packed_nvlist,	/* packed nvlist size		*/
	dump_none,		/* bplist			*/
	dump_none,		/* bplist header		*/
	dump_none,		/* SPA space map header		*/
	dump_none,		/* SPA space map		*/
	dump_none,		/* ZIL intent log		*/
	dump_dnode,		/* DMU dnode			*/
	dump_dmu_objset,	/* DMU objset			*/
	dump_dsl_dir,	/* DSL directory			*/
	dump_zap,		/* DSL directory child map	*/
	dump_zap,		/* DSL dataset snap map		*/
	dump_zap,		/* DSL props			*/
	dump_dsl_dataset,	/* DSL dataset			*/
	dump_znode,		/* ZFS znode			*/
	dump_acl,		/* ZFS ACL			*/
	dump_uint8,		/* ZFS plain file		*/
	dump_zap,		/* ZFS directory		*/
	dump_zap,		/* ZFS master node		*/
	dump_zap,		/* ZFS delete queue		*/
	dump_uint8,		/* zvol object			*/
	dump_zap,		/* zvol prop			*/
	dump_uint8,		/* other uint8[]		*/
	dump_uint64,		/* other uint64[]		*/
	dump_zap,		/* other ZAP			*/
};
899
900static void
901dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
902{
903	dmu_buf_t *db = NULL;
904	dmu_object_info_t doi;
905	dnode_t *dn;
906	void *bonus = NULL;
907	size_t bsize = 0;
908	char iblk[6], dblk[6], lsize[6], psize[6], bonus_size[6], segsize[6];
909	char aux[50];
910	int error;
911
912	if (*print_header) {
913		(void) printf("\n    Object  lvl   iblk   dblk  lsize"
914		    "  psize  type\n");
915		*print_header = 0;
916	}
917
918	if (object == 0) {
919		dn = os->os->os_meta_dnode;
920	} else {
921		db = dmu_bonus_hold(os, object);
922		if (db == NULL)
923			fatal("dmu_bonus_hold(%llu) failed", object);
924		dmu_buf_read(db);
925		bonus = db->db_data;
926		bsize = db->db_size;
927		dn = ((dmu_buf_impl_t *)db)->db_dnode;
928	}
929	dmu_object_info_from_dnode(dn, &doi);
930
931	nicenum(doi.doi_metadata_block_size, iblk);
932	nicenum(doi.doi_data_block_size, dblk);
933	nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
934	    lsize);
935	nicenum(doi.doi_physical_blks << 9, psize);
936	nicenum(doi.doi_bonus_size, bonus_size);
937
938	aux[0] = '\0';
939
940	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6)
941		(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
942		zio_checksum_table[doi.doi_checksum].ci_name);
943
944	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6)
945		(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
946		zio_compress_table[doi.doi_compress].ci_name);
947
948	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %s%s\n",
949	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
950	    psize, dmu_ot[doi.doi_type].ot_name, aux);
951
952	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
953		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %s\n",
954		    "", "", "", "", bonus_size, "bonus",
955		    dmu_ot[doi.doi_bonus_type].ot_name);
956	}
957
958	if (verbosity >= 4) {
959		object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
960		object_viewer[doi.doi_type](os, object, NULL, 0);
961		*print_header = 1;
962	}
963
964	if (verbosity >= 5)
965		dump_indirect(os, object, NULL, 0);
966
967	if (verbosity >= 5) {
968		/*
969		 * Report the list of segments that comprise the object.
970		 */
971		uint64_t start = 0;
972		uint64_t end;
973		uint64_t blkfill = 1;
974		int minlvl = 1;
975
976		if (dn->dn_type == DMU_OT_DNODE) {
977			minlvl = 0;
978			blkfill = DNODES_PER_BLOCK;
979		}
980
981		for (;;) {
982			error = dnode_next_offset(dn, B_FALSE, &start, minlvl,
983			    blkfill);
984			if (error)
985				break;
986			end = start;
987			error = dnode_next_offset(dn, B_TRUE, &end, minlvl,
988			    blkfill);
989			nicenum(end - start, segsize);
990			(void) printf("\t\tsegment [%016llx, %016llx)"
991			    " size %5s\n", (u_longlong_t)start,
992			    (u_longlong_t)end, segsize);
993			if (error)
994				break;
995			start = end;
996		}
997	}
998
999	if (db != NULL)
1000		dmu_buf_rele(db);
1001}
1002
/*
 * Printable name for each object set type; dump_dir() indexes it with
 * dds_type after checking the value is below DMU_OST_NUMTYPES.
 */
static char *objset_types[DMU_OST_NUMTYPES] = {
	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
1005
1006/*ARGSUSED*/
1007static void
1008dump_dir(objset_t *os)
1009{
1010	dmu_objset_stats_t dds;
1011	uint64_t object, object_count;
1012	char numbuf[8];
1013	char blkbuf[300];
1014	char osname[MAXNAMELEN];
1015	char *type = "UNKNOWN";
1016	int verbosity = dump_opt['d'];
1017	int print_header = 1;
1018	int i, error;
1019
1020	dmu_objset_stats(os, &dds);
1021
1022	if (dds.dds_type < DMU_OST_NUMTYPES)
1023		type = objset_types[dds.dds_type];
1024
1025	if (dds.dds_type == DMU_OST_META) {
1026		dds.dds_creation_txg = TXG_INITIAL;
1027		dds.dds_last_txg = os->os->os_rootbp.blk_birth;
1028		dds.dds_objects_used = os->os->os_rootbp.blk_fill;
1029		dds.dds_space_refd =
1030		    os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes;
1031	}
1032
1033	ASSERT3U(dds.dds_objects_used, ==, os->os->os_rootbp.blk_fill);
1034
1035	nicenum(dds.dds_space_refd, numbuf);
1036
1037	if (verbosity >= 4) {
1038		(void) strcpy(blkbuf, ", rootbp ");
1039		sprintf_blkptr(blkbuf + strlen(blkbuf), &os->os->os_rootbp);
1040	} else {
1041		blkbuf[0] = '\0';
1042	}
1043
1044	dmu_objset_name(os, osname);
1045
1046	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, last_txg %llu, "
1047	    "%s, %llu objects%s\n",
1048	    osname, type, (u_longlong_t)dmu_objset_id(os),
1049	    (u_longlong_t)dds.dds_creation_txg,
1050	    (u_longlong_t)dds.dds_last_txg,
1051	    numbuf,
1052	    (u_longlong_t)dds.dds_objects_used,
1053	    blkbuf);
1054
1055	dump_intent_log(dmu_objset_zil(os));
1056
1057	if (dmu_objset_ds(os) != NULL)
1058		dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
1059		    dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
1060
1061	if (verbosity < 2)
1062		return;
1063
1064	if (zopt_objects != 0) {
1065		for (i = 0; i < zopt_objects; i++)
1066			dump_object(os, zopt_object[i], verbosity,
1067			    &print_header);
1068		(void) printf("\n");
1069		return;
1070	}
1071
1072	dump_object(os, 0, verbosity, &print_header);
1073	object_count = 1;
1074
1075	object = 0;
1076	while ((error = dmu_object_next(os, &object, B_FALSE)) == 0) {
1077		dump_object(os, object, verbosity, &print_header);
1078		object_count++;
1079	}
1080
1081	ASSERT3U(object_count, ==, dds.dds_objects_used);
1082
1083	(void) printf("\n");
1084
1085	if (error != ESRCH)
1086		fatal("dmu_object_next() = %d", error);
1087}
1088
1089static void
1090dump_uberblock(uberblock_t *ub)
1091{
1092	time_t timestamp = ub->ub_timestamp;
1093
1094	(void) printf("Uberblock\n\n");
1095	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
1096	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
1097	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
1098	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
1099	(void) printf("\ttimestamp = %llu UTC = %s",
1100	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
1101	if (dump_opt['u'] >= 3) {
1102		char blkbuf[300];
1103		sprintf_blkptr(blkbuf, &ub->ub_rootbp);
1104		(void) printf("\trootbp = %s\n", blkbuf);
1105	}
1106	(void) printf("\n");
1107}
1108
1109static void
1110dump_config(const char *pool)
1111{
1112	spa_t *spa = NULL;
1113
1114	mutex_enter(&spa_namespace_lock);
1115	while ((spa = spa_next(spa)) != NULL) {
1116		if (pool == NULL)
1117			(void) printf("%s\n", spa_name(spa));
1118		if (pool == NULL || strcmp(pool, spa_name(spa)) == 0)
1119			dump_nvlist(spa->spa_config, 4);
1120	}
1121	mutex_exit(&spa_namespace_lock);
1122}
1123
1124static void
1125dump_label(const char *dev)
1126{
1127	int fd;
1128	vdev_label_t label;
1129	char *buf = label.vl_vdev_phys.vp_nvlist;
1130	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
1131	struct stat64 statbuf;
1132	uint64_t psize;
1133	int l;
1134
1135	if ((fd = open(dev, O_RDONLY)) < 0) {
1136		(void) printf("cannot open '%s': %s\n", dev, strerror(errno));
1137		exit(1);
1138	}
1139
1140	if (fstat64(fd, &statbuf) != 0) {
1141		(void) printf("failed to stat '%s': %s\n", dev,
1142		    strerror(errno));
1143		exit(1);
1144	}
1145
1146	psize = statbuf.st_size;
1147	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
1148
1149	for (l = 0; l < VDEV_LABELS; l++) {
1150
1151		nvlist_t *config = NULL;
1152
1153		(void) printf("--------------------------------------------\n");
1154		(void) printf("LABEL %d\n", l);
1155		(void) printf("--------------------------------------------\n");
1156
1157		if (pread(fd, &label, sizeof (label),
1158		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
1159			(void) printf("failed to read label %d\n", l);
1160			continue;
1161		}
1162
1163		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
1164			(void) printf("failed to unpack label %d\n", l);
1165			continue;
1166		}
1167		dump_nvlist(config, 4);
1168		nvlist_free(config);
1169	}
1170}
1171
1172/*ARGSUSED*/
1173static void
1174dump_one_dir(char *dsname, void *arg)
1175{
1176	int error;
1177	objset_t *os;
1178
1179	error = dmu_objset_open(dsname, DMU_OST_ANY,
1180	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
1181	if (error) {
1182		(void) printf("Could not open %s\n", dsname);
1183		return;
1184	}
1185	dump_dir(os);
1186	dmu_objset_close(os);
1187}
1188
1189static void
1190zdb_space_map_load(spa_t *spa)
1191{
1192	vdev_t *rvd = spa->spa_root_vdev;
1193	vdev_t *vd;
1194	int c, m, error;
1195
1196	for (c = 0; c < rvd->vdev_children; c++) {
1197		vd = rvd->vdev_child[c];
1198		for (m = 0; m < vd->vdev_ms_count; m++) {
1199			metaslab_t *msp = vd->vdev_ms[m];
1200			space_map_t *sm = &msp->ms_allocmap[0];
1201			mutex_enter(&msp->ms_lock);
1202			error = space_map_load(sm, msp->ms_smo, SM_ALLOC,
1203			    spa->spa_meta_objset, msp->ms_usable_end,
1204			    sm->sm_size - msp->ms_usable_space);
1205			mutex_exit(&msp->ms_lock);
1206			if (error)
1207				fatal("%s bad space map #%d, error %d",
1208				    spa->spa_name, c, error);
1209		}
1210	}
1211}
1212
1213static int
1214zdb_space_map_claim(spa_t *spa, blkptr_t *bp)
1215{
1216	dva_t *dva = &bp->blk_dva[0];
1217	uint64_t vdev = DVA_GET_VDEV(dva);
1218	uint64_t offset = DVA_GET_OFFSET(dva);
1219	uint64_t size = DVA_GET_ASIZE(dva);
1220	vdev_t *vd;
1221	metaslab_t *msp;
1222	space_map_t *allocmap, *freemap;
1223	int error;
1224
1225	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
1226		return (ENXIO);
1227
1228	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1229		return (ENXIO);
1230
1231	if (DVA_GET_GANG(dva)) {
1232		zio_gbh_phys_t gbh;
1233		blkptr_t blk = *bp;
1234		int g;
1235
1236		/* LINTED - compile time assert */
1237		ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
1238		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1239		DVA_SET_GANG(&blk.blk_dva[0], 0);
1240		DVA_SET_ASIZE(&blk.blk_dva[0], size);
1241		BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER);
1242		BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE);
1243		BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE);
1244		BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF);
1245		error = zio_wait(zio_read(NULL, spa, &blk,
1246		    &gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1247		    ZIO_PRIORITY_SYNC_READ,
1248		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD));
1249		if (error)
1250			return (error);
1251		if (BP_SHOULD_BYTESWAP(&blk))
1252			byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE);
1253		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1254			if (gbh.zg_blkptr[g].blk_birth == 0)
1255				break;
1256			error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g]);
1257			if (error)
1258				return (error);
1259		}
1260	}
1261
1262	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1263	allocmap = &msp->ms_allocmap[0];
1264	freemap = &msp->ms_freemap[0];
1265
1266	mutex_enter(&msp->ms_lock);
1267	if (space_map_contains(freemap, offset, size)) {
1268		mutex_exit(&msp->ms_lock);
1269		return (EAGAIN);	/* allocated more than once */
1270	}
1271
1272	if (!space_map_contains(allocmap, offset, size)) {
1273		mutex_exit(&msp->ms_lock);
1274		return (ESTALE);	/* not allocated at all */
1275	}
1276
1277	space_map_remove(allocmap, offset, size);
1278	space_map_add(freemap, offset, size);
1279
1280	mutex_exit(&msp->ms_lock);
1281
1282	return (0);
1283}
1284
1285static void
1286zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
1287{
1288	metaslab_t *msp;
1289
1290	/* LINTED */
1291	msp = (metaslab_t *)((char *)sm - offsetof(metaslab_t, ms_allocmap[0]));
1292
1293	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
1294	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
1295	    (u_longlong_t)start,
1296	    (u_longlong_t)size);
1297}
1298
1299static void
1300zdb_space_map_vacate(spa_t *spa)
1301{
1302	vdev_t *rvd = spa->spa_root_vdev;
1303	vdev_t *vd;
1304	int c, m;
1305
1306	for (c = 0; c < rvd->vdev_children; c++) {
1307		vd = rvd->vdev_child[c];
1308		for (m = 0; m < vd->vdev_ms_count; m++) {
1309			metaslab_t *msp = vd->vdev_ms[m];
1310			mutex_enter(&msp->ms_lock);
1311			space_map_vacate(&msp->ms_allocmap[0], zdb_leak,
1312			    &msp->ms_allocmap[0]);
1313			space_map_vacate(&msp->ms_freemap[0], NULL, NULL);
1314			mutex_exit(&msp->ms_lock);
1315		}
1316	}
1317}
1318
1319static void
1320zdb_refresh_ubsync(spa_t *spa)
1321{
1322	uberblock_t ub = { 0 };
1323	vdev_t *rvd = spa->spa_root_vdev;
1324	zio_t *zio;
1325
1326	/*
1327	 * Reopen all devices to purge zdb's vdev caches.
1328	 */
1329	vdev_reopen(rvd, NULL);
1330
1331	/*
1332	 * Reload the uberblock.
1333	 */
1334	zio = zio_root(spa, NULL, NULL,
1335	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1336	vdev_uberblock_load(zio, rvd, &ub);
1337	(void) zio_wait(zio);
1338
1339	if (ub.ub_txg != 0)
1340		spa->spa_ubsync = ub;
1341}
1342
/*
 * Block accounting used by dump_block_stats() to verify that the sum of
 * the sizes of all blocks in the pool adds up to the SPA's allocated
 * total (spa_get_alloc()).
 */
typedef struct zdb_blkstats {
	uint64_t	zb_asize;	/* running sum of BP_GET_ASIZE() */
	uint64_t	zb_lsize;	/* running sum of BP_GET_LSIZE() */
	uint64_t	zb_psize;	/* running sum of BP_GET_PSIZE() */
	uint64_t	zb_count;	/* number of blocks counted */
} zdb_blkstats_t;

/*
 * Extra type slots in the statistics table: deferred-free blocks are
 * tallied under DMU_OT_NONE, and the DMU_OT_NUMTYPES slot holds the
 * total across all types.
 */
#define	DMU_OT_DEFERRED	DMU_OT_NONE
#define	DMU_OT_TOTAL	DMU_OT_NUMTYPES

/* Level slot that holds the total across all indirection levels. */
#define	ZB_TOTAL	ZB_MAXLEVEL

/* State shared by the traversal callbacks. */
typedef struct zdb_cb {
	/* statistics indexed by [level][object type], incl. total slots */
	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
	/* count of skipped blocks, indexed by the read errno */
	uint64_t	zcb_errors[256];
	/* cache entry of the objset block whose intent log is being parsed */
	traverse_blk_cache_t *zcb_cache;
	/* read failures since the last successful block */
	int		zcb_readfails;
	/* set once any block has been skipped because of a read error */
	int		zcb_haderrors;
} zdb_cb_t;

/* Forward declaration: zdb_log_block_cb() calls it before its definition. */
static blkptr_cb_t zdb_blkptr_cb;
1368
1369static void
1370zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
1371{
1372	int i, error;
1373
1374	for (i = 0; i < 4; i++) {
1375		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
1376		int t = (i & 1) ? type : DMU_OT_TOTAL;
1377		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
1378
1379		zb->zb_asize += BP_GET_ASIZE(bp);
1380		zb->zb_lsize += BP_GET_LSIZE(bp);
1381		zb->zb_psize += BP_GET_PSIZE(bp);
1382		zb->zb_count++;
1383	}
1384
1385	if (dump_opt['L'])
1386		return;
1387
1388	error = zdb_space_map_claim(spa, bp);
1389
1390	if (error == 0)
1391		return;
1392
1393	if (error == EAGAIN)
1394		(void) fatal("double-allocation, bp=%p", bp);
1395
1396	if (error == ESTALE)
1397		(void) fatal("reference to freed block, bp=%p", bp);
1398
1399	(void) fatal("fatal error %d in bp %p", error, bp);
1400}
1401
1402static void
1403zdb_log_block_cb(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t first_txg)
1404{
1405	if (bp->blk_birth < first_txg) {
1406		zdb_cb_t *zcb = arg;
1407		traverse_blk_cache_t bc = *zcb->zcb_cache;
1408		zbookmark_t *zb = &bc.bc_bookmark;
1409
1410		zb->zb_objset = bp->blk_cksum.zc_word[2];
1411		zb->zb_blkid = bp->blk_cksum.zc_word[3];
1412		bc.bc_blkptr = *bp;
1413
1414		(void) zdb_blkptr_cb(&bc, zilog->zl_spa, arg);
1415	}
1416}
1417
1418static int
1419zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
1420{
1421	zbookmark_t *zb = &bc->bc_bookmark;
1422	zdb_cb_t *zcb = arg;
1423	blkptr_t *bp = &bc->bc_blkptr;
1424	dmu_object_type_t type = BP_GET_TYPE(bp);
1425	char blkbuf[300];
1426	int error = 0;
1427
1428	if (bc->bc_errno) {
1429		if (zcb->zcb_readfails++ < 10 && dump_opt['L']) {
1430			zdb_refresh_ubsync(spa);
1431			error = EAGAIN;
1432		} else {
1433			zcb->zcb_haderrors = 1;
1434			zcb->zcb_errors[bc->bc_errno]++;
1435			error = ERESTART;
1436		}
1437
1438		if (dump_opt['b'] >= 3 || (dump_opt['b'] >= 2 && bc->bc_errno))
1439			sprintf_blkptr(blkbuf, bp);
1440		else
1441			blkbuf[0] = '\0';
1442
1443		(void) printf("zdb_blkptr_cb: Got error %d reading "
1444		    "<%llu, %llu, %d, %llx> %s -- %s\n",
1445		    bc->bc_errno,
1446		    (u_longlong_t)zb->zb_objset,
1447		    (u_longlong_t)zb->zb_object,
1448		    zb->zb_level,
1449		    (u_longlong_t)zb->zb_blkid,
1450		    blkbuf,
1451		    error == EAGAIN ? "retrying" : "skipping");
1452
1453		return (error);
1454	}
1455
1456	zcb->zcb_readfails = 0;
1457
1458	ASSERT(bp->blk_birth != 0);
1459
1460	zdb_count_block(spa, zcb, bp, type);
1461
1462	if (dump_opt['b'] >= 4) {
1463		sprintf_blkptr(blkbuf, bp);
1464		(void) printf("objset %llu object %llu offset 0x%llx %s\n",
1465		    (u_longlong_t)zb->zb_objset,
1466		    (u_longlong_t)zb->zb_object,
1467		    (u_longlong_t)blkid2offset(bc->bc_dnode,
1468			zb->zb_level, zb->zb_blkid),
1469		    blkbuf);
1470	}
1471
1472	if (type == DMU_OT_OBJSET) {
1473		objset_phys_t *osphys = bc->bc_data;
1474		zilog_t zilog = { 0 };
1475		zilog.zl_header = &osphys->os_zil_header;
1476		zilog.zl_spa = spa;
1477
1478		zcb->zcb_cache = bc;
1479
1480		zil_parse(&zilog, zdb_log_block_cb, NULL, zcb,
1481		    spa_first_txg(spa));
1482	}
1483
1484	return (0);
1485}
1486
1487static int
1488dump_block_stats(spa_t *spa)
1489{
1490	traverse_handle_t *th;
1491	zdb_cb_t zcb = { 0 };
1492	zdb_blkstats_t *zb, *tzb;
1493	uint64_t alloc, space;
1494	int leaks = 0;
1495	int advance = zdb_advance;
1496	int flags;
1497	int e;
1498
1499	if (dump_opt['c'])
1500		advance |= ADVANCE_DATA;
1501
1502	advance |= ADVANCE_PRUNE;
1503
1504	(void) printf("\nTraversing all blocks to %sverify"
1505	    " nothing leaked ...\n",
1506	    dump_opt['c'] ? "verify checksums and " : "");
1507
1508	/*
1509	 * Load all space maps.  As we traverse the pool, if we find a block
1510	 * that's not in its space map, that indicates a double-allocation,
1511	 * reference to a freed block, or an unclaimed block.  Otherwise we
1512	 * remove the block from the space map.  If the space maps are not
1513	 * empty when we're done, that indicates leaked blocks.
1514	 */
1515	if (!dump_opt['L'])
1516		zdb_space_map_load(spa);
1517
1518	/*
1519	 * If there's a deferred-free bplist, process that first.
1520	 */
1521	if (spa->spa_sync_bplist_obj != 0) {
1522		bplist_t *bpl = &spa->spa_sync_bplist;
1523		blkptr_t blk;
1524		uint64_t itor = 0;
1525
1526		bplist_open(bpl, spa->spa_meta_objset,
1527		    spa->spa_sync_bplist_obj);
1528
1529		while (bplist_iterate(bpl, &itor, &blk) == 0) {
1530			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
1531			if (dump_opt['b'] >= 4) {
1532				char blkbuf[300];
1533				sprintf_blkptr(blkbuf, &blk);
1534				(void) printf("[%s] %s\n",
1535				    "deferred free", blkbuf);
1536			}
1537		}
1538
1539		bplist_close(bpl);
1540	}
1541
1542	/*
1543	 * Now traverse the pool.  If we're read all data to verify checksums,
1544	 * do a scrubbing read so that we validate all copies.
1545	 */
1546	flags = ZIO_FLAG_CANFAIL;
1547	if (advance & ADVANCE_DATA)
1548		flags |= ZIO_FLAG_SCRUB;
1549	th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags);
1550	th->th_noread = zdb_noread;
1551
1552	traverse_add_pool(th, 0, -1ULL);
1553
1554	while (traverse_more(th) == EAGAIN)
1555		continue;
1556
1557	traverse_fini(th);
1558
1559	if (zcb.zcb_haderrors) {
1560		(void) printf("\nError counts:\n\n");
1561		(void) printf("\t%5s  %s\n", "errno", "count");
1562		for (e = 0; e < 256; e++) {
1563			if (zcb.zcb_errors[e] != 0) {
1564				(void) printf("\t%5d  %llu\n",
1565				    e, (u_longlong_t)zcb.zcb_errors[e]);
1566			}
1567		}
1568	}
1569
1570	/*
1571	 * Report any leaked segments.
1572	 */
1573	if (!dump_opt['L'])
1574		zdb_space_map_vacate(spa);
1575
1576	if (dump_opt['L'])
1577		(void) printf("\n\n *** Live pool traversal; "
1578		    "block counts are only approximate ***\n\n");
1579
1580	alloc = spa_get_alloc(spa);
1581	space = spa_get_space(spa);
1582
1583	tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
1584
1585	if (tzb->zb_asize == alloc) {
1586		(void) printf("\n\tNo leaks (block sum matches space"
1587		    " maps exactly)\n");
1588	} else {
1589		(void) printf("block traversal size %llu != alloc %llu "
1590		    "(leaked %lld)\n",
1591		    (u_longlong_t)tzb->zb_asize,
1592		    (u_longlong_t)alloc,
1593		    (u_longlong_t)(alloc - tzb->zb_asize));
1594		leaks = 1;
1595	}
1596
1597	if (tzb->zb_count == 0)
1598		return (2);
1599
1600	(void) printf("\n");
1601	(void) printf("\tbp count:      %10llu\n",
1602	    (u_longlong_t)tzb->zb_count);
1603	(void) printf("\tbp logical:    %10llu\t avg: %6llu\n",
1604	    (u_longlong_t)tzb->zb_lsize,
1605	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
1606	(void) printf("\tbp physical:   %10llu\t avg:"
1607	    " %6llu\tcompression: %6.2f\n",
1608	    (u_longlong_t)tzb->zb_psize,
1609	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
1610	    (double)tzb->zb_lsize / tzb->zb_psize);
1611	(void) printf("\tbp allocated:  %10llu\t avg:"
1612	    " %6llu\tcompression: %6.2f\n",
1613	    (u_longlong_t)tzb->zb_asize,
1614	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
1615	    (double)tzb->zb_lsize / tzb->zb_asize);
1616	(void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
1617	    (u_longlong_t)alloc, 100.0 * alloc / space);
1618
1619	if (dump_opt['b'] >= 2) {
1620		int l, t, level;
1621		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
1622		    "\t  avg\t comp\t%%Total\tType\n");
1623
1624		for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
1625			char csize[6], lsize[6], psize[6], asize[6], avg[6];
1626			char *typename;
1627
1628			typename = t == DMU_OT_DEFERRED ? "deferred free" :
1629			    t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
1630
1631			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
1632				(void) printf("%6s\t%5s\t%5s\t%5s"
1633				    "\t%5s\t%5s\t%6s\t%s\n",
1634				    "-",
1635				    "-",
1636				    "-",
1637				    "-",
1638				    "-",
1639				    "-",
1640				    "-",
1641				    typename);
1642				continue;
1643			}
1644
1645			for (l = ZB_TOTAL - 1; l >= -1; l--) {
1646				level = (l == -1 ? ZB_TOTAL : l);
1647				zb = &zcb.zcb_type[level][t];
1648
1649				if (zb->zb_asize == 0)
1650					continue;
1651
1652				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
1653					continue;
1654
1655				if (level == 0 && zb->zb_asize ==
1656				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
1657					continue;
1658
1659				nicenum(zb->zb_count, csize);
1660				nicenum(zb->zb_lsize, lsize);
1661				nicenum(zb->zb_psize, psize);
1662				nicenum(zb->zb_asize, asize);
1663				nicenum(zb->zb_asize / zb->zb_count, avg);
1664
1665				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
1666				    "\t%5.2f\t%6.2f\t",
1667				    csize, lsize, psize, asize, avg,
1668				    (double)zb->zb_lsize / zb->zb_psize,
1669				    100.0 * zb->zb_asize / tzb->zb_asize);
1670
1671				if (level == ZB_TOTAL)
1672					(void) printf("%s\n", typename);
1673				else
1674					(void) printf("    L%d %s\n",
1675					    level, typename);
1676			}
1677		}
1678	}
1679
1680	(void) printf("\n");
1681
1682	if (leaks)
1683		return (2);
1684
1685	if (zcb.zcb_haderrors)
1686		return (3);
1687
1688	return (0);
1689}
1690
1691static void
1692dump_zpool(spa_t *spa)
1693{
1694	dsl_pool_t *dp = spa_get_dsl(spa);
1695	int rc = 0;
1696
1697	if (dump_opt['u'])
1698		dump_uberblock(&spa->spa_uberblock);
1699
1700	if (dump_opt['d'] || dump_opt['i']) {
1701		dump_dir(dp->dp_meta_objset);
1702		if (dump_opt['d'] >= 3) {
1703			dump_bplist(dp->dp_meta_objset,
1704			    spa->spa_sync_bplist_obj, "Deferred frees");
1705			dump_dtl(spa->spa_root_vdev, 0);
1706			dump_metaslabs(spa);
1707		}
1708		dmu_objset_find(spa->spa_name, dump_one_dir, NULL,
1709		    DS_FIND_SNAPSHOTS);
1710	}
1711
1712	if (dump_opt['b'] || dump_opt['c'])
1713		rc = dump_block_stats(spa);
1714
1715	if (dump_opt['s'])
1716		show_pool_stats(spa);
1717
1718	if (rc != 0)
1719		exit(rc);
1720}
1721
1722int
1723main(int argc, char **argv)
1724{
1725	int i, c;
1726	struct rlimit rl = { 1024, 1024 };
1727	spa_t *spa;
1728	objset_t *os = NULL;
1729	char *endstr;
1730	int dump_all = 1;
1731	int verbose = 0;
1732	int error;
1733	int flag, set;
1734
1735	(void) setrlimit(RLIMIT_NOFILE, &rl);
1736
1737	dprintf_setup(&argc, argv);
1738
1739	while ((c = getopt(argc, argv, "udibcsvCLO:B:Ul")) != -1) {
1740		switch (c) {
1741		case 'u':
1742		case 'd':
1743		case 'i':
1744		case 'b':
1745		case 'c':
1746		case 's':
1747		case 'C':
1748		case 'l':
1749			dump_opt[c]++;
1750			dump_all = 0;
1751			break;
1752		case 'L':
1753			dump_opt[c]++;
1754			break;
1755		case 'O':
1756			endstr = optarg;
1757			if (endstr[0] == '!') {
1758				endstr++;
1759				set = 0;
1760			} else {
1761				set = 1;
1762			}
1763			if (strcmp(endstr, "post") == 0) {
1764				flag = ADVANCE_PRE;
1765				set = !set;
1766			} else if (strcmp(endstr, "pre") == 0) {
1767				flag = ADVANCE_PRE;
1768			} else if (strcmp(endstr, "prune") == 0) {
1769				flag = ADVANCE_PRUNE;
1770			} else if (strcmp(endstr, "data") == 0) {
1771				flag = ADVANCE_DATA;
1772			} else if (strcmp(endstr, "holes") == 0) {
1773				flag = ADVANCE_HOLES;
1774			} else {
1775				usage();
1776			}
1777			if (set)
1778				zdb_advance |= flag;
1779			else
1780				zdb_advance &= ~flag;
1781			break;
1782		case 'B':
1783			endstr = optarg - 1;
1784			zdb_noread.zb_objset = strtoull(endstr + 1, &endstr, 0);
1785			zdb_noread.zb_object = strtoull(endstr + 1, &endstr, 0);
1786			zdb_noread.zb_level = strtol(endstr + 1, &endstr, 0);
1787			zdb_noread.zb_blkid = strtoull(endstr + 1, &endstr, 16);
1788			(void) printf("simulating bad block "
1789			    "<%llu, %llu, %d, %llx>\n",
1790			    (u_longlong_t)zdb_noread.zb_objset,
1791			    (u_longlong_t)zdb_noread.zb_object,
1792			    zdb_noread.zb_level,
1793			    (u_longlong_t)zdb_noread.zb_blkid);
1794			break;
1795		case 'v':
1796			verbose++;
1797			break;
1798		case 'U':
1799			spa_config_dir = "/tmp";
1800			break;
1801		default:
1802			usage();
1803			break;
1804		}
1805	}
1806
1807	kernel_init(FREAD);
1808
1809	for (c = 0; c < 256; c++) {
1810		if (dump_all && c != 'L' && c != 'l')
1811			dump_opt[c] = 1;
1812		if (dump_opt[c])
1813			dump_opt[c] += verbose;
1814	}
1815
1816	argc -= optind;
1817	argv += optind;
1818
1819	if (argc < 1) {
1820		if (dump_opt['C']) {
1821			dump_config(NULL);
1822			return (0);
1823		}
1824		usage();
1825	}
1826
1827	if (dump_opt['l']) {
1828		dump_label(argv[0]);
1829		return (0);
1830	}
1831
1832	if (dump_opt['C'])
1833		dump_config(argv[0]);
1834
1835	if (strchr(argv[0], '/') != NULL) {
1836		error = dmu_objset_open(argv[0], DMU_OST_ANY,
1837		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
1838	} else {
1839		error = spa_open(argv[0], &spa, FTAG);
1840	}
1841
1842	if (error)
1843		fatal("can't open %s: error %d", argv[0], error);
1844
1845	argv++;
1846	if (--argc > 0) {
1847		zopt_objects = argc;
1848		zopt_object = calloc(zopt_objects, sizeof (uint64_t));
1849		for (i = 0; i < zopt_objects; i++) {
1850			errno = 0;
1851			zopt_object[i] = strtoull(argv[i], NULL, 0);
1852			if (zopt_object[i] == 0 && errno != 0)
1853				fatal("bad object number %s: %s",
1854				    argv[i], strerror(errno));
1855		}
1856	}
1857
1858	if (os != NULL) {
1859		dump_dir(os);
1860		dmu_objset_close(os);
1861	} else {
1862		dump_zpool(spa);
1863		spa_close(spa, FTAG);
1864	}
1865
1866	kernel_fini();
1867
1868	return (0);
1869}
1870