/*
 * Copyright (c) 2007 Doug Rabson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>

/*
 *	Stand-alone ZFS file reader.
 */

#include <sys/endian.h>
#include <sys/stat.h>
#include <sys/stdint.h>
#include <sys/list.h>
#include <inttypes.h>

#include "zfsimpl.h"
#include "zfssubr.c"

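/*
 * In-core state for a mounted dataset: the pool it belongs to, the
 * dataset's object set, and the object number used as the root for
 * lookups within it.
 */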
struct zfsmount {
	const spa_t	*spa;
	objset_phys_t	objset;
	uint64_t	rootobj;
};

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	void *ic_data;
	vdev_t *ic_vdev;
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size;
	int is_children; /* number of entries in is_child[] */

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	int is_good_child;

	indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;

/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;

/*
 * List of all vdevs, chained through v_alllink.
 */
static vdev_list_t zfs_vdevs;

/*
 * List of ZFS features supported for read
 */
static const char *features_for_read[] = {
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	"org.illumos:skein",
	"org.illumos:edonr",
	"org.zfsonlinux:large_dnode",
	"com.joyent:multi_vdev_crash_dump",
	"com.delphix:spacemap_histogram",
	"com.delphix:zpool_checkpoint",
	"com.delphix:spacemap_v2",
	"com.datto:encryption",
	"com.datto:bookmark_v2",
	"org.zfsonlinux:allocation_classes",
	"com.datto:resilver_defer",
	"com.delphix:device_removal",
	"com.delphix:obsolete_counts",
	NULL
};

/*
 * List of all pools, chained through spa_link.
 */
static spa_list_t zfs_pools;

static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;

static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
    dnode_phys_t *);
static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
    size_t);
static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);

static void
zfs_init(void)
{
	STAILQ_INIT(&zfs_vdevs);
	STAILQ_INIT(&zfs_pools);

	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);

	zfs_init_crc();
}

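/*
 * Minimal XDR decoders for the label nvlists.  XDR stores every integer
 * big-endian on a 4-byte boundary, so each helper decodes with be32dec()
 * and advances the cursor by four bytes.
 */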
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	*ip = be32dec(*xdr);
	(*xdr) += 4;
	return (0);
}

static int
xdr_u_int(const unsigned char **xdr, uint_t *ip)
{
	*ip = be32dec(*xdr);
	(*xdr) += 4;
	return (0);
}

static int
xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
{
	uint_t hi, lo;

	xdr_u_int(xdr, &hi);
	xdr_u_int(xdr, &lo);
	*lp = (((uint64_t)hi) << 32) | lo;
	return (0);
}

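/*
 * An XDR-encoded nvlist begins with two ints (encoding method and
 * endianness) followed by a sequence of name/value pairs.  Each pair is
 * preceded by its encoded and decoded sizes; a pair with both sizes
 * zero terminates the list.  nvlist_find() walks that stream looking
 * for a matching name and type.
 */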
static int
nvlist_find(const unsigned char *nvlist, const char *name, int type,
    int *elementsp, void *valuep)
{
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;

	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		int namelen, pairtype, elements;
		const char *pairname;

		xdr_int(&p, &namelen);
		pairname = (const char *)p;
		p += roundup(namelen, 4);
		xdr_int(&p, &pairtype);

		if (memcmp(name, pairname, namelen) == 0 && type == pairtype) {
			xdr_int(&p, &elements);
			if (elementsp)
				*elementsp = elements;
			if (type == DATA_TYPE_UINT64) {
				xdr_uint64_t(&p, (uint64_t *)valuep);
				return (0);
			} else if (type == DATA_TYPE_STRING) {
				int len;
				xdr_int(&p, &len);
				(*(const char **)valuep) = (const char *)p;
				return (0);
			} else if (type == DATA_TYPE_NVLIST ||
			    type == DATA_TYPE_NVLIST_ARRAY) {
				(*(const unsigned char **)valuep) =
				    (const unsigned char *)p;
				return (0);
			} else {
				return (EIO);
			}
		} else {
			/*
			 * Not the pair we are looking for, skip to the
			 * next one.
			 */
			p = pair + encoded_size;
		}

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return (EIO);
}

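/*
 * Example (hypothetical caller) of nvlist_find() above, looking up the
 * pool name from a label config:
 *
 *	const char *name;
 *
 *	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
 *	    NULL, &name) == 0)
 *		printf("pool: %s\n", name);
 */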
static int
nvlist_check_features_for_read(const unsigned char *nvlist)
{
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;
	int rc;

	rc = 0;

	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		int namelen, pairtype;
		const char *pairname;
		int i, found;

		found = 0;

		xdr_int(&p, &namelen);
		pairname = (const char *)p;
		p += roundup(namelen, 4);
		xdr_int(&p, &pairtype);

		for (i = 0; features_for_read[i] != NULL; i++) {
			if (memcmp(pairname, features_for_read[i],
			    namelen) == 0) {
				found = 1;
				break;
			}
		}

		if (!found) {
			printf("ZFS: unsupported feature: %s\n", pairname);
			rc = EIO;
		}

		p = pair + encoded_size;

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return (rc);
}

/*
 * Return the next nvlist in an nvlist array.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;

	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		p = pair + encoded_size;

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return (p);
}

#ifdef TEST

static const unsigned char *
nvlist_print(const unsigned char *nvlist, unsigned int indent)
{
	static const char *typenames[] = {
		"DATA_TYPE_UNKNOWN",
		"DATA_TYPE_BOOLEAN",
		"DATA_TYPE_BYTE",
		"DATA_TYPE_INT16",
		"DATA_TYPE_UINT16",
		"DATA_TYPE_INT32",
		"DATA_TYPE_UINT32",
		"DATA_TYPE_INT64",
		"DATA_TYPE_UINT64",
		"DATA_TYPE_STRING",
		"DATA_TYPE_BYTE_ARRAY",
		"DATA_TYPE_INT16_ARRAY",
		"DATA_TYPE_UINT16_ARRAY",
		"DATA_TYPE_INT32_ARRAY",
		"DATA_TYPE_UINT32_ARRAY",
		"DATA_TYPE_INT64_ARRAY",
		"DATA_TYPE_UINT64_ARRAY",
		"DATA_TYPE_STRING_ARRAY",
		"DATA_TYPE_HRTIME",
		"DATA_TYPE_NVLIST",
		"DATA_TYPE_NVLIST_ARRAY",
		"DATA_TYPE_BOOLEAN_VALUE",
		"DATA_TYPE_INT8",
		"DATA_TYPE_UINT8",
		"DATA_TYPE_BOOLEAN_ARRAY",
		"DATA_TYPE_INT8_ARRAY",
		"DATA_TYPE_UINT8_ARRAY"
	};

	unsigned int i, j;
	const unsigned char *p, *pair;
	int junk;
	int encoded_size, decoded_size;

	p = nvlist;
	xdr_int(&p, &junk);
	xdr_int(&p, &junk);

	pair = p;
	xdr_int(&p, &encoded_size);
	xdr_int(&p, &decoded_size);
	while (encoded_size && decoded_size) {
		int namelen, pairtype, elements;
		const char *pairname;

		xdr_int(&p, &namelen);
		pairname = (const char *)p;
		p += roundup(namelen, 4);
		xdr_int(&p, &pairtype);

		for (i = 0; i < indent; i++)
			printf(" ");
		printf("%s %s", typenames[pairtype], pairname);

		xdr_int(&p, &elements);
		switch (pairtype) {
		case DATA_TYPE_UINT64: {
			uint64_t val;
			xdr_uint64_t(&p, &val);
			printf(" = 0x%jx\n", (uintmax_t)val);
			break;
		}

		case DATA_TYPE_STRING: {
			int len;
			xdr_int(&p, &len);
			printf(" = \"%s\"\n", p);
			break;
		}

		case DATA_TYPE_NVLIST:
			printf("\n");
			nvlist_print(p, indent + 1);
			break;

		case DATA_TYPE_NVLIST_ARRAY:
			for (j = 0; j < elements; j++) {
				printf("[%d]\n", j);
				p = nvlist_print(p, indent + 1);
				if (j != elements - 1) {
					for (i = 0; i < indent; i++)
						printf(" ");
					printf("%s %s", typenames[pairtype],
					    pairname);
				}
			}
			break;

		default:
			printf("\n");
		}

		p = pair + encoded_size;

		pair = p;
		xdr_int(&p, &encoded_size);
		xdr_int(&p, &decoded_size);
	}

	return (p);
}

#endif

static int
vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t size)
{
	size_t psize;
	int rc;

	if (!vdev->v_phys_read)
		return (EIO);

	if (bp) {
		psize = BP_GET_PSIZE(bp);
	} else {
		psize = size;
	}

	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
	if (rc == 0) {
		if (bp != NULL)
			rc = zio_checksum_verify(vdev->v_spa, bp, buf);
	}

	return (rc);
}

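/*
 * A remap_segment_t is a work-list element used by vdev_indirect_remap()
 * to walk chains of indirect mappings iteratively (with an explicit
 * stack) rather than recursively.
 */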
typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

static remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = malloc(sizeof (remap_segment_t));

	if (rs != NULL) {
		rs->rs_vd = vd;
		rs->rs_offset = offset;
		rs->rs_asize = asize;
		rs->rs_split_offset = split_offset;
	}

	return (rs);
}

vdev_indirect_mapping_t *
vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
    uint64_t mapping_object)
{
	vdev_indirect_mapping_t *vim;
	vdev_indirect_mapping_phys_t *vim_phys;
	int rc;

	vim = calloc(1, sizeof (*vim));
	if (vim == NULL)
		return (NULL);

	vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
	if (vim->vim_dn == NULL) {
		free(vim);
		return (NULL);
	}

	rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
	if (rc != 0) {
		free(vim->vim_dn);
		free(vim);
		return (NULL);
	}

	vim->vim_spa = spa;
	vim->vim_phys = malloc(sizeof (*vim->vim_phys));
	if (vim->vim_phys == NULL) {
		free(vim->vim_dn);
		free(vim);
		return (NULL);
	}

	vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
	*vim->vim_phys = *vim_phys;

	vim->vim_objset = os;
	vim->vim_object = mapping_object;
	vim->vim_entries = NULL;

	vim->vim_havecounts =
	    (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);

	return (vim);
}

/*
 * Compare an offset with an indirect mapping entry; there are three
 * possible scenarios:
 *
 *     1. The offset is "less than" the mapping entry; meaning the
 *        offset is less than the source offset of the mapping entry. In
 *        this case, there is no overlap between the offset and the
 *        mapping entry and -1 will be returned.
 *
 *     2. The offset is "greater than" the mapping entry; meaning the
 *        offset is greater than the mapping entry's source offset plus
 *        the entry's size. In this case, there is no overlap between
 *        the offset and the mapping entry and 1 will be returned.
 *
 *        NOTE: If the offset is actually equal to the entry's offset
 *        plus size, this is considered to be "greater" than the entry,
 *        and this case applies (i.e. 1 will be returned). Thus, the
 *        entry's "range" can be considered to be inclusive at its
 *        start, but exclusive at its end: e.g. [src, src + size).
 *
 *     3. The last case to consider is if the offset actually falls
 *        within the mapping entry's range. If this is the case, the
 *        offset is considered to be "equal to" the mapping entry and
 *        0 will be returned.
 *
 *        NOTE: If the offset is equal to the entry's source offset,
 *        this case applies and 0 will be returned. If the offset is
 *        equal to the entry's source plus its size, this case does
 *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
 *        returned.
 */
static int
dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
{
	const uint64_t *key = v_key;
	const vdev_indirect_mapping_entry_phys_t *array_elem =
	    v_array_elem;
	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);

	if (*key < src_offset) {
		return (-1);
	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
		return (0);
	} else {
		return (1);
	}
}

/*
 * Return array entry.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
{
	uint64_t size;
	off_t offset = 0;
	int rc;

	if (vim->vim_phys->vimp_num_entries == 0)
		return (NULL);

	if (vim->vim_entries == NULL) {
		uint64_t bsize;

		bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
		size = vim->vim_phys->vimp_num_entries *
		    sizeof (*vim->vim_entries);
		if (size > bsize) {
			size = bsize / sizeof (*vim->vim_entries);
			size *= sizeof (*vim->vim_entries);
		}
		vim->vim_entries = malloc(size);
		if (vim->vim_entries == NULL)
			return (NULL);
		vim->vim_num_entries = size / sizeof (*vim->vim_entries);
		offset = index * sizeof (*vim->vim_entries);
	}

	/* We have data in vim_entries */
	if (offset == 0) {
		if (index >= vim->vim_entry_offset &&
		    index <= vim->vim_entry_offset + vim->vim_num_entries) {
			index -= vim->vim_entry_offset;
			return (&vim->vim_entries[index]);
		}
		offset = index * sizeof (*vim->vim_entries);
	}

	vim->vim_entry_offset = index;
	size = vim->vim_num_entries * sizeof (*vim->vim_entries);
	rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
	    size);
	if (rc != 0) {
		/* Read error, invalidate vim_entries. */
		free(vim->vim_entries);
		vim->vim_entries = NULL;
		return (NULL);
	}
	index -= vim->vim_entry_offset;
	return (&vim->vim_entries[index]);
}

/*
 * Returns the mapping entry for the given offset.
 *
 * It's possible that the given offset will not be in the mapping table
 * (i.e. no mapping entries contain this offset), in which case, the
 * return value depends on the "next_if_missing" parameter.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_FALSE, then NULL will always be returned. The behavior is intended
 * to allow consumers to get the entry corresponding to the offset
 * parameter, iff the offset overlaps with an entry in the table.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_TRUE, then the entry nearest to the given offset will be returned,
 * such that the entry's source offset is greater than the offset
 * passed in (i.e. the "next" mapping entry in the table is returned, if
 * the offset is missing from the table). If there are no entries whose
 * source offset is greater than the passed in offset, NULL is returned.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
    uint64_t offset)
{
	ASSERT(vim->vim_phys->vimp_num_entries > 0);

	vdev_indirect_mapping_entry_phys_t *entry;

	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
	uint64_t base = 0;

	/*
	 * We don't define these inside of the while loop because we use
	 * their value in the case that offset isn't in the mapping.
	 */
	uint64_t mid;
	int result;

	while (last >= base) {
		mid = base + ((last - base) >> 1);

		entry = vdev_indirect_mapping_entry(vim, mid);
		if (entry == NULL)
			break;
		result = dva_mapping_overlap_compare(&offset, entry);

		if (result == 0) {
			break;
		} else if (result < 0) {
			last = mid - 1;
		} else {
			base = mid + 1;
		}
	}
	return (entry);
}

/*
 * Given an indirect vdev and an extent on that vdev, it duplicates the
 * physical entries of the indirect mapping that correspond to the extent
 * to a new array and returns a pointer to it. In addition, copied_entries
 * is populated with the number of mapping entries that were duplicated.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
	vdev_indirect_mapping_t *vim = vd->v_mapping;
	uint64_t entries = 0;

	vdev_indirect_mapping_entry_phys_t *first_mapping =
	    vdev_indirect_mapping_entry_for_offset(vim, offset);
	ASSERT3P(first_mapping, !=, NULL);

	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
	while (asize > 0) {
		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
		uint64_t inner_size = MIN(asize, size - inner_offset);

		offset += inner_size;
		asize -= inner_size;
		entries++;
		m++;
	}

	size_t copy_length = entries * sizeof (*first_mapping);
	duplicate_mappings = malloc(copy_length);
	if (duplicate_mappings != NULL)
		bcopy(first_mapping, duplicate_mappings, copy_length);
	else
		entries = 0;

	*copied_entries = entries;

	return (duplicate_mappings);
}

static vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd;
	vdev_list_t *vlist;

	vlist = &spa->spa_root_vdev->v_children;
	STAILQ_FOREACH(rvd, vlist, v_childlink)
		if (rvd->v_id == vdev)
			break;

	return (rvd);
}

/*
 * This is a callback for vdev_indirect_remap() which allocates an
 * indirect_split_t for each split segment and adds it to iv_splits.
 */
static void
vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	int n = 1;
	zio_t *zio = arg;
	indirect_vsd_t *iv = zio->io_vsd;

	if (vd->v_read == vdev_indirect_read)
		return;

	if (vd->v_read == vdev_mirror_read)
		n = vd->v_nchildren;

	indirect_split_t *is =
	    malloc(offsetof(indirect_split_t, is_child[n]));
	if (is == NULL) {
		zio->io_error = ENOMEM;
		return;
	}
	bzero(is, offsetof(indirect_split_t, is_child[n]));

	is->is_children = n;
	is->is_size = size;
	is->is_split_offset = split_offset;
	is->is_target_offset = offset;
	is->is_vdev = vd;

	/*
	 * Note that we only consider multiple copies of the data for
	 * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
	 * though they use the same ops as mirror, because there's only one
	 * "good" copy under the replacing/spare.
	 */
	if (vd->v_read == vdev_mirror_read) {
		int i = 0;
		vdev_t *kid;

		STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
			is->is_child[i++].ic_vdev = kid;
		}
	} else {
		is->is_child[0].ic_vdev = vd;
	}

	list_insert_tail(&iv->iv_splits, is);
}

static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
{
	list_t stack;
	spa_t *spa = vd->v_spa;
	zio_t *zio = arg;
	remap_segment_t *rs;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	rs = rs_alloc(vd, offset, asize, 0);
	if (rs == NULL) {
		printf("vdev_indirect_remap: out of memory.\n");
		zio->io_error = ENOMEM;
	}
	for (; rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;
		uint64_t num_entries = 0;
		/* vdev_indirect_mapping_t *vim = v->v_mapping; */
		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
		    rs->rs_offset, rs->rs_asize, &num_entries);

		if (num_entries == 0)
			zio->io_error = ENOMEM;

		for (uint64_t i = 0; i < num_entries; i++) {
			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(m);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);
			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);

			if (dst_v->v_read == vdev_indirect_read) {
				remap_segment_t *o;

				o = rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset);
				if (o == NULL) {
					printf("vdev_indirect_remap: "
					    "out of memory.\n");
					zio->io_error = ENOMEM;
					break;
				}

				list_insert_head(&stack, o);
			}
			vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
			    dst_offset + inner_offset,
			    inner_size, arg);

			/*
			 * vdev_indirect_gather_splits can fail to
			 * allocate memory; we cannot recover from that.
			 */
			if (zio->io_error != 0)
				break;
			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
		}

		free(mapping);
		free(rs);
		if (zio->io_error != 0)
			break;
	}

	list_destroy(&stack);
}

static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;
	indirect_split_t *is;

	while ((is = list_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			free(ic->ic_data);
		}
		list_remove(&iv->iv_splits, is);
		free(is);
	}
	free(iv);
}

static int
vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	zio_t zio;
	spa_t *spa = vdev->v_spa;
	indirect_vsd_t *iv;
	indirect_split_t *first;
	int rc = EIO;

	iv = calloc(1, sizeof (*iv));
	if (iv == NULL)
		return (ENOMEM);

	list_create(&iv->iv_splits,
	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));

	bzero(&zio, sizeof (zio));
	zio.io_spa = spa;
	zio.io_bp = (blkptr_t *)bp;
	zio.io_data = buf;
	zio.io_size = bytes;
	zio.io_offset = offset;
	zio.io_vd = vdev;
	zio.io_vsd = iv;

	if (vdev->v_mapping == NULL) {
		vdev_indirect_config_t *vic;

		vic = &vdev->vdev_indirect_config;
		vdev->v_mapping = vdev_indirect_mapping_open(spa,
		    &spa->spa_mos, vic->vic_mapping_object);
	}

	vdev_indirect_remap(vdev, offset, bytes, &zio);
	if (zio.io_error != 0) {
		vdev_indirect_map_free(&zio);
		return (zio.io_error);
	}

	first = list_head(&iv->iv_splits);
	if (first->is_size == zio.io_size) {
		/*
		 * This is not a split block; we are pointing to the entire
		 * data, which will checksum the same as the original data.
		 * Pass the BP down so that the child i/o can verify the
		 * checksum, and try a different location if available
		 * (e.g. on a mirror).
		 *
		 * While this special case could be handled the same as the
		 * general (split block) case, doing it this way ensures
		 * that the vast majority of blocks on indirect vdevs
		 * (which are not split) are handled identically to blocks
		 * on non-indirect vdevs.  This allows us to be less strict
		 * about performance in the general (but rare) case.
		 */
		rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
		    zio.io_data, first->is_target_offset, bytes);
	} else {
		iv->iv_split_block = B_TRUE;
		/*
		 * Read one copy of each split segment, from the
		 * top-level vdev.  Since we don't know the
		 * checksum of each split individually, the child
		 * zio can't ensure that we get the right data.
		 * E.g. if it's a mirror, it will just read from a
		 * random (healthy) leaf vdev.  We have to verify
		 * the checksum in vdev_indirect_io_done().
		 */
		for (indirect_split_t *is = list_head(&iv->iv_splits);
		    is != NULL; is = list_next(&iv->iv_splits, is)) {
			char *ptr = zio.io_data;

			rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
			    ptr + is->is_split_offset, is->is_target_offset,
			    is->is_size);
		}
		if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
			rc = ECKSUM;
		else
			rc = 0;
	}

	vdev_indirect_map_free(&zio);
	if (rc == 0)
		rc = zio.io_error;

	return (rc);
}

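/*
 * Note that a DVA's offset is relative to the start of the vdev's data
 * area, so a plain disk read is shifted by VDEV_LABEL_START_SIZE to
 * skip the two front labels and the boot block.
 */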
static int
vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{

	return (vdev_read_phys(vdev, bp, buf,
	    offset + VDEV_LABEL_START_SIZE, bytes));
}


static int
vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	vdev_t *kid;
	int rc;

	rc = EIO;
	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		if (kid->v_state != VDEV_STATE_HEALTHY)
			continue;
		rc = kid->v_read(kid, bp, buf, offset, bytes);
		if (!rc)
			return (0);
	}

	return (rc);
}

static int
vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	vdev_t *kid;

	/*
	 * Here we should have two kids:
	 * The first is the vdev being replaced; it is the only one we
	 * can trust to have valid data, but it might not be present.
	 * The second is the vdev we are replacing it with. It is most
	 * likely healthy, but we can't trust it to have all the data
	 * we need, so we won't read from it.
	 */
	kid = STAILQ_FIRST(&vdev->v_children);
	if (kid == NULL)
		return (EIO);
	if (kid->v_state != VDEV_STATE_HEALTHY)
		return (EIO);
	return (kid->v_read(kid, bp, buf, offset, bytes));
}

static vdev_t *
vdev_find(uint64_t guid)
{
	vdev_t *vdev;

	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
		if (vdev->v_guid == guid)
			return (vdev);

	return (NULL);
}

static vdev_t *
vdev_create(uint64_t guid, vdev_read_t *vdev_read)
{
	vdev_t *vdev;
	vdev_indirect_config_t *vic;

	vdev = calloc(1, sizeof (vdev_t));
	if (vdev != NULL) {
		STAILQ_INIT(&vdev->v_children);
		vdev->v_guid = guid;
		vdev->v_read = vdev_read;

		/*
		 * The root vdev has no read function; we use this fact
		 * to skip setting up data the root vdev does not need.
		 * The root vdev is only referenced from the spa.
		 */
		if (vdev_read != NULL) {
			vic = &vdev->vdev_indirect_config;
			vic->vic_prev_indirect_vdev = UINT64_MAX;
			STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
		}
	}

	return (vdev);
}

static void
vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist)
{
	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
	uint64_t is_log;

	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
	is_log = 0;
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
	    &is_offline);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
	    &is_removed);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
	    &is_faulted);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
	    NULL, &is_degraded);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
	    NULL, &isnt_present);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
	    &is_log);

	if (is_offline != 0)
		vdev->v_state = VDEV_STATE_OFFLINE;
	else if (is_removed != 0)
		vdev->v_state = VDEV_STATE_REMOVED;
	else if (is_faulted != 0)
		vdev->v_state = VDEV_STATE_FAULTED;
	else if (is_degraded != 0)
		vdev->v_state = VDEV_STATE_DEGRADED;
	else if (isnt_present != 0)
		vdev->v_state = VDEV_STATE_CANT_OPEN;

	vdev->v_islog = is_log != 0;
}

static int
vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp)
{
	uint64_t id, ashift, asize, nparity;
	const char *path;
	const char *type;
	vdev_t *vdev;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
	    NULL, &type)) {
		return (ENOENT);
	}

	if (strcmp(type, VDEV_TYPE_MIRROR) != 0 &&
	    strcmp(type, VDEV_TYPE_DISK) != 0 &&
#ifdef ZFS_TEST
	    strcmp(type, VDEV_TYPE_FILE) != 0 &&
#endif
	    strcmp(type, VDEV_TYPE_RAIDZ) != 0 &&
	    strcmp(type, VDEV_TYPE_INDIRECT) != 0 &&
	    strcmp(type, VDEV_TYPE_REPLACING) != 0) {
		printf("ZFS: can only boot from disk, mirror, raidz1, "
		    "raidz2 and raidz3 vdevs\n");
		return (EIO);
	}

	if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
		vdev = vdev_create(guid, vdev_mirror_read);
	else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
		vdev = vdev_create(guid, vdev_raidz_read);
	else if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
		vdev = vdev_create(guid, vdev_replacing_read);
	else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
		vdev_indirect_config_t *vic;

		vdev = vdev_create(guid, vdev_indirect_read);
		if (vdev != NULL) {
			vdev->v_state = VDEV_STATE_HEALTHY;
			vic = &vdev->vdev_indirect_config;

			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_OBJECT,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_mapping_object);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_BIRTHS,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_births_object);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_prev_indirect_vdev);
		}
	} else {
		vdev = vdev_create(guid, vdev_disk_read);
	}

	if (vdev == NULL)
		return (ENOMEM);

	vdev_set_initial_state(vdev, nvlist);
	vdev->v_id = id;
	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
	    DATA_TYPE_UINT64, NULL, &ashift) == 0)
		vdev->v_ashift = ashift;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
	    DATA_TYPE_UINT64, NULL, &asize) == 0) {
		vdev->v_psize = asize +
		    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
	    DATA_TYPE_UINT64, NULL, &nparity) == 0)
		vdev->v_nparity = nparity;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
	    DATA_TYPE_STRING, NULL, &path) == 0) {
		if (strncmp(path, "/dev/dsk/", 9) == 0)
			path += 9;
		vdev->v_name = strdup(path);
		if (nvlist_find(nvlist, ZPOOL_CONFIG_PHYS_PATH,
		    DATA_TYPE_STRING, NULL, &path) == 0) {
			vdev->v_phys_path = strdup(path);
		} else {
			vdev->v_phys_path = NULL;
		}
		if (nvlist_find(nvlist, ZPOOL_CONFIG_DEVID,
		    DATA_TYPE_STRING, NULL, &path) == 0) {
			vdev->v_devid = strdup(path);
		} else {
			vdev->v_devid = NULL;
		}
	} else {
		char *name;

		name = NULL;
		if (strcmp(type, "raidz") == 0) {
			if (vdev->v_nparity < 1 ||
			    vdev->v_nparity > 3) {
				printf("ZFS: invalid raidz parity: %d\n",
				    vdev->v_nparity);
				return (EIO);
			}
			(void) asprintf(&name, "%s%d-%" PRIu64, type,
			    vdev->v_nparity, id);
		} else {
			(void) asprintf(&name, "%s-%" PRIu64, type, id);
		}
		vdev->v_name = name;
	}
	*vdevp = vdev;
	return (0);
}

/*
 * Find the slot for a vdev. We return either NULL, to signal use of
 * STAILQ_INSERT_HEAD, or the link element to be used with
 * STAILQ_INSERT_AFTER.
 */
static vdev_t *
vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *v, *previous;

	if (STAILQ_EMPTY(&top_vdev->v_children))
		return (NULL);

	previous = NULL;
	STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
		if (v->v_id > vdev->v_id)
			return (previous);

		if (v->v_id == vdev->v_id)
			return (v);

		if (v->v_id < vdev->v_id)
			previous = v;
	}
	return (previous);
}

static size_t
vdev_child_count(vdev_t *vdev)
{
	vdev_t *v;
	size_t count;

	count = 0;
	STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
		count++;
	}
	return (count);
}

/*
 * Insert vdev into top_vdev children list. List is ordered by v_id.
 */
static void
vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *previous;
	size_t count;

	/*
	 * Top-level vdevs can appear in random order, depending on how
	 * the firmware presents the disk devices.
	 * However, we insert each vdev so that the list stays ordered
	 * by v_id, which lets us use either STAILQ_INSERT_HEAD or
	 * STAILQ_INSERT_AFTER, as STAILQ has no insert-before.
	 */
	previous = vdev_find_previous(top_vdev, vdev);

	if (previous == NULL) {
		STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
	} else if (previous->v_id == vdev->v_id) {
		/*
		 * This vdev was configured from label config,
		 * do not insert duplicate.
		 */
		return;
	} else {
		STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
		    v_childlink);
	}

	count = vdev_child_count(top_vdev);
	if (top_vdev->v_nchildren < count)
		top_vdev->v_nchildren = count;
}

static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist)
{
	vdev_t *top_vdev, *vdev;
	const unsigned char *kids;
	int rc, nkids;

	/* Get top vdev. */
	top_vdev = vdev_find(top_guid);
	if (top_vdev == NULL) {
		rc = vdev_init(top_guid, nvlist, &top_vdev);
		if (rc != 0)
			return (rc);
		top_vdev->v_spa = spa;
		top_vdev->v_top = top_vdev;
		vdev_insert(spa->spa_root_vdev, top_vdev);
	}

	/* Add children if there are any. */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid);
			if (rc != 0)
				return (rc);
			rc = vdev_init(guid, kids, &vdev);
			if (rc != 0)
				return (rc);

			vdev->v_spa = spa;
			vdev->v_top = top_vdev;
			vdev_insert(top_vdev, vdev);

			kids = nvlist_next(kids);
		}
	} else {
		/*
		 * nvlist_find() returns an error when there are no
		 * children; reset it, since leaf vdevs legitimately
		 * have none.
		 */
		rc = 0;
	}

	return (rc);
}

static int
vdev_init_from_label(spa_t *spa, const unsigned char *nvlist)
{
	uint64_t pool_guid, top_guid;
	const unsigned char *vdevs;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
	    NULL, &top_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	return (vdev_from_nvlist(spa, top_guid, vdevs));
}

static void
vdev_set_state(vdev_t *vdev)
{
	vdev_t *kid;
	int good_kids;
	int bad_kids;

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		vdev_set_state(kid);
	}

	/*
	 * A mirror or raidz is healthy if all its kids are healthy. A
	 * mirror is degraded if at least one, but not all, of its kids
	 * is healthy; a raidz is degraded if at most nparity kids are
	 * offline.
	 */
	if (STAILQ_FIRST(&vdev->v_children)) {
		good_kids = 0;
		bad_kids = 0;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
			if (kid->v_state == VDEV_STATE_HEALTHY)
				good_kids++;
			else
				bad_kids++;
		}
		if (bad_kids == 0) {
			vdev->v_state = VDEV_STATE_HEALTHY;
		} else {
			if (vdev->v_read == vdev_mirror_read) {
				if (good_kids) {
					vdev->v_state = VDEV_STATE_DEGRADED;
				} else {
					vdev->v_state = VDEV_STATE_OFFLINE;
				}
			} else if (vdev->v_read == vdev_raidz_read) {
				if (bad_kids > vdev->v_nparity) {
					vdev->v_state = VDEV_STATE_OFFLINE;
				} else {
					vdev->v_state = VDEV_STATE_DEGRADED;
				}
			}
		}
	}
}

static int
vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist)
{
	vdev_t *vdev;
	const unsigned char *kids;
	int rc, nkids;

	/* Update top vdev. */
	vdev = vdev_find(top_guid);
	if (vdev != NULL)
		vdev_set_initial_state(vdev, nvlist);

	/* Update children if there are any. */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid);
			if (rc != 0)
				break;

			vdev = vdev_find(guid);
			if (vdev != NULL)
				vdev_set_initial_state(vdev, kids);

			kids = nvlist_next(kids);
		}
	} else {
		rc = 0;
	}

	return (rc);
}

static int
vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist)
{
	uint64_t pool_guid, vdev_children;
	const unsigned char *vdevs, *kids;
	int rc, nkids;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
	    NULL, &vdev_children) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	/* Wrong guid?! */
	if (spa->spa_guid != pool_guid)
		return (EINVAL);

	spa->spa_root_vdev->v_nchildren = vdev_children;

	rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids);

	/*
	 * MOS config has at least one child for root vdev.
	 */
	if (rc != 0)
		return (rc);

	for (int i = 0; i < nkids; i++) {
		uint64_t guid;
		vdev_t *vdev;

		rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
		    NULL, &guid);
		if (rc != 0)
			break;
		vdev = vdev_find(guid);
		/*
		 * Top level vdev is missing, create it.
		 */
		if (vdev == NULL)
			rc = vdev_from_nvlist(spa, guid, kids);
		else
			rc = vdev_update_from_nvlist(guid, kids);
		if (rc != 0)
			break;
		kids = nvlist_next(kids);
	}

	/*
	 * Re-evaluate top-level vdev state.
	 */
	vdev_set_state(spa->spa_root_vdev);

	return (rc);
}

static spa_t *
spa_find_by_guid(uint64_t guid)
{
	spa_t *spa;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
		if (spa->spa_guid == guid)
			return (spa);

	return (NULL);
}

static spa_t *
spa_find_by_name(const char *name)
{
	spa_t *spa;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
		if (strcmp(spa->spa_name, name) == 0)
			return (spa);

	return (NULL);
}

spa_t *
spa_get_primary(void)
{
	return (STAILQ_FIRST(&zfs_pools));
}

vdev_t *
spa_get_primary_vdev(const spa_t *spa)
{
	vdev_t *vdev;
	vdev_t *kid;

	if (spa == NULL)
		spa = spa_get_primary();
	if (spa == NULL)
		return (NULL);
	vdev = spa->spa_root_vdev;
	if (vdev == NULL)
		return (NULL);
	for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
	    kid = STAILQ_FIRST(&vdev->v_children))
		vdev = kid;
	return (vdev);
}

static spa_t *
spa_create(uint64_t guid, const char *name)
{
	spa_t *spa;

	if ((spa = calloc(1, sizeof (spa_t))) == NULL)
		return (NULL);
	if ((spa->spa_name = strdup(name)) == NULL) {
		free(spa);
		return (NULL);
	}
	spa->spa_guid = guid;
	spa->spa_root_vdev = vdev_create(guid, NULL);
	if (spa->spa_root_vdev == NULL) {
		free(spa->spa_name);
		free(spa);
		return (NULL);
	}
	spa->spa_root_vdev->v_name = strdup("root");
	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);

	return (spa);
}

static const char *
state_name(vdev_state_t state)
{
	static const char *names[] = {
		"UNKNOWN",
		"CLOSED",
		"OFFLINE",
		"REMOVED",
		"CANT_OPEN",
		"FAULTED",
		"DEGRADED",
		"ONLINE"
	};
	return (names[state]);
}

static int
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof (line), fmt, args);
	va_end(args);
	return (pager_output(line));
}

#define	STATUS_FORMAT	"        %s %s\n"

static int
print_state(int indent, const char *name, vdev_state_t state)
{
	int i;
	char buf[512];

	buf[0] = 0;
	for (i = 0; i < indent; i++)
		strcat(buf, "  ");
	strcat(buf, name);
	return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
}

static int
vdev_status(vdev_t *vdev, int indent)
{
	vdev_t *kid;
	int ret;

	if (vdev->v_islog) {
		(void) pager_output("        logs\n");
		indent++;
	}

	ret = print_state(indent, vdev->v_name, vdev->v_state);
	if (ret != 0)
		return (ret);

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		ret = vdev_status(kid, indent + 1);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

static int
spa_status(spa_t *spa)
{
	static char bootfs[ZFS_MAXNAMELEN];
	uint64_t rootid;
	vdev_list_t *vlist;
	vdev_t *vdev;
	int good_kids, bad_kids, degraded_kids, ret;
	vdev_state_t state;

	ret = pager_printf("  pool: %s\n", spa->spa_name);
	if (ret != 0)
		return (ret);

	if (zfs_get_root(spa, &rootid) == 0 &&
	    zfs_rlookup(spa, rootid, bootfs) == 0) {
		if (bootfs[0] == '\0')
			ret = pager_printf("bootfs: %s\n", spa->spa_name);
		else
			ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
			    bootfs);
		if (ret != 0)
			return (ret);
	}
	ret = pager_printf("config:\n\n");
	if (ret != 0)
		return (ret);
	ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
	if (ret != 0)
		return (ret);

	good_kids = 0;
	degraded_kids = 0;
	bad_kids = 0;
	vlist = &spa->spa_root_vdev->v_children;
	STAILQ_FOREACH(vdev, vlist, v_childlink) {
		if (vdev->v_state == VDEV_STATE_HEALTHY)
			good_kids++;
		else if (vdev->v_state == VDEV_STATE_DEGRADED)
			degraded_kids++;
		else
			bad_kids++;
	}

	state = VDEV_STATE_CLOSED;
	if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
		state = VDEV_STATE_HEALTHY;
	else if ((good_kids + degraded_kids) > 0)
		state = VDEV_STATE_DEGRADED;

	ret = print_state(0, spa->spa_name, state);
	if (ret != 0)
		return (ret);

	STAILQ_FOREACH(vdev, vlist, v_childlink) {
		ret = vdev_status(vdev, 1);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

int
spa_all_status(void)
{
	spa_t *spa;
	int first = 1, ret = 0;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
		if (!first) {
			ret = pager_printf("\n");
			if (ret != 0)
				return (ret);
		}
		first = 0;
		ret = spa_status(spa);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

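/*
 * ZFS keeps VDEV_LABELS (four) copies of the vdev label: labels 0 and 1
 * at the front of the device and labels 2 and 3 at the end.
 * vdev_label_offset() translates (device size, label index, offset
 * within the label) into a byte offset on the device.
 */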
uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
	uint64_t label_offset;

	if (l < VDEV_LABELS / 2)
		label_offset = 0;
	else
		label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);

	return (offset + l * sizeof (vdev_label_t) + label_offset);
}

static int
vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
	unsigned int seq1 = 0;
	unsigned int seq2 = 0;
	int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);

	if (cmp != 0)
		return (cmp);

	cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
	if (cmp != 0)
		return (cmp);

	if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
		seq1 = MMP_SEQ(ub1);

	if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
		seq2 = MMP_SEQ(ub2);

	return (AVL_CMP(seq1, seq2));
}

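/*
 * The uberblock may have been written by a system of the opposite
 * endianness; detect that from the byte-swapped magic and swap the
 * whole structure in place before validating it.
 */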
static int
uberblock_verify(uberblock_t *ub)
{
	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
		byteswap_uint64_array(ub, sizeof (uberblock_t));
	}

	if (ub->ub_magic != UBERBLOCK_MAGIC ||
	    !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
		return (EINVAL);

	return (0);
}

static int
vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
    size_t size)
{
	blkptr_t bp;
	off_t off;

	off = vdev_label_offset(vd->v_psize, l, offset);

	BP_ZERO(&bp);
	BP_SET_LSIZE(&bp, size);
	BP_SET_PSIZE(&bp, size);
	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);

	return (vdev_read_phys(vd, &bp, buf, off, size));
}

static unsigned char *
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
	vdev_phys_t *label;
	uint64_t best_txg = 0;
	uint64_t label_txg = 0;
	uint64_t asize;
	unsigned char *nvl;
	size_t nvl_size;
	int error;

	label = malloc(sizeof (vdev_phys_t));
	if (label == NULL)
		return (NULL);

	nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
	nvl = malloc(nvl_size);
	if (nvl == NULL)
		goto done;

	for (int l = 0; l < VDEV_LABELS; l++) {
		const unsigned char *nvlist;

		if (vdev_label_read(vd, l, label,
		    offsetof(vdev_label_t, vl_vdev_phys),
		    sizeof (vdev_phys_t)))
			continue;

		if (label->vp_nvlist[0] != NV_ENCODE_XDR)
			continue;

		nvlist = (const unsigned char *) label->vp_nvlist + 4;
		error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
		    DATA_TYPE_UINT64, NULL, &label_txg);
		if (error != 0 || label_txg == 0) {
			memcpy(nvl, nvlist, nvl_size);
			goto done;
		}

		if (label_txg <= txg && label_txg > best_txg) {
			best_txg = label_txg;
			memcpy(nvl, nvlist, nvl_size);

			/*
			 * Use asize from the pool config; the BIOS may
			 * report a bad device size.
			 */
			if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
			    DATA_TYPE_UINT64, NULL, &asize) == 0) {
				vd->v_psize = asize +
				    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
			}
		}
	}

	if (best_txg == 0) {
		free(nvl);
		nvl = NULL;
	}
done:
	free(label);
	return (nvl);
}

static void
vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
{
	uberblock_t *buf;

	buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
	if (buf == NULL)
		return;

	for (int l = 0; l < VDEV_LABELS; l++) {
		for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
			if (vdev_label_read(vd, l, buf,
			    VDEV_UBERBLOCK_OFFSET(vd, n),
			    VDEV_UBERBLOCK_SIZE(vd)))
				continue;
			if (uberblock_verify(buf) != 0)
				continue;

			if (vdev_uberblock_compare(buf, ub) > 0)
				*ub = *buf;
		}
	}
	free(buf);
}

static int
vdev_probe(vdev_phys_read_t *phys_read, void *read_priv, spa_t **spap)
{
	vdev_t vtmp;
	spa_t *spa;
	vdev_t *vdev;
	unsigned char *nvlist;
	uint64_t val;
	uint64_t guid, vdev_children;
	uint64_t pool_txg, pool_guid;
	const char *pool_name;
	const unsigned char *features;
	int rc;

	/*
	 * Load the vdev label and figure out which
	 * uberblock is most current.
	 */
	memset(&vtmp, 0, sizeof (vtmp));
	vtmp.v_phys_read = phys_read;
	vtmp.v_read_priv = read_priv;
	vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
	    (uint64_t)sizeof (vdev_label_t));

	/* Test for minimum device size. */
	if (vtmp.v_psize < SPA_MINDEVSIZE)
		return (EIO);

	nvlist = vdev_label_read_config(&vtmp, UINT64_MAX);
	if (nvlist == NULL)
		return (EIO);

	if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
	    NULL, &val) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (!SPA_VERSION_IS_SUPPORTED(val)) {
		printf("ZFS: unsupported ZFS version %u (should be %u)\n",
		    (unsigned)val, (unsigned)SPA_VERSION);
		free(nvlist);
		return (EIO);
	}

	/* Check ZFS features for read */
	if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
	    DATA_TYPE_NVLIST, NULL, &features) == 0 &&
	    nvlist_check_features_for_read(features) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
	    NULL, &val) != 0) {
		free(nvlist);
		return (EIO);
	}

	if (val == POOL_STATE_DESTROYED) {
		/* We don't boot from destroyed pools. */
		free(nvlist);
		return (EIO);
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
	    NULL, &pool_txg) != 0 ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid) != 0 ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
	    NULL, &pool_name) != 0) {
		/*
		 * Cache and spare devices end up here - just ignore
		 * them.
		 */
		free(nvlist);
		return (EIO);
	}

	/*
	 * Create the pool if this is the first time we've seen it.
	 */
	spa = spa_find_by_guid(pool_guid);
	if (spa == NULL) {
		nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN,
		    DATA_TYPE_UINT64, NULL, &vdev_children);
		spa = spa_create(pool_guid, pool_name);
		if (spa == NULL) {
			free(nvlist);
			return (ENOMEM);
		}
		spa->spa_root_vdev->v_nchildren = vdev_children;
	}
	if (pool_txg > spa->spa_txg)
		spa->spa_txg = pool_txg;

	/*
	 * Get the vdev tree and create our in-core copy of it.
	 * If we already have a vdev with this guid, this must
	 * be some kind of alias (overlapping slices, dangerously dedicated
	 * disks etc).
	 */
	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
	    NULL, &guid) != 0) {
		free(nvlist);
		return (EIO);
	}
	vdev = vdev_find(guid);
	/* Has this vdev already been inited? */
	if (vdev && vdev->v_phys_read) {
		free(nvlist);
		return (EIO);
	}

	rc = vdev_init_from_label(spa, nvlist);
	free(nvlist);
	if (rc != 0)
		return (rc);

	/*
	 * We should already have created an incomplete vdev for this
	 * vdev. Find it and initialise it with our read proc.
	 */
	vdev = vdev_find(guid);
	if (vdev != NULL) {
		vdev->v_phys_read = phys_read;
		vdev->v_read_priv = read_priv;
		vdev->v_psize = vtmp.v_psize;
		/*
		 * If no other state is set, mark vdev healthy.
		 */
		if (vdev->v_state == VDEV_STATE_UNKNOWN)
			vdev->v_state = VDEV_STATE_HEALTHY;
	} else {
		printf("ZFS: inconsistent nvlist contents\n");
		return (EIO);
	}

	if (vdev->v_islog)
		spa->spa_with_log = vdev->v_islog;

	/* Record boot vdev for spa. */
	if (spa->spa_boot_vdev == NULL)
		spa->spa_boot_vdev = vdev;

	/*
	 * Re-evaluate top-level vdev state.
	 */
	vdev_set_state(vdev->v_top);

	/*
	 * OK, we are happy with the pool so far. Let's find
	 * the best uberblock and then we can actually access
	 * the contents of the pool.
	 */
	vdev_uberblock_load(vdev, &spa->spa_uberblock);

	if (spap != NULL)
		*spap = spa;
	return (0);
}

static int
ilog2(int n)
{
	int v;

	for (v = 0; v < 32; v++)
		if (n == (1 << v))
			return (v);
	return (-1);
}

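/*
 * A gang block stores a logical block as several smaller physical
 * blocks.  The gang header is a block of SPA_GBH_NBLKPTRS block
 * pointers; the logical data is the concatenation of the (non-hole)
 * constituents, and the checksum in the original BP covers the
 * reassembled result.
 */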
2069static int
2070zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
2071{
2072	blkptr_t gbh_bp;
2073	zio_gbh_phys_t zio_gb;
2074	char *pbuf;
2075	int i;
2076
2077	/* Artificial BP for gang block header. */
2078	gbh_bp = *bp;
2079	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
2080	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
2081	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
2082	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
2083	for (i = 0; i < SPA_DVAS_PER_BP; i++)
2084		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
2085
2086	/* Read gang header block using the artificial BP. */
2087	if (zio_read(spa, &gbh_bp, &zio_gb))
2088		return (EIO);
2089
2090	pbuf = buf;
2091	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
2092		blkptr_t *gbp = &zio_gb.zg_blkptr[i];
2093
2094		if (BP_IS_HOLE(gbp))
2095			continue;
2096		if (zio_read(spa, gbp, pbuf))
2097			return (EIO);
2098		pbuf += BP_GET_PSIZE(gbp);
2099	}
2100
2101	if (zio_checksum_verify(spa, bp, buf))
2102		return (EIO);
2103	return (0);
2104}
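/*
 * Gang header sizing, for reference (assuming the usual on-disk
 * constants): SPA_GANGBLOCKSIZE is one 512-byte sector, a blkptr_t is
 * 128 bytes and the zio_eck_t checksum trailer is 40 bytes, so
 * SPA_GBH_NBLKPTRS == (512 - 40) / 128 == 3 constituent block pointers
 * fit, with the remainder taken up by filler words.
 */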
2105
2106static int
2107zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
2108{
2109	int cpfunc = BP_GET_COMPRESS(bp);
2110	uint64_t align, size;
2111	void *pbuf;
2112	int i, error;
2113
2114	/*
2115	 * Process data embedded in block pointer
2116	 */
2117	if (BP_IS_EMBEDDED(bp)) {
2118		ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
2119
2120		size = BPE_GET_PSIZE(bp);
2121		ASSERT(size <= BPE_PAYLOAD_SIZE);
2122
2123		if (cpfunc != ZIO_COMPRESS_OFF)
2124			pbuf = malloc(size);
2125		else
2126			pbuf = buf;
2127
2128		if (pbuf == NULL)
2129			return (ENOMEM);
2130
2131		decode_embedded_bp_compressed(bp, pbuf);
2132		error = 0;
2133
2134		if (cpfunc != ZIO_COMPRESS_OFF) {
2135			error = zio_decompress_data(cpfunc, pbuf,
2136			    size, buf, BP_GET_LSIZE(bp));
2137			free(pbuf);
2138		}
2139		if (error != 0)
2140			printf("ZFS: i/o error - unable to decompress "
2141			    "block pointer data, error %d\n", error);
2142		return (error);
2143	}
2144
2145	error = EIO;
2146
2147	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
2148		const dva_t *dva = &bp->blk_dva[i];
2149		vdev_t *vdev;
2150		vdev_list_t *vlist;
2151		uint64_t vdevid;
2152		off_t offset;
2153
2154		if (!dva->dva_word[0] && !dva->dva_word[1])
2155			continue;
2156
2157		vdevid = DVA_GET_VDEV(dva);
2158		offset = DVA_GET_OFFSET(dva);
2159		vlist = &spa->spa_root_vdev->v_children;
2160		STAILQ_FOREACH(vdev, vlist, v_childlink) {
2161			if (vdev->v_id == vdevid)
2162				break;
2163		}
2164		if (!vdev || !vdev->v_read)
2165			continue;
2166
2167		size = BP_GET_PSIZE(bp);
2168		if (vdev->v_read == vdev_raidz_read) {
2169			align = 1ULL << vdev->v_ashift;
2170			if (P2PHASE(size, align) != 0)
2171				size = P2ROUNDUP(size, align);
2172		}
2173		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
2174			pbuf = malloc(size);
2175		else
2176			pbuf = buf;
2177
2178		if (pbuf == NULL) {
2179			error = ENOMEM;
2180			break;
2181		}
2182
2183		if (DVA_GET_GANG(dva))
2184			error = zio_read_gang(spa, bp, pbuf);
2185		else
2186			error = vdev->v_read(vdev, bp, pbuf, offset, size);
2187		if (error == 0) {
2188			if (cpfunc != ZIO_COMPRESS_OFF)
2189				error = zio_decompress_data(cpfunc, pbuf,
2190				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
2191			else if (size != BP_GET_PSIZE(bp))
2192				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
2193		}
2194		if (buf != pbuf)
2195			free(pbuf);
2196		if (error == 0)
2197			break;
2198	}
2199	if (error != 0)
2200		printf("ZFS: i/o error - all block copies unavailable\n");
2201
2202	return (error);
2203}
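/*
 * Worked example of the RAID-Z alignment above, with illustrative
 * numbers: for v_ashift == 12 the allocation unit is align == 4096, so
 * a 6656-byte physical block is read as P2ROUNDUP(6656, 4096) == 8192
 * bytes into a scratch buffer, and only the first BP_GET_PSIZE(bp)
 * bytes are copied (or decompressed) into the caller's buffer.
 */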
2204
2205static int
2206dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset,
2207    void *buf, size_t buflen)
2208{
2209	int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
2210	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2211	int nlevels = dnode->dn_nlevels;
2212	int i, rc;
2213
2214	if (bsize > SPA_MAXBLOCKSIZE) {
2215		printf("ZFS: I/O error - blocks larger than %llu are not "
2216		    "supported\n", SPA_MAXBLOCKSIZE);
2217		return (EIO);
2218	}
2219
2220	/*
2221	 * Note: bsize may not be a power of two here so we need to do an
2222	 * actual divide rather than a bitshift.
2223	 */
2224	while (buflen > 0) {
2225		uint64_t bn = offset / bsize;
2226		int boff = offset % bsize;
2227		int ibn;
2228		const blkptr_t *indbp;
2229		blkptr_t bp;
2230
2231		if (bn > dnode->dn_maxblkid) {
2232			printf("warning: zfs bug: bn %llx > dn_maxblkid %llx\n",
2233			    (unsigned long long)bn,
2234			    (unsigned long long)dnode->dn_maxblkid);
2235			/*
2236			 * Known zfs bug: do not treat this as an
2237			 * error; otherwise we would return (EIO).
2238			 */
2239		}
2240
2241		if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
2242			goto cached;
2243
2244		indbp = dnode->dn_blkptr;
2245		for (i = 0; i < nlevels; i++) {
2246			/*
2247			 * Copy the bp from the indirect array so that
2248			 * we can re-use the scratch buffer for multi-level
2249			 * objects.
2250			 */
2251			ibn = bn >> ((nlevels - i - 1) * ibshift);
2252			ibn &= ((1 << ibshift) - 1);
2253			bp = indbp[ibn];
2254			if (BP_IS_HOLE(&bp)) {
2255				memset(dnode_cache_buf, 0, bsize);
2256				break;
2257			}
2258			rc = zio_read(spa, &bp, dnode_cache_buf);
2259			if (rc)
2260				return (rc);
2261			indbp = (const blkptr_t *) dnode_cache_buf;
2262		}
2263		dnode_cache_obj = dnode;
2264		dnode_cache_bn = bn;
2265	cached:
2266
2267		/*
2268		 * The buffer contains our data block. Copy what we
2269		 * need from it and loop.
2270		 */
2271		i = bsize - boff;
2272		if (i > buflen)
			i = buflen;
2273		memcpy(buf, &dnode_cache_buf[boff], i);
2274		buf = ((char *)buf) + i;
2275		offset += i;
2276		buflen -= i;
2277	}
2278
2279	return (0);
2280}
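/*
 * A concrete instance of the indirection arithmetic above, assuming
 * the common dn_indblkshift of 17 (128 KB indirect blocks): a blkptr_t
 * is 1 << SPA_BLKPTRSHIFT == 128 bytes, so ibshift == 17 - 7 == 10 and
 * each indirect block holds 1024 block pointers.  With nlevels == 2,
 * data block bn is reached via dn_blkptr[bn >> 10] and then entry
 * (bn & 1023) of that indirect block.
 */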
2281
2282/*
2283 * Lookup a name in a microzap directory and return its value.
2284 */
2285static int
2286mzap_lookup(const mzap_phys_t *mz, size_t size, const char *name,
2287    uint64_t *value)
2288{
2289	const mzap_ent_phys_t *mze;
2290	int chunks, i;
2291
2292	/*
2293	 * Microzap objects use exactly one block; the caller has
2294	 * already read the whole thing into mz.
2295	 */
2296	chunks = size / MZAP_ENT_LEN - 1;
2297	for (i = 0; i < chunks; i++) {
2298		mze = &mz->mz_chunk[i];
2299		if (strcmp(mze->mze_name, name) == 0) {
2300			*value = mze->mze_value;
2301			return (0);
2302		}
2303	}
2304
2305	return (ENOENT);
2306}
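/*
 * Microzap sizing, assuming the usual MZAP_ENT_LEN of 64 bytes: the
 * first 64-byte slot of the block is the mzap_phys_t header, which is
 * why "size / MZAP_ENT_LEN - 1" chunks are scanned above; a minimal
 * 512-byte microzap thus holds 512 / 64 - 1 == 7 name/value entries.
 */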
2307
2308/*
2309 * Compare a name with a zap leaf entry. Return non-zero if the name
2310 * matches.
2311 */
2312static int
2313fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
2314    const char *name)
2315{
2316	size_t namelen;
2317	const zap_leaf_chunk_t *nc;
2318	const char *p;
2319
2320	namelen = zc->l_entry.le_name_numints;
2321
2322	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
2323	p = name;
2324	while (namelen > 0) {
2325		size_t len;
2326
2327		len = namelen;
2328		if (len > ZAP_LEAF_ARRAY_BYTES)
2329			len = ZAP_LEAF_ARRAY_BYTES;
2330		if (memcmp(p, nc->l_array.la_array, len))
2331			return (0);
2332		p += len;
2333		namelen -= len;
2334		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
2335	}
2336
2337	return (1);
2338}
2339
2340/*
2341 * Extract a uint64_t value from a zap leaf entry.
2342 */
2343static uint64_t
2344fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
2345{
2346	const zap_leaf_chunk_t *vc;
2347	int i;
2348	uint64_t value;
2349	const uint8_t *p;
2350
2351	vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
2352	for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
2353		value = (value << 8) | p[i];
2354	}
2355
2356	return (value);
2357}
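/*
 * The loop above assembles the value in big-endian order, which is how
 * ZAP leaf arrays store integers regardless of host byte order; e.g.
 * an 8-byte array ending in 0x01 0x2c decodes to 0x12c == 300.
 */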
2358
2359static void
2360stv(int len, void *addr, uint64_t value)
2361{
2362	switch (len) {
2363	case 1:
2364		*(uint8_t *)addr = value;
2365		return;
2366	case 2:
2367		*(uint16_t *)addr = value;
2368		return;
2369	case 4:
2370		*(uint32_t *)addr = value;
2371		return;
2372	case 8:
2373		*(uint64_t *)addr = value;
2374		return;
2375	}
2376}
2377
2378/*
2379 * Extract an array from a zap leaf entry.
2380 */
2381static void
2382fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
2383    uint64_t integer_size, uint64_t num_integers, void *buf)
2384{
2385	uint64_t array_int_len = zc->l_entry.le_value_intlen;
2386	uint64_t value = 0;
2387	uint64_t *u64 = buf;
2388	char *p = buf;
2389	int len = MIN(zc->l_entry.le_value_numints, num_integers);
2390	int chunk = zc->l_entry.le_value_chunk;
2391	int byten = 0;
2392
2393	if (integer_size == 8 && len == 1) {
2394		*u64 = fzap_leaf_value(zl, zc);
2395		return;
2396	}
2397
2398	while (len > 0) {
2399		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
2400		int i;
2401
2402		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
2403		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
2404			value = (value << 8) | la->la_array[i];
2405			byten++;
2406			if (byten == array_int_len) {
2407				stv(integer_size, p, value);
2408				byten = 0;
2409				len--;
2410				if (len == 0)
2411					return;
2412				p += integer_size;
2413			}
2414		}
2415		chunk = la->la_next;
2416	}
2417}
2418
2419static int
2420fzap_check_size(uint64_t integer_size, uint64_t num_integers)
2421{
2422
2423	switch (integer_size) {
2424	case 1:
2425	case 2:
2426	case 4:
2427	case 8:
2428		break;
2429	default:
2430		return (EINVAL);
2431	}
2432
2433	if (integer_size * num_integers > ZAP_MAXVALUELEN)
2434		return (E2BIG);
2435
2436	return (0);
2437}
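/*
 * With the usual ZAP_MAXVALUELEN of 8192 bytes, this accepts at most
 * 1024 uint64_t integers (or 8192 single-byte integers) per attribute.
 */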
2438
2439static void
2440zap_leaf_free(zap_leaf_t *leaf)
2441{
2442	free(leaf->l_phys);
2443	free(leaf);
2444}
2445
2446static int
2447zap_get_leaf_byblk(fat_zap_t *zap, uint64_t blk, zap_leaf_t **lp)
2448{
2449	int bs = FZAP_BLOCK_SHIFT(zap);
2450	int err;
2451
2452	*lp = malloc(sizeof (**lp));
2453	if (*lp == NULL)
2454		return (ENOMEM);
2455
2456	(*lp)->l_bs = bs;
2457	(*lp)->l_phys = malloc(1 << bs);
2458
2459	if ((*lp)->l_phys == NULL) {
2460		free(*lp);
2461		return (ENOMEM);
2462	}
2463	err = dnode_read(zap->zap_spa, zap->zap_dnode, blk << bs, (*lp)->l_phys,
2464	    1 << bs);
2465	if (err != 0) {
2466		zap_leaf_free(*lp);
2467	}
2468	return (err);
2469}
2470
2471static int
2472zap_table_load(fat_zap_t *zap, zap_table_phys_t *tbl, uint64_t idx,
2473    uint64_t *valp)
2474{
2475	int bs = FZAP_BLOCK_SHIFT(zap);
2476	uint64_t blk = idx >> (bs - 3);
2477	uint64_t off = idx & ((1 << (bs - 3)) - 1);
2478	uint64_t *buf;
2479	int rc;
2480
2481	buf = malloc(1 << zap->zap_block_shift);
2482	if (buf == NULL)
2483		return (ENOMEM);
2484	rc = dnode_read(zap->zap_spa, zap->zap_dnode, (tbl->zt_blk + blk) << bs,
2485	    buf, 1 << zap->zap_block_shift);
2486	if (rc == 0)
2487		*valp = buf[off];
2488	free(buf);
2489	return (rc);
2490}
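/*
 * The block/offset split above, spelled out: a table block of
 * 1 << bs bytes holds 1 << (bs - 3) uint64_t entries, so entry idx
 * lives in block idx >> (bs - 3) at word offset
 * idx & ((1 << (bs - 3)) - 1).  With the typical 16 KB fatzap block
 * (bs == 14) that is 2048 entries per block.
 */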
2491
2492static int
2493zap_idx_to_blk(fat_zap_t *zap, uint64_t idx, uint64_t *valp)
2494{
2495	if (zap->zap_phys->zap_ptrtbl.zt_numblks == 0) {
2496		*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
2497		return (0);
2498	} else {
2499		return (zap_table_load(zap, &zap->zap_phys->zap_ptrtbl,
2500		    idx, valp));
2501	}
2502}
2503
2504#define	ZAP_HASH_IDX(hash, n)	(((n) == 0) ? 0 : ((hash) >> (64 - (n))))
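/*
 * ZAP_HASH_IDX() takes the top n bits of the 64-bit hash as the
 * pointer-table index; e.g. with n == 10, hash 0xc000000000000000
 * maps to index 0x300.  The n == 0 special case avoids an undefined
 * 64-bit shift.
 */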
2505static int
2506zap_deref_leaf(fat_zap_t *zap, uint64_t h, zap_leaf_t **lp)
2507{
2508	uint64_t idx, blk;
2509	int err;
2510
2511	idx = ZAP_HASH_IDX(h, zap->zap_phys->zap_ptrtbl.zt_shift);
2512	err = zap_idx_to_blk(zap, idx, &blk);
2513	if (err != 0)
2514		return (err);
2515	return (zap_get_leaf_byblk(zap, blk, lp));
2516}
2517
2518#define	CHAIN_END	0xffff	/* end of the chunk chain */
2519#define	LEAF_HASH(l, h) \
2520	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
2521	((h) >> \
2522	(64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
2523#define	LEAF_HASH_ENTPTR(l, h)	(&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
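/*
 * LEAF_HASH() picks a hash bucket within a leaf: after discarding the
 * lh_prefix_len bits that routed the hash to this leaf, the next
 * ZAP_LEAF_HASH_SHIFT(l) bits index into l_phys->l_hash[].  Each
 * bucket heads a CHAIN_END-terminated chain of entry chunks that
 * zap_leaf_lookup() below walks via le_next.
 */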
2524
2525static int
2526zap_leaf_lookup(zap_leaf_t *zl, uint64_t hash, const char *name,
2527    uint64_t integer_size, uint64_t num_integers, void *value)
2528{
2529	int rc;
2530	uint16_t *chunkp;
2531	struct zap_leaf_entry *le;
2532
2533	/*
2534	 * Make sure this leaf block matches our hash.
2535	 */
2536	if (zl->l_phys->l_hdr.lh_prefix_len > 0 &&
2537	    zl->l_phys->l_hdr.lh_prefix !=
2538	    hash >> (64 - zl->l_phys->l_hdr.lh_prefix_len))
2539		return (EIO);
2540
2541	rc = ENOENT;
2542	for (chunkp = LEAF_HASH_ENTPTR(zl, hash);
2543	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
2544		zap_leaf_chunk_t *zc;
2545		uint16_t chunk = *chunkp;
2546
2547		le = ZAP_LEAF_ENTRY(zl, chunk);
2548		if (le->le_hash != hash)
2549			continue;
2550		zc = &ZAP_LEAF_CHUNK(zl, chunk);
2551		if (fzap_name_equal(zl, zc, name)) {
2552			if (zc->l_entry.le_value_intlen > integer_size) {
2553				rc = EINVAL;
2554			} else {
2555				fzap_leaf_array(zl, zc, integer_size,
2556				    num_integers, value);
2557				rc = 0;
2558			}
2559			break;
2560		}
2561	}
2562	return (rc);
2563}
2564
2565/*
2566 * Lookup a name in a fatzap directory and return its value.
2567 */
2568static int
2569fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2570    const char *name, uint64_t integer_size, uint64_t num_integers,
2571    void *value)
2572{
2573	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2574	fat_zap_t z;
2575	zap_leaf_t *zl;
2576	uint64_t hash;
2577	int rc;
2578
2579	if (zh->zap_magic != ZAP_MAGIC)
2580		return (EIO);
2581
2582	if ((rc = fzap_check_size(integer_size, num_integers)) != 0)
2583		return (rc);
2584
2585	z.zap_block_shift = ilog2(bsize);
2586	z.zap_phys = zh;
2587	z.zap_spa = spa;
2588	z.zap_dnode = dnode;
2589
2590	hash = zap_hash(zh->zap_salt, name);
2591	rc = zap_deref_leaf(&z, hash, &zl);
2592	if (rc != 0)
2593		return (rc);
2594
2595	rc = zap_leaf_lookup(zl, hash, name, integer_size, num_integers, value);
2596
2597	zap_leaf_free(zl);
2598	return (rc);
2599}
2600
2601/*
2602 * Lookup a name in a zap object and return its value as a uint64_t.
2603 */
2604static int
2605zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
2606    uint64_t integer_size, uint64_t num_integers, void *value)
2607{
2608	int rc;
2609	zap_phys_t *zap;
2610	size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2611
2612	zap = malloc(size);
2613	if (zap == NULL)
2614		return (ENOMEM);
2615
2616	rc = dnode_read(spa, dnode, 0, zap, size);
2617	if (rc)
2618		goto done;
2619
2620	switch (zap->zap_block_type) {
2621	case ZBT_MICRO:
2622		rc = mzap_lookup((const mzap_phys_t *)zap, size, name, value);
2623		break;
2624	case ZBT_HEADER:
2625		rc = fzap_lookup(spa, dnode, zap, name, integer_size,
2626		    num_integers, value);
2627		break;
2628	default:
2629		printf("ZFS: invalid zap_type=%" PRIx64 "\n",
2630		    zap->zap_block_type);
2631		rc = EIO;
2632	}
2633done:
2634	free(zap);
2635	return (rc);
2636}
2637
2638/*
2639 * List a microzap directory.
2640 */
2641static int
2642mzap_list(const mzap_phys_t *mz, size_t size,
2643    int (*callback)(const char *, uint64_t))
2644{
2645	const mzap_ent_phys_t *mze;
2646	int chunks, i, rc;
2647
2648	/*
2649	 * Microzap objects use exactly one block; the caller has
2650	 * already read the whole thing into mz.
2651	 */
2652	rc = 0;
2653	chunks = size / MZAP_ENT_LEN - 1;
2654	for (i = 0; i < chunks; i++) {
2655		mze = &mz->mz_chunk[i];
2656		if (mze->mze_name[0]) {
2657			rc = callback(mze->mze_name, mze->mze_value);
2658			if (rc != 0)
2659				break;
2660		}
2661	}
2662
2663	return (rc);
2664}
2665
2666/*
2667 * List a fatzap directory.
2668 */
2669static int
2670fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2671    int (*callback)(const char *, uint64_t))
2672{
2673	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2674	fat_zap_t z;
2675	int i, j, rc;
2676
2677	if (zh->zap_magic != ZAP_MAGIC)
2678		return (EIO);
2679
2680	z.zap_block_shift = ilog2(bsize);
2681	z.zap_phys = zh;
2682
2683	/*
2684	 * This assumes that the leaf blocks start at block 1. The
2685	 * documentation isn't exactly clear on this.
2686	 */
2687	zap_leaf_t zl;
2688	zl.l_bs = z.zap_block_shift;
2689	zl.l_phys = malloc(bsize);
2690	if (zl.l_phys == NULL)
2691		return (ENOMEM);
2692
2693	for (i = 0; i < zh->zap_num_leafs; i++) {
2694		off_t off = ((off_t)(i + 1)) << zl.l_bs;
2695		char name[256], *p;
2696		uint64_t value;
2697
2698		if (dnode_read(spa, dnode, off, zl.l_phys, bsize)) {
2699			free(zl.l_phys);
2700			return (EIO);
2701		}
2702
2703		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
2704			zap_leaf_chunk_t *zc, *nc;
2705			int namelen;
2706
2707			zc = &ZAP_LEAF_CHUNK(&zl, j);
2708			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
2709				continue;
2710			namelen = zc->l_entry.le_name_numints;
2711			if (namelen > sizeof (name))
2712				namelen = sizeof (name);
2713
2714			/*
2715			 * Paste the name back together.
2716			 */
2717			nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
2718			p = name;
2719			while (namelen > 0) {
2720				int len;
2721				len = namelen;
2722				if (len > ZAP_LEAF_ARRAY_BYTES)
2723					len = ZAP_LEAF_ARRAY_BYTES;
2724				memcpy(p, nc->l_array.la_array, len);
2725				p += len;
2726				namelen -= len;
2727				nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
2728			}
2729
2730			/*
2731			 * Assume the first eight bytes of the value are
2732			 * a uint64_t.
2733			 */
2734			value = fzap_leaf_value(&zl, zc);
2735
2736			/* printf("%s 0x%jx\n", name, (uintmax_t)value); */
2737			rc = callback((const char *)name, value);
2738			if (rc != 0) {
2739				free(zl.l_phys);
2740				return (rc);
2741			}
2742		}
2743	}
2744
2745	free(zl.l_phys);
2746	return (0);
2747}
2748
2749static int
zfs_printf(const char *name, uint64_t value __unused)
2750{
2751
2752	printf("%s\n", name);
2753
2754	return (0);
2755}
2756
2757/*
2758 * List a zap directory.
2759 */
2760static int
2761zap_list(const spa_t *spa, const dnode_phys_t *dnode)
2762{
2763	zap_phys_t *zap;
2764	size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2765	int rc;
2766
2767	zap = malloc(size);
2768	if (zap == NULL)
2769		return (ENOMEM);
2770
2771	rc = dnode_read(spa, dnode, 0, zap, size);
2772	if (rc == 0) {
2773		if (zap->zap_block_type == ZBT_MICRO)
2774			rc = mzap_list((const mzap_phys_t *)zap, size,
2775			    zfs_printf);
2776		else
2777			rc = fzap_list(spa, dnode, zap, zfs_printf);
2778	}
2779	free(zap);
2780	return (rc);
2781}
2782
2783static int
2784objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum,
2785    dnode_phys_t *dnode)
2786{
2787	off_t offset;
2788
2789	offset = objnum * sizeof (dnode_phys_t);
2790	return (dnode_read(spa, &os->os_meta_dnode, offset,
2791	    dnode, sizeof (dnode_phys_t)));
2792}
2793
2794/*
2795 * Reverse-lookup a value in a microzap directory, returning its name.
2796 */
2797static int
2798mzap_rlookup(const mzap_phys_t *mz, size_t size, char *name, uint64_t value)
2799{
2800	const mzap_ent_phys_t *mze;
2801	int chunks, i;
2802
2803	/*
2804	 * Microzap objects use exactly one block; the caller has
2805	 * already read the whole thing into mz.
2806	 */
2807	chunks = size / MZAP_ENT_LEN - 1;
2808	for (i = 0; i < chunks; i++) {
2809		mze = &mz->mz_chunk[i];
2810		if (value == mze->mze_value) {
2811			strcpy(name, mze->mze_name);
2812			return (0);
2813		}
2814	}
2815
2816	return (ENOENT);
2817}
2818
2819static void
2820fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
2821{
2822	size_t namelen;
2823	const zap_leaf_chunk_t *nc;
2824	char *p;
2825
2826	namelen = zc->l_entry.le_name_numints;
2827
2828	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
2829	p = name;
2830	while (namelen > 0) {
2831		size_t len;
2832		len = namelen;
2833		if (len > ZAP_LEAF_ARRAY_BYTES)
2834			len = ZAP_LEAF_ARRAY_BYTES;
2835		memcpy(p, nc->l_array.la_array, len);
2836		p += len;
2837		namelen -= len;
2838		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
2839	}
2840
2841	*p = '\0';
2842}
2843
2844static int
2845fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
2846    char *name, uint64_t value)
2847{
2848	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2849	fat_zap_t z;
2850	uint64_t i;
2851	int j, rc;
2852
2853	if (zh->zap_magic != ZAP_MAGIC)
2854		return (EIO);
2855
2856	z.zap_block_shift = ilog2(bsize);
2857	z.zap_phys = zh;
2858
2859	/*
2860	 * This assumes that the leaf blocks start at block 1. The
2861	 * documentation isn't exactly clear on this.
2862	 */
2863	zap_leaf_t zl;
2864	zl.l_bs = z.zap_block_shift;
2865	zl.l_phys = malloc(bsize);
2866	if (zl.l_phys == NULL)
2867		return (ENOMEM);
2868
2869	for (i = 0; i < zh->zap_num_leafs; i++) {
2870		off_t off = ((off_t)(i + 1)) << zl.l_bs;
2871
2872		rc = dnode_read(spa, dnode, off, zl.l_phys, bsize);
2873		if (rc != 0)
2874			goto done;
2875
2876		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
2877			zap_leaf_chunk_t *zc;
2878
2879			zc = &ZAP_LEAF_CHUNK(&zl, j);
2880			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
2881				continue;
2882			if (zc->l_entry.le_value_intlen != 8 ||
2883			    zc->l_entry.le_value_numints != 1)
2884				continue;
2885
2886			if (fzap_leaf_value(&zl, zc) == value) {
2887				fzap_name_copy(&zl, zc, name);
2888				goto done;
2889			}
2890		}
2891	}
2892
2893	rc = ENOENT;
2894done:
2895	free(zl.l_phys);
2896	return (rc);
2897}
2898
2899static int
2900zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name,
2901    uint64_t value)
2902{
2903	zap_phys_t *zap;
2904	size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
2905	int rc;
2906
2907	zap = malloc(size);
2908	if (zap == NULL)
2909		return (ENOMEM);
2910
2911	rc = dnode_read(spa, dnode, 0, zap, size);
2912	if (rc == 0) {
2913		if (zap->zap_block_type == ZBT_MICRO)
2914			rc = mzap_rlookup((const mzap_phys_t *)zap, size,
2915			    name, value);
2916		else
2917			rc = fzap_rlookup(spa, dnode, zap, name, value);
2918	}
2919	free(zap);
2920	return (rc);
2921}
2922
2923static int
2924zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
2925{
2926	char name[256];
2927	char component[256];
2928	uint64_t dir_obj, parent_obj, child_dir_zapobj;
2929	dnode_phys_t child_dir_zap, dataset, dir, parent;
2930	dsl_dir_phys_t *dd;
2931	dsl_dataset_phys_t *ds;
2932	char *p;
2933	int len;
2934
2935	p = &name[sizeof (name) - 1];
2936	*p = '\0';
2937
2938	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2939		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2940		return (EIO);
2941	}
2942	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
2943	dir_obj = ds->ds_dir_obj;
2944
2945	for (;;) {
2946		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
2947			return (EIO);
2948		dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2949
2950		/* Actual loop condition. */
2951		parent_obj = dd->dd_parent_obj;
2952		if (parent_obj == 0)
2953			break;
2954
2955		if (objset_get_dnode(spa, &spa->spa_mos, parent_obj,
2956		    &parent) != 0)
2957			return (EIO);
2958		dd = (dsl_dir_phys_t *)&parent.dn_bonus;
2959		child_dir_zapobj = dd->dd_child_dir_zapobj;
2960		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
2961		    &child_dir_zap) != 0)
2962			return (EIO);
2963		if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
2964			return (EIO);
2965
2966		len = strlen(component);
2967		p -= len;
2968		memcpy(p, component, len);
2969		--p;
2970		*p = '/';
2971
2972		/* Actual loop iteration. */
2973		dir_obj = parent_obj;
2974	}
2975
2976	if (*p != '\0')
2977		++p;
2978	strcpy(result, p);
2979
2980	return (0);
2981}
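/*
 * Name assembly in zfs_rlookup() runs right to left: for a
 * hypothetical dataset "tank/var/log" the loop first writes "log",
 * then prepends "/var", and stops at the dsl_dir whose
 * dd_parent_obj == 0 (the pool root), so the result copied out is
 * the pool-relative path "var/log".
 */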
2982
2983static int
2984zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
2985{
2986	char element[256];
2987	uint64_t dir_obj, child_dir_zapobj;
2988	dnode_phys_t child_dir_zap, dir;
2989	dsl_dir_phys_t *dd;
2990	const char *p, *q;
2991
2992	if (objset_get_dnode(spa, &spa->spa_mos,
2993	    DMU_POOL_DIRECTORY_OBJECT, &dir))
2994		return (EIO);
2995	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
2996	    1, &dir_obj))
2997		return (EIO);
2998
2999	p = name;
3000	for (;;) {
3001		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
3002			return (EIO);
3003		dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3004
3005		while (*p == '/')
3006			p++;
3007		/* Actual loop condition #1. */
3008		if (*p == '\0')
3009			break;
3010
3011		q = strchr(p, '/');
3012		if (q) {
3013			memcpy(element, p, q - p);
3014			element[q - p] = '\0';
3015			p = q + 1;
3016		} else {
3017			strcpy(element, p);
3018			p += strlen(p);
3019		}
3020
3021		child_dir_zapobj = dd->dd_child_dir_zapobj;
3022		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
3023		    &child_dir_zap) != 0)
3024			return (EIO);
3025
3026		/* Actual loop condition #2. */
3027		if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
3028		    1, &dir_obj) != 0)
3029			return (ENOENT);
3030	}
3031
3032	*objnum = dd->dd_head_dataset_obj;
3033	return (0);
3034}
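/*
 * Example traversal (hypothetical name): looking up "ROOT/default"
 * starts at the root dsl_dir, resolves the element "ROOT" through its
 * child-directory ZAP, descends, resolves "default" the same way, and
 * returns that dsl_dir's dd_head_dataset_obj.  Note the name is
 * pool-relative, matching what zfs_rlookup() produces.
 */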
3035
3036#pragma GCC diagnostic ignored "-Wstrict-aliasing"
3037static int
3038zfs_list_dataset(const spa_t *spa, uint64_t objnum)
3039{
3040	uint64_t dir_obj, child_dir_zapobj;
3041	dnode_phys_t child_dir_zap, dir, dataset;
3042	dsl_dataset_phys_t *ds;
3043	dsl_dir_phys_t *dd;
3044
3045	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
3046		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3047		return (EIO);
3048	}
3049	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3050	dir_obj = ds->ds_dir_obj;
3051
3052	if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
3053		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
3054		return (EIO);
3055	}
3056	dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3057
3058	child_dir_zapobj = dd->dd_child_dir_zapobj;
3059	if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
3060	    &child_dir_zap) != 0) {
3061		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
3062		return (EIO);
3063	}
3064
3065	return (zap_list(spa, &child_dir_zap));
3066}
3067
3068int
3069zfs_callback_dataset(const spa_t *spa, uint64_t objnum,
3070    int (*callback)(const char *, uint64_t))
3071{
3072	uint64_t dir_obj, child_dir_zapobj;
3073	dnode_phys_t child_dir_zap, dir, dataset;
3074	dsl_dataset_phys_t *ds;
3075	dsl_dir_phys_t *dd;
3076	zap_phys_t *zap;
3077	size_t size;
3078	int err;
3079
3080	err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
3081	if (err != 0) {
3082		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3083		return (err);
3084	}
3085	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3086	dir_obj = ds->ds_dir_obj;
3087
3088	err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
3089	if (err != 0) {
3090		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
3091		return (err);
3092	}
3093	dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3094
3095	child_dir_zapobj = dd->dd_child_dir_zapobj;
3096	err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj,
3097	    &child_dir_zap);
3098	if (err != 0) {
3099		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
3100		return (err);
3101	}
3102
3103	size = child_dir_zap.dn_datablkszsec << SPA_MINBLOCKSHIFT;
3104	zap = malloc(size);
3105	if (zap != NULL) {
3106		err = dnode_read(spa, &child_dir_zap, 0, zap, size);
3107		if (err != 0)
3108			goto done;
3109
3110		if (zap->zap_block_type == ZBT_MICRO)
3111			err = mzap_list((const mzap_phys_t *)zap, size,
3112			    callback);
3113		else
3114			err = fzap_list(spa, &child_dir_zap, zap, callback);
3115	} else {
3116		err = ENOMEM;
3117	}
3118done:
3119	free(zap);
3120	return (err);
3121}
3122
3123/*
3124 * Find the object set given the object number of its dataset object
3125 * and return its details in *objset
3126 */
3127static int
3128zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
3129{
3130	dnode_phys_t dataset;
3131	dsl_dataset_phys_t *ds;
3132
3133	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
3134		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
3135		return (EIO);
3136	}
3137
3138	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
3139	if (zio_read(spa, &ds->ds_bp, objset)) {
3140		printf("ZFS: can't read object set for dataset %ju\n",
3141		    (uintmax_t)objnum);
3142		return (EIO);
3143	}
3144
3145	return (0);
3146}
3147
3148/*
3149 * Find the object set pointed to by the BOOTFS property, or the root
3150 * dataset if there is none, and return its object number in *objid
3151 */
3152static int
3153zfs_get_root(const spa_t *spa, uint64_t *objid)
3154{
3155	dnode_phys_t dir, propdir;
3156	uint64_t props, bootfs, root;
3157
3158	*objid = 0;
3159
3160	/*
3161	 * Start with the MOS directory object.
3162	 */
3163	if (objset_get_dnode(spa, &spa->spa_mos,
3164	    DMU_POOL_DIRECTORY_OBJECT, &dir)) {
3165		printf("ZFS: can't read MOS object directory\n");
3166		return (EIO);
3167	}
3168
3169	/*
3170	 * Lookup the pool_props and see if we can find a bootfs.
3171	 */
3172	if (zap_lookup(spa, &dir, DMU_POOL_PROPS,
3173	    sizeof (props), 1, &props) == 0 &&
3174	    objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 &&
3175	    zap_lookup(spa, &propdir, "bootfs",
3176	    sizeof (bootfs), 1, &bootfs) == 0 && bootfs != 0) {
3177		*objid = bootfs;
3178		return (0);
3179	}
3180	/*
3181	 * Lookup the root dataset directory
3182	 */
3183	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET,
3184	    sizeof (root), 1, &root) ||
3185	    objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
3186		printf("ZFS: can't find root dsl_dir\n");
3187		return (EIO);
3188	}
3189
3190	/*
3191	 * Use the information from the dataset directory's bonus buffer
3192	 * to find the dataset object and from that the object set itself.
3193	 */
3194	dsl_dir_phys_t *dd = (dsl_dir_phys_t *)&dir.dn_bonus;
3195	*objid = dd->dd_head_dataset_obj;
3196	return (0);
3197}
3198
3199static int
3200zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mnt)
3201{
3202
3203	mnt->spa = spa;
3204
3205	/*
3206	 * Find the root object set if not explicitly provided
3207	 */
3208	if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
3209		printf("ZFS: can't find root filesystem\n");
3210		return (EIO);
3211	}
3212
3213	if (zfs_mount_dataset(spa, rootobj, &mnt->objset)) {
3214		printf("ZFS: can't open root filesystem\n");
3215		return (EIO);
3216	}
3217
3218	mnt->rootobj = rootobj;
3219
3220	return (0);
3221}
3222
3223/*
3224 * callback function for feature name checks.
3225 */
3226static int
3227check_feature(const char *name, uint64_t value)
3228{
3229	int i;
3230
3231	if (value == 0)
3232		return (0);
3233	if (name[0] == '\0')
3234		return (0);
3235
3236	for (i = 0; features_for_read[i] != NULL; i++) {
3237		if (strcmp(name, features_for_read[i]) == 0)
3238			return (0);
3239	}
3240	printf("ZFS: unsupported feature: %s\n", name);
3241	return (EIO);
3242}
3243
3244/*
3245 * Checks whether the MOS features that are active are supported.
3246 */
3247static int
3248check_mos_features(const spa_t *spa)
3249{
3250	dnode_phys_t dir;
3251	zap_phys_t *zap;
3252	uint64_t objnum;
3253	size_t size;
3254	int rc;
3255
3256	if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
3257	    &dir)) != 0)
3258		return (rc);
3259	if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
3260	    sizeof (objnum), 1, &objnum)) != 0) {
3261		/*
3262		 * It is an older pool without features. As we have already
3263		 * tested the label, just return without raising the error.
3264		 */
3265		if (rc == ENOENT)
3266			rc = 0;
3267		return (rc);
3268	}
3269
3270	if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
3271		return (rc);
3272
3273	if (dir.dn_type != DMU_OTN_ZAP_METADATA)
3274		return (EIO);
3275
3276	size = dir.dn_datablkszsec << SPA_MINBLOCKSHIFT;
3277	zap = malloc(size);
3278	if (zap == NULL)
3279		return (ENOMEM);
3280
3281	if (dnode_read(spa, &dir, 0, zap, size)) {
3282		free(zap);
3283		return (EIO);
3284	}
3285
3286	if (zap->zap_block_type == ZBT_MICRO)
3287		rc = mzap_list((const mzap_phys_t *)zap, size, check_feature);
3288	else
3289		rc = fzap_list(spa, &dir, zap, check_feature);
3290
3291	free(zap);
3292	return (rc);
3293}
3294
3295static int
3296load_nvlist(spa_t *spa, uint64_t obj, unsigned char **value)
3297{
3298	dnode_phys_t dir;
3299	size_t size;
3300	int rc;
3301	unsigned char *nv;
3302
3303	*value = NULL;
3304	if ((rc = objset_get_dnode(spa, &spa->spa_mos, obj, &dir)) != 0)
3305		return (rc);
3306	if (dir.dn_type != DMU_OT_PACKED_NVLIST ||
3307	    dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) {
3308		return (EIO);
3309	}
3310
3311	if (dir.dn_bonuslen != sizeof (uint64_t))
3312		return (EIO);
3313
3314	size = *(uint64_t *)DN_BONUS(&dir);
3315	nv = malloc(size);
3316	if (nv == NULL)
3317		return (ENOMEM);
3318
3319	rc = dnode_read(spa, &dir, 0, nv, size);
3320	if (rc != 0) {
3321		free(nv);
3322		nv = NULL;
3323		return (rc);
3324	}
3325	*value = nv;
3326	return (rc);
3327}
3328
3329static int
3330zfs_spa_init(spa_t *spa)
3331{
3332	dnode_phys_t dir;
3333	uint64_t config_object;
3334	unsigned char *nvlist;
3335	int rc;
3336
3337	if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
3338		printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
3339		return (EIO);
3340	}
3341	if (spa->spa_mos.os_type != DMU_OST_META) {
3342		printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
3343		return (EIO);
3344	}
3345
3346	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
3347	    &dir)) {
3348		printf("ZFS: failed to read pool %s directory object\n",
3349		    spa->spa_name);
3350		return (EIO);
3351	}
3352	/* This is allowed to fail; older pools do not have a salt. */
3353	rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
3354	    sizeof (spa->spa_cksum_salt.zcs_bytes),
3355	    spa->spa_cksum_salt.zcs_bytes);
3356
3357	rc = check_mos_features(spa);
3358	if (rc != 0) {
3359		printf("ZFS: pool %s is not supported\n", spa->spa_name);
3360		return (rc);
3361	}
3362
3363	rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG,
3364	    sizeof (config_object), 1, &config_object);
3365	if (rc != 0) {
3366		printf("ZFS: cannot read MOS %s\n", DMU_POOL_CONFIG);
3367		return (EIO);
3368	}
3369	rc = load_nvlist(spa, config_object, &nvlist);
3370	if (rc != 0)
3371		return (rc);
3372
3373	/*
3374	 * Update vdevs from MOS config. Note that we skip the encoding
3375	 * bytes here. See also vdev_label_read_config().
3376	 */
3377	rc = vdev_init_from_nvlist(spa, nvlist + 4);
3378	free(nvlist);
3379	return (rc);
3380}
3381
3382static int
3383zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
3384{
3385
3386	if (dn->dn_bonustype != DMU_OT_SA) {
3387		znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
3388
3389		sb->st_mode = zp->zp_mode;
3390		sb->st_uid = zp->zp_uid;
3391		sb->st_gid = zp->zp_gid;
3392		sb->st_size = zp->zp_size;
3393	} else {
3394		sa_hdr_phys_t *sahdrp;
3395		int hdrsize;
3396		size_t size = 0;
3397		void *buf = NULL;
3398
3399		if (dn->dn_bonuslen != 0) {
3400			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
3401		} else {
3402			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
3403				blkptr_t *bp = DN_SPILL_BLKPTR(dn);
3404				int error;
3405
3406				size = BP_GET_LSIZE(bp);
3407				buf = malloc(size);
3408				if (buf == NULL)
3409					error = ENOMEM;
3410				else
3411					error = zio_read(spa, bp, buf);
3412
3413				if (error != 0) {
3414					free(buf);
3415					return (error);
3416				}
3417				sahdrp = buf;
3418			} else {
3419				return (EIO);
3420			}
3421		}
3422		hdrsize = SA_HDR_SIZE(sahdrp);
3423		sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
3424		    SA_MODE_OFFSET);
3425		sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
3426		    SA_UID_OFFSET);
3427		sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
3428		    SA_GID_OFFSET);
3429		sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
3430		    SA_SIZE_OFFSET);
3431		free(buf);
3432	}
3433
3434	return (0);
3435}
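/*
 * Layout assumed by the SA branch above: the bonus buffer (or spill
 * block) begins with a sa_hdr_phys_t whose length SA_HDR_SIZE()
 * decodes from the header itself, and this reader assumes the
 * standard attribute layout so that mode, uid, gid and size sit at
 * the fixed SA_*_OFFSET byte offsets past the header.  The same
 * spill-block handling appears again in zfs_dnode_readlink() below.
 */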
3436
3437static int
3438zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
3439{
3440	int rc = 0;
3441
3442	if (dn->dn_bonustype == DMU_OT_SA) {
3443		sa_hdr_phys_t *sahdrp = NULL;
3444		size_t size = 0;
3445		void *buf = NULL;
3446		int hdrsize;
3447		char *p;
3448
3449		if (dn->dn_bonuslen != 0) {
3450			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
3451		} else {
3452			blkptr_t *bp;
3453
3454			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
3455				return (EIO);
3456			bp = DN_SPILL_BLKPTR(dn);
3457
3458			size = BP_GET_LSIZE(bp);
3459			buf = malloc(size);
3460			if (buf == NULL)
3461				rc = ENOMEM;
3462			else
3463				rc = zio_read(spa, bp, buf);
3464			if (rc != 0) {
3465				free(buf);
3466				return (rc);
3467			}
3468			sahdrp = buf;
3469		}
3470		hdrsize = SA_HDR_SIZE(sahdrp);
3471		p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
3472		memcpy(path, p, psize);
3473		free(buf);
3474		return (0);
3475	}
3476	/*
3477	 * The second test is purely to silence a bogus compiler
3478	 * warning about accessing past the end of dn_bonus.
3479	 */
3480	if (psize + sizeof (znode_phys_t) <= dn->dn_bonuslen &&
3481	    sizeof (znode_phys_t) <= sizeof (dn->dn_bonus)) {
3482		memcpy(path, &dn->dn_bonus[sizeof (znode_phys_t)], psize);
3483	} else {
3484		rc = dnode_read(spa, dn, 0, path, psize);
3485	}
3486	return (rc);
3487}
3488
3489struct obj_list {
3490	uint64_t		objnum;
3491	STAILQ_ENTRY(obj_list)	entry;
3492};
3493
3494/*
3495 * Lookup a file and return its dnode.
3496 */
3497static int
3498zfs_lookup(const struct zfsmount *mnt, const char *upath, dnode_phys_t *dnode)
3499{
3500	int rc;
3501	uint64_t objnum;
3502	const spa_t *spa;
3503	dnode_phys_t dn;
3504	const char *p, *q;
3505	char element[256];
3506	char path[1024];
3507	int symlinks_followed = 0;
3508	struct stat sb;
3509	struct obj_list *entry, *tentry;
3510	STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
3511
3512	spa = mnt->spa;
3513	if (mnt->objset.os_type != DMU_OST_ZFS) {
3514		printf("ZFS: unexpected object set type %ju\n",
3515		    (uintmax_t)mnt->objset.os_type);
3516		return (EIO);
3517	}
3518
3519	if ((entry = malloc(sizeof (struct obj_list))) == NULL)
3520		return (ENOMEM);
3521
3522	/*
3523	 * Get the root directory dnode.
3524	 */
3525	rc = objset_get_dnode(spa, &mnt->objset, MASTER_NODE_OBJ, &dn);
3526	if (rc) {
3527		free(entry);
3528		return (rc);
3529	}
3530
3531	rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum);
3532	if (rc) {
3533		free(entry);
3534		return (rc);
3535	}
3536	entry->objnum = objnum;
3537	STAILQ_INSERT_HEAD(&on_cache, entry, entry);
3538
3539	rc = objset_get_dnode(spa, &mnt->objset, objnum, &dn);
3540	if (rc != 0)
3541		goto done;
3542
3543	p = upath;
3544	while (p && *p) {
3545		rc = objset_get_dnode(spa, &mnt->objset, objnum, &dn);
3546		if (rc != 0)
3547			goto done;
3548
3549		while (*p == '/')
3550			p++;
3551		if (*p == '\0')
3552			break;
3553		q = p;
3554		while (*q != '\0' && *q != '/')
3555			q++;
3556
3557		/* skip dot */
3558		if (p + 1 == q && p[0] == '.') {
3559			p++;
3560			continue;
3561		}
3562		/* double dot */
3563		if (p + 2 == q && p[0] == '.' && p[1] == '.') {
3564			p += 2;
3565			if (STAILQ_FIRST(&on_cache) ==
3566			    STAILQ_LAST(&on_cache, obj_list, entry)) {
3567				rc = ENOENT;
3568				goto done;
3569			}
3570			entry = STAILQ_FIRST(&on_cache);
3571			STAILQ_REMOVE_HEAD(&on_cache, entry);
3572			free(entry);
3573			objnum = (STAILQ_FIRST(&on_cache))->objnum;
3574			continue;
3575		}
3576		if (q - p + 1 > sizeof (element)) {
3577			rc = ENAMETOOLONG;
3578			goto done;
3579		}
3580		memcpy(element, p, q - p);
3581		element[q - p] = 0;
3582		p = q;
3583
3584		if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
3585			goto done;
3586		if (!S_ISDIR(sb.st_mode)) {
3587			rc = ENOTDIR;
3588			goto done;
3589		}
3590
3591		rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
3592		if (rc)
3593			goto done;
3594		objnum = ZFS_DIRENT_OBJ(objnum);
3595
3596		if ((entry = malloc(sizeof (struct obj_list))) == NULL) {
3597			rc = ENOMEM;
3598			goto done;
3599		}
3600		entry->objnum = objnum;
3601		STAILQ_INSERT_HEAD(&on_cache, entry, entry);
3602		rc = objset_get_dnode(spa, &mnt->objset, objnum, &dn);
3603		if (rc)
3604			goto done;
3605
3606		/*
3607		 * Check for symlink.
3608		 */
3609		rc = zfs_dnode_stat(spa, &dn, &sb);
3610		if (rc)
3611			goto done;
3612		if (S_ISLNK(sb.st_mode)) {
3613			if (symlinks_followed > 10) {
3614				rc = EMLINK;
3615				goto done;
3616			}
3617			symlinks_followed++;
3618
3619			/*
3620			 * Read the link value and copy the tail of our
3621			 * current path onto the end.
3622			 */
3623			if (sb.st_size + strlen(p) + 1 > sizeof (path)) {
3624				rc = ENAMETOOLONG;
3625				goto done;
3626			}
3627			strcpy(&path[sb.st_size], p);
3628
3629			rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
3630			if (rc != 0)
3631				goto done;
3632
3633			/*
3634			 * Restart with the new path, starting either at
3635			 * the root or at the parent depending on whether or
3636			 * not the link is relative.
3637			 */
3638			p = path;
3639			if (*p == '/') {
3640				while (STAILQ_FIRST(&on_cache) !=
3641				    STAILQ_LAST(&on_cache, obj_list, entry)) {
3642					entry = STAILQ_FIRST(&on_cache);
3643					STAILQ_REMOVE_HEAD(&on_cache, entry);
3644					free(entry);
3645				}
3646			} else {
3647				entry = STAILQ_FIRST(&on_cache);
3648				STAILQ_REMOVE_HEAD(&on_cache, entry);
3649				free(entry);
3650			}
3651			objnum = (STAILQ_FIRST(&on_cache))->objnum;
3652		}
3653	}
3654
3655	*dnode = dn;
3656done:
3657	STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
3658		free(entry);
3659	return (rc);
3660}
3661