1c023f65Toomas Soome/*
2199767fToomas Soome * Copyright (c) 2007 Doug Rabson
3199767fToomas Soome * All rights reserved.
4199767fToomas Soome *
5199767fToomas Soome * Redistribution and use in source and binary forms, with or without
6199767fToomas Soome * modification, are permitted provided that the following conditions
7199767fToomas Soome * are met:
8199767fToomas Soome * 1. Redistributions of source code must retain the above copyright
9199767fToomas Soome *    notice, this list of conditions and the following disclaimer.
10199767fToomas Soome * 2. Redistributions in binary form must reproduce the above copyright
11199767fToomas Soome *    notice, this list of conditions and the following disclaimer in the
12199767fToomas Soome *    documentation and/or other materials provided with the distribution.
13199767fToomas Soome *
24199767fToomas Soome * SUCH DAMAGE.
25199767fToomas Soome */
26199767fToomas Soome
27199767fToomas Soome#include <sys/cdefs.h>
28199767fToomas Soome
29199767fToomas Soome/*
30199767fToomas Soome *	Stand-alone ZFS file reader.
31199767fToomas Soome */
32199767fToomas Soome
3313a6e30Toomas Soome#include <sys/endian.h>
34199767fToomas Soome#include <sys/stat.h>
35199767fToomas Soome#include <sys/stdint.h>
36c023f65Toomas Soome#include <sys/list.h>
37c023f65Toomas Soome#include <inttypes.h>
38199767fToomas Soome
39199767fToomas Soome#include "zfsimpl.h"
40199767fToomas Soome#include "zfssubr.c"
41199767fToomas Soome
42199767fToomas Soome
43199767fToomas Soomestruct zfsmount {
44199767fToomas Soome	const spa_t	*spa;
45199767fToomas Soome	objset_phys_t	objset;
46199767fToomas Soome	uint64_t	rootobj;
47199767fToomas Soome};
48199767fToomas Soome
49199767fToomas Soome/*
50c023f65Toomas Soome * The indirect_child_t represents the vdev that we will read from, when we
51c023f65Toomas Soome * need to read all copies of the data (e.g. for scrub or reconstruction).
52c023f65Toomas Soome * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
53c023f65Toomas Soome * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
54c023f65Toomas Soome * ic_vdev is a child of the mirror.
55c023f65Toomas Soome */
56c023f65Toomas Soometypedef struct indirect_child {
57c023f65Toomas Soome	void *ic_data;
58c023f65Toomas Soome	vdev_t *ic_vdev;
59c023f65Toomas Soome} indirect_child_t;
60c023f65Toomas Soome
61c023f65Toomas Soome/*
62c023f65Toomas Soome * The indirect_split_t represents one mapped segment of an i/o to the
63c023f65Toomas Soome * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
64c023f65Toomas Soome * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
65c023f65Toomas Soome * For split blocks, there will be several of these.
66c023f65Toomas Soome */
67c023f65Toomas Soometypedef struct indirect_split {
68c023f65Toomas Soome	list_node_t is_node; /* link on iv_splits */
69c023f65Toomas Soome
70c023f65Toomas Soome	/*
71c023f65Toomas Soome	 * is_split_offset is the offset into the i/o.
72c023f65Toomas Soome	 * This is the sum of the previous splits' is_size's.
73c023f65Toomas Soome	 */
74c023f65Toomas Soome	uint64_t is_split_offset;
75c023f65Toomas Soome
76c023f65Toomas Soome	vdev_t *is_vdev; /* top-level vdev */
77c023f65Toomas Soome	uint64_t is_target_offset; /* offset on is_vdev */
78c023f65Toomas Soome	uint64_t is_size;
79c023f65Toomas Soome	int is_children; /* number of entries in is_child[] */
80c023f65Toomas Soome
81c023f65Toomas Soome	/*
82c023f65Toomas Soome	 * is_good_child is the child that we are currently using to
83c023f65Toomas Soome	 * attempt reconstruction.
84c023f65Toomas Soome	 */
85c023f65Toomas Soome	int is_good_child;
86c023f65Toomas Soome
87c023f65Toomas Soome	indirect_child_t is_child[1]; /* variable-length */
88c023f65Toomas Soome} indirect_split_t;
89c023f65Toomas Soome
90c023f65Toomas Soome/*
91c023f65Toomas Soome * The indirect_vsd_t is associated with each i/o to the indirect vdev.
92c023f65Toomas Soome * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
93c023f65Toomas Soome */
94c023f65Toomas Soometypedef struct indirect_vsd {
95c023f65Toomas Soome	boolean_t iv_split_block;
96c023f65Toomas Soome	boolean_t iv_reconstruct;
97c023f65Toomas Soome
98c023f65Toomas Soome	list_t iv_splits; /* list of indirect_split_t's */
99c023f65Toomas Soome} indirect_vsd_t;
100c023f65Toomas Soome
101c023f65Toomas Soome/*
102199767fToomas Soome * List of all vdevs, chained through v_alllink.
103199767fToomas Soome */
104199767fToomas Soomestatic vdev_list_t zfs_vdevs;
105199767fToomas Soome
1066fd7fa3Toomas Soome/*
107199767fToomas Soome * List of ZFS features supported for read
108199767fToomas Soome */
/* NULL-terminated; scanned linearly by nvlist_check_features_for_read(). */
static const char *features_for_read[] = {
	/* org.illumos / org.open-zfs / org.zfsonlinux */
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	"org.illumos:skein",
	"org.illumos:edonr",
	"org.zfsonlinux:large_dnode",
	"com.joyent:multi_vdev_crash_dump",
	"com.delphix:spacemap_histogram",
	"com.delphix:zpool_checkpoint",
	"com.delphix:spacemap_v2",
	"com.datto:encryption",
	"com.datto:bookmark_v2",
	"org.zfsonlinux:allocation_classes",
	"com.datto:resilver_defer",
	"com.delphix:device_removal",
	"com.delphix:obsolete_counts",
	NULL	/* end marker */
};
131199767fToomas Soome
132199767fToomas Soome/*
133199767fToomas Soome * List of all pools, chained through spa_link.
134199767fToomas Soome */
135199767fToomas Soomestatic spa_list_t zfs_pools;
136199767fToomas Soome
137edb3504Toomas Soomestatic const dnode_phys_t *dnode_cache_obj;
138199767fToomas Soomestatic uint64_t dnode_cache_bn;
139199767fToomas Soomestatic char *dnode_cache_buf;
140199767fToomas Soome
141199767fToomas Soomestatic int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
142199767fToomas Soomestatic int zfs_get_root(const spa_t *spa, uint64_t *objid);
143199767fToomas Soomestatic int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
1444a04e8dToomas Soomestatic int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
1454a04e8dToomas Soome    const char *name, uint64_t integer_size, uint64_t num_integers,
1464a04e8dToomas Soome    void *value);
147c023f65Toomas Soomestatic int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
148c023f65Toomas Soome    dnode_phys_t *);
149c023f65Toomas Soomestatic int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
150c023f65Toomas Soome    size_t);
151c023f65Toomas Soomestatic int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
152c023f65Toomas Soome    size_t);
153c023f65Toomas Soomestatic int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t,
154c023f65Toomas Soome    size_t);
155199767fToomas Soome
156199767fToomas Soomestatic void
157199767fToomas Soomezfs_init(void)
158199767fToomas Soome{
159199767fToomas Soome	STAILQ_INIT(&zfs_vdevs);
160199767fToomas Soome	STAILQ_INIT(&zfs_pools);
161199767fToomas Soome
162199767fToomas Soome	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
163199767fToomas Soome
164199767fToomas Soome	zfs_init_crc();
165199767fToomas Soome}
166199767fToomas Soome
/*
 * Decode the 32-bit big-endian value at *xdr into *ip and step *xdr
 * past the four bytes consumed.  Always returns 0; the input is not
 * bounds-checked.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *src = *xdr;

	*ip = be32dec(src);
	*xdr = src + 4;
	return (0);
}
174199767fToomas Soome
175199767fToomas Soomestatic int
1766fd7fa3Toomas Soomexdr_u_int(const unsigned char **xdr, uint_t *ip)
177199767fToomas Soome{
17813a6e30Toomas Soome	*ip = be32dec(*xdr);
179199767fToomas Soome	(*xdr) += 4;
180199767fToomas Soome	return (0);
181199767fToomas Soome}
182199767fToomas Soome
183199767fToomas Soomestatic int
184199767fToomas Soomexdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
185199767fToomas Soome{
1866fd7fa3Toomas Soome	uint_t hi, lo;
187199767fToomas Soome
188199767fToomas Soome	xdr_u_int(xdr, &hi);
189199767fToomas Soome	xdr_u_int(xdr, &lo);
1906fd7fa3Toomas Soome	*lp = (((uint64_t)hi) << 32) | lo;
191199767fToomas Soome	return (0);
192199767fToomas Soome}
193199767fToomas Soome
194199767fToomas Soomestatic int
195199767fToomas Soomenvlist_find(const unsigned char *nvlist, const char *name, int type,
1966fd7fa3Toomas Soome    int *elementsp, void *valuep)
197199767fToomas Soome{
198199767fToomas Soome	const unsigned char *p, *pair;
199199767fToomas Soome	int junk;
200199767fToomas Soome	int encoded_size, decoded_size;
201199767fToomas Soome
202199767fToomas Soome	p = nvlist;
203199767fToomas Soome	xdr_int(&p, &junk);
204199767fToomas Soome	xdr_int(&p, &junk);
205199767fToomas Soome
206199767fToomas Soome	pair = p;
207199767fToomas Soome	xdr_int(&p, &encoded_size);
208199767fToomas Soome	xdr_int(&p, &decoded_size);
209199767fToomas Soome	while (encoded_size && decoded_size) {
210199767fToomas Soome		int namelen, pairtype, elements;
211199767fToomas Soome		const char *pairname;
212199767fToomas Soome
213199767fToomas Soome		xdr_int(&p, &namelen);
2146fd7fa3Toomas Soome		pairname = (const char *)p;
215199767fToomas Soome		p += roundup(namelen, 4);
216199767fToomas Soome		xdr_int(&p, &pairtype);
217199767fToomas Soome
2186fd7fa3Toomas Soome		if (memcmp(name, pairname, namelen) == 0 && type == pairtype) {
219199767fToomas Soome			xdr_int(&p, &elements);
220199767fToomas Soome			if (elementsp)
221199767fToomas Soome				*elementsp = elements;
222199767fToomas Soome			if (type == DATA_TYPE_UINT64) {
2236fd7fa3Toomas Soome				xdr_uint64_t(&p, (uint64_t *)valuep);
224199767fToomas Soome				return (0);
225199767fToomas Soome			} else if (type == DATA_TYPE_STRING) {
226199767fToomas Soome				int len;
227199767fToomas Soome				xdr_int(&p, &len);
2286fd7fa3Toomas Soome				(*(const char **)valuep) = (const char *)p;
229199767fToomas Soome				return (0);
2306fd7fa3Toomas Soome			} else if (type == DATA_TYPE_NVLIST ||
2316fd7fa3Toomas Soome			    type == DATA_TYPE_NVLIST_ARRAY) {
2326fd7fa3Toomas Soome				(*(const unsigned char **)valuep) =
2336fd7fa3Toomas Soome				    (const unsigned char *)p;
234199767fToomas Soome				return (0);
235199767fToomas Soome			} else {
236199767fToomas Soome				return (EIO);
237199767fToomas Soome			}
238199767fToomas Soome		} else {
239199767fToomas Soome			/*
2406fd7fa3Toomas Soome			 * Not the pair we are looking for, skip to the
2416fd7fa3Toomas Soome			 * next one.
242199767fToomas Soome			 */
243199767fToomas Soome			p = pair + encoded_size;
244199767fToomas Soome		}
245199767fToomas Soome
246199767fToomas Soome		pair = p;
247199767fToomas Soome		xdr_int(&p, &encoded_size);
248199767fToomas Soome		xdr_int(&p, &decoded_size);
249199767fToomas Soome	}
250199767fToomas Soome
251199767fToomas Soome	return (EIO);
252199767fToomas Soome}
253199767fToomas Soome
254199767fToomas Soomestatic int
255199767fToomas Soomenvlist_check_features_for_read(const unsigned char *nvlist)
256199767fToomas Soome{
257199767fToomas Soome	const unsigned char *p, *pair;
258199767fToomas Soome	int junk;
259199767fToomas Soome	int encoded_size, decoded_size;
260199767fToomas Soome	int rc;
261199767fToomas Soome
262199767fToomas Soome	rc = 0;
263199767fToomas Soome
264199767fToomas Soome	p = nvlist;
265199767fToomas Soome	xdr_int(&p, &junk);
266199767fToomas Soome	xdr_int(&p, &junk);
267199767fToomas Soome
268199767fToomas Soome	pair = p;
269199767fToomas Soome	xdr_int(&p, &encoded_size);
270199767fToomas Soome	xdr_int(&p, &decoded_size);
271199767fToomas Soome	while (encoded_size && decoded_size) {
272199767fToomas Soome		int namelen, pairtype;
273199767fToomas Soome		const char *pairname;
274199767fToomas Soome		int i, found;
275199767fToomas Soome
276199767fToomas Soome		found = 0;
277199767fToomas Soome
278199767fToomas Soome		xdr_int(&p, &namelen);
2796fd7fa3Toomas Soome		pairname = (const char *)p;
280199767fToomas Soome		p += roundup(namelen, 4);
281199767fToomas Soome		xdr_int(&p, &pairtype);
282199767fToomas Soome
283199767fToomas Soome		for (i = 0; features_for_read[i] != NULL; i++) {
2846fd7fa3Toomas Soome			if (memcmp(pairname, features_for_read[i],
2856fd7fa3Toomas Soome			    namelen) == 0) {
286199767fToomas Soome				found = 1;
287199767fToomas Soome				break;
288199767fToomas Soome			}
289199767fToomas Soome		}
290199767fToomas Soome
291199767fToomas Soome		if (!found) {
292199767fToomas Soome			printf("ZFS: unsupported feature: %s\n", pairname);
293199767fToomas Soome			rc = EIO;
294199767fToomas Soome		}
295199767fToomas Soome
296199767fToomas Soome		p = pair + encoded_size;
297199767fToomas Soome
298199767fToomas Soome		pair = p;
299199767fToomas Soome		xdr_int(&p, &encoded_size);
300199767fToomas Soome		xdr_int(&p, &decoded_size);
301199767fToomas Soome	}
302199767fToomas Soome
303199767fToomas Soome	return (rc);
304199767fToomas Soome}
305199767fToomas Soome
306199767fToomas Soome/*
307199767fToomas Soome * Return the next nvlist in an nvlist array.
308199767fToomas Soome */
/*
 * Walk over one encoded nvlist and return the address just past it.
 *
 * The two leading 32-bit words are skipped, then each pair is stepped
 * over using its encoded size.  The walk ends at the first pair whose
 * encoded or decoded size is zero; the returned pointer is positioned
 * after those two terminating size words.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor = nvlist;
	int ignored;
	int enc_len, dec_len;

	xdr_int(&cursor, &ignored);
	xdr_int(&cursor, &ignored);

	for (;;) {
		const unsigned char *pair_start = cursor;

		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;

		cursor = pair_start + enc_len;
	}

	return (cursor);
}
333199767fToomas Soome
334199767fToomas Soome#ifdef TEST
335199767fToomas Soome
336199767fToomas Soomestatic const unsigned char *
337199767fToomas Soomenvlist_print(const unsigned char *nvlist, unsigned int indent)
338199767fToomas Soome{
3396fd7fa3Toomas Soome	static const char *typenames[] = {
340199767fToomas Soome		"DATA_TYPE_UNKNOWN",
341199767fToomas Soome		"DATA_TYPE_BOOLEAN",
342199767fToomas Soome		"DATA_TYPE_BYTE",
343199767fToomas Soome		"DATA_TYPE_INT16",
344199767fToomas Soome		"DATA_TYPE_UINT16",
345199767fToomas Soome		"DATA_TYPE_INT32",
346199767fToomas Soome		"DATA_TYPE_UINT32",
347199767fToomas Soome		"DATA_TYPE_INT64",
348199767fToomas Soome		"DATA_TYPE_UINT64",
349199767fToomas Soome		"DATA_TYPE_STRING",
350199767fToomas Soome		"DATA_TYPE_BYTE_ARRAY",
351199767fToomas Soome		"DATA_TYPE_INT16_ARRAY",
352199767fToomas Soome		"DATA_TYPE_UINT16_ARRAY",
353199767fToomas Soome		"DATA_TYPE_INT32_ARRAY",
354199767fToomas Soome		"DATA_TYPE_UINT32_ARRAY",
355199767fToomas Soome		"DATA_TYPE_INT64_ARRAY",
356199767fToomas Soome		"DATA_TYPE_UINT64_ARRAY",
357199767fToomas Soome		"DATA_TYPE_STRING_ARRAY",
358199767fToomas Soome		"DATA_TYPE_HRTIME",
359199767fToomas Soome		"DATA_TYPE_NVLIST",
360199767fToomas Soome		"DATA_TYPE_NVLIST_ARRAY",
361199767fToomas Soome		"DATA_TYPE_BOOLEAN_VALUE",
362199767fToomas Soome		"DATA_TYPE_INT8",
363199767fToomas Soome		"DATA_TYPE_UINT8",
364199767fToomas Soome		"DATA_TYPE_BOOLEAN_ARRAY",
365199767fToomas Soome		"DATA_TYPE_INT8_ARRAY",
366199767fToomas Soome		"DATA_TYPE_UINT8_ARRAY"
367199767fToomas Soome	};
368199767fToomas Soome
369199767fToomas Soome	unsigned int i, j;
370199767fToomas Soome	const unsigned char *p, *pair;
371199767fToomas Soome	int junk;
372199767fToomas Soome	int encoded_size, decoded_size;
373199767fToomas Soome
374199767fToomas Soome	p = nvlist;
375199767fToomas Soome	xdr_int(&p, &junk);
376199767fToomas Soome	xdr_int(&p, &junk);
377199767fToomas Soome
378199767fToomas Soome	pair = p;
379199767fToomas Soome	xdr_int(&p, &encoded_size);
380199767fToomas Soome	xdr_int(&p, &decoded_size);
381199767fToomas Soome	while (encoded_size && decoded_size) {
382199767fToomas Soome		int namelen, pairtype, elements;
383199767fToomas Soome		const char *pairname;
384199767fToomas Soome
385199767fToomas Soome		xdr_int(&p, &namelen);
3866fd7fa3Toomas Soome		pairname = (const char *)p;
387199767fToomas Soome		p += roundup(namelen, 4);
388199767fToomas Soome		xdr_int(&p, &pairtype);
389199767fToomas Soome
390199767fToomas Soome		for (i = 0; i < indent; i++)
391199767fToomas Soome			printf(" ");
392199767fToomas Soome		printf("%s %s", typenames[pairtype], pairname);
393199767fToomas Soome
394199767fToomas Soome		xdr_int(&p, &elements);
395199767fToomas Soome		switch (pairtype) {
396199767fToomas Soome		case DATA_TYPE_UINT64: {
397199767fToomas Soome			uint64_t val;
398199767fToomas Soome			xdr_uint64_t(&p, &val);
399199767fToomas Soome			printf(" = 0x%jx\n", (uintmax_t)val);
400199767fToomas Soome			break;
401199767fToomas Soome		}
402199767fToomas Soome
403199767fToomas Soome		case DATA_TYPE_STRING: {
404199767fToomas Soome			int len;
405199767fToomas Soome			xdr_int(&p, &len);
406199767fToomas Soome			printf(" = \"%s\"\n", p);
407199767fToomas Soome			break;
408199767fToomas Soome		}
409199767fToomas Soome
410199767fToomas Soome		case DATA_TYPE_NVLIST:
411199767fToomas Soome			printf("\n");
412199767fToomas Soome			nvlist_print(p, indent + 1);
413199767fToomas Soome			break;
414199767fToomas Soome
415199767fToomas Soome		case DATA_TYPE_NVLIST_ARRAY:
416199767fToomas Soome			for (j = 0; j < elements; j++) {
417199767fToomas Soome				printf("[%d]\n", j);
418199767fToomas Soome				p = nvlist_print(p, indent + 1);
419199767fToomas Soome				if (j != elements - 1) {
420199767fToomas Soome					for (i = 0; i < indent; i++)
421199767fToomas Soome						printf(" ");
4226fd7fa3Toomas Soome					printf("%s %s", typenames[pairtype],
4236fd7fa3Toomas Soome					    pairname);
424199767fToomas Soome				}
425199767fToomas Soome			}
426199767fToomas Soome			break;
427199767fToomas Soome
428199767fToomas Soome		default:
429199767fToomas Soome			printf("\n");
430199767fToomas Soome		}
431199767fToomas Soome
432199767fToomas Soome		p = pair + encoded_size;
433199767fToomas Soome
434199767fToomas Soome		pair = p;
435199767fToomas Soome		xdr_int(&p, &encoded_size);
436199767fToomas Soome		xdr_int(&p, &decoded_size);
437199767fToomas Soome	}
438199767fToomas Soome
4396fd7fa3Toomas Soome	return (p);
440199767fToomas Soome}
441199767fToomas Soome
442199767fToomas Soome#endif
443199767fToomas Soome
444199767fToomas Soomestatic int
445199767fToomas Soomevdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
446199767fToomas Soome    off_t offset, size_t size)
447199767fToomas Soome{
448199767fToomas Soome	size_t psize;
449199767fToomas Soome	int rc;
450199767fToomas Soome
451199767fToomas Soome	if (!vdev->v_phys_read)
452199767fToomas Soome		return (EIO);
453199767fToomas Soome
454199767fToomas Soome	if (bp) {
455199767fToomas Soome		psize = BP_GET_PSIZE(bp);
456199767fToomas Soome	} else {
457199767fToomas Soome		psize = size;
458199767fToomas Soome	}
459199767fToomas Soome
460199767fToomas Soome	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
461da9bf00Toomas Soome	if (rc == 0) {
462da9bf00Toomas Soome		if (bp != NULL)
463da9bf00Toomas Soome			rc = zio_checksum_verify(vdev->v_spa, bp, buf);
464da9bf00Toomas Soome	}
465199767fToomas Soome
466da9bf00Toomas Soome	return (rc);
467199767fToomas Soome}
468199767fToomas Soome
469c023f65Toomas Soometypedef struct remap_segment {
470c023f65Toomas Soome	vdev_t *rs_vd;
471c023f65Toomas Soome	uint64_t rs_offset;
472c023f65Toomas Soome	uint64_t rs_asize;
473c023f65Toomas Soome	uint64_t rs_split_offset;
474c023f65Toomas Soome	list_node_t rs_node;
475c023f65Toomas Soome} remap_segment_t;
476c023f65Toomas Soome
477c023f65Toomas Soomestatic remap_segment_t *
478c023f65Toomas Soomers_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
479c023f65Toomas Soome{
480c023f65Toomas Soome	remap_segment_t *rs = malloc(sizeof (remap_segment_t));
481c023f65Toomas Soome
482c023f65Toomas Soome	if (rs != NULL) {
483c023f65Toomas Soome		rs->rs_vd = vd;
484c023f65Toomas Soome		rs->rs_offset = offset;
485c023f65Toomas Soome		rs->rs_asize = asize;
486c023f65Toomas Soome		rs->rs_split_offset = split_offset;
487c023f65Toomas Soome	}
488c023f65Toomas Soome
489c023f65Toomas Soome	return (rs);
490c023f65Toomas Soome}
491c023f65Toomas Soome
492c023f65Toomas Soomevdev_indirect_mapping_t *
493c023f65Toomas Soomevdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
494c023f65Toomas Soome    uint64_t mapping_object)
495c023f65Toomas Soome{
496c023f65Toomas Soome	vdev_indirect_mapping_t *vim;
497c023f65Toomas Soome	vdev_indirect_mapping_phys_t *vim_phys;
498c023f65Toomas Soome	int rc;
499c023f65Toomas Soome
500c023f65Toomas Soome	vim = calloc(1, sizeof (*vim));
501c023f65Toomas Soome	if (vim == NULL)
502c023f65Toomas Soome		return (NULL);
503c023f65Toomas Soome
504c023f65Toomas Soome	vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
505c023f65Toomas Soome	if (vim->vim_dn == NULL) {
506c023f65Toomas Soome		free(vim);
507c023f65Toomas Soome		return (NULL);
508c023f65Toomas Soome	}
509c023f65Toomas Soome
510c023f65Toomas Soome	rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
511c023f65Toomas Soome	if (rc != 0) {
512c023f65Toomas Soome		free(vim->vim_dn);
513c023f65Toomas Soome		free(vim);
514c023f65Toomas Soome		return (NULL);
515c023f65Toomas Soome	}
516c023f65Toomas Soome
517c023f65Toomas Soome	vim->vim_spa = spa;
518c023f65Toomas Soome	vim->vim_phys = malloc(sizeof (*vim->vim_phys));
519c023f65Toomas Soome	if (vim->vim_phys == NULL) {
520c023f65Toomas Soome		free(vim->vim_dn);
521c023f65Toomas Soome		free(vim);
522c023f65Toomas Soome		return (NULL);
523c023f65Toomas Soome	}
524c023f65Toomas Soome
525c023f65Toomas Soome	vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
526c023f65Toomas Soome	*vim->vim_phys = *vim_phys;
527c023f65Toomas Soome
528c023f65Toomas Soome	vim->vim_objset = os;
529c023f65Toomas Soome	vim->vim_object = mapping_object;
530c023f65Toomas Soome	vim->vim_entries = NULL;
531c023f65Toomas Soome
532c023f65Toomas Soome	vim->vim_havecounts =
533c023f65Toomas Soome	    (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);
534c023f65Toomas Soome
535c023f65Toomas Soome	return (vim);
536c023f65Toomas Soome}
537c023f65Toomas Soome
538c023f65Toomas Soome/*
539c023f65Toomas Soome * Compare an offset with an indirect mapping entry; there are three
540c023f65Toomas Soome * possible scenarios:
541c023f65Toomas Soome *
542c023f65Toomas Soome *     1. The offset is "less than" the mapping entry; meaning the
543c023f65Toomas Soome *        offset is less than the source offset of the mapping entry. In
544c023f65Toomas Soome *        this case, there is no overlap between the offset and the
545c023f65Toomas Soome *        mapping entry and -1 will be returned.
546c023f65Toomas Soome *
547c023f65Toomas Soome *     2. The offset is "greater than" the mapping entry; meaning the
548c023f65Toomas Soome *        offset is greater than the mapping entry's source offset plus
549c023f65Toomas Soome *        the entry's size. In this case, there is no overlap between
550c023f65Toomas Soome *        the offset and the mapping entry and 1 will be returned.
551c023f65Toomas Soome *
552c023f65Toomas Soome *        NOTE: If the offset is actually equal to the entry's offset
553c023f65Toomas Soome *        plus size, this is considered to be "greater" than the entry,
554c023f65Toomas Soome *        and this case applies (i.e. 1 will be returned). Thus, the
555c023f65Toomas Soome *        entry's "range" can be considered to be inclusive at its
556c023f65Toomas Soome *        start, but exclusive at its end: e.g. [src, src + size).
557c023f65Toomas Soome *
558c023f65Toomas Soome *     3. The last case to consider is if the offset actually falls
559c023f65Toomas Soome *        within the mapping entry's range. If this is the case, the
560c023f65Toomas Soome *        offset is considered to be "equal to" the mapping entry and
561c023f65Toomas Soome *        0 will be returned.
562c023f65Toomas Soome *
563c023f65Toomas Soome *        NOTE: If the offset is equal to the entry's source offset,
564c023f65Toomas Soome *        this case applies and 0 will be returned. If the offset is
565c023f65Toomas Soome *        equal to the entry's source plus its size, this case does
566c023f65Toomas Soome *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
567c023f65Toomas Soome *        returned.
568c023f65Toomas Soome */
569c023f65Toomas Soomestatic int
570c023f65Toomas Soomedva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
571c023f65Toomas Soome{
572c023f65Toomas Soome	const uint64_t *key = v_key;
573c023f65Toomas Soome	const vdev_indirect_mapping_entry_phys_t *array_elem =
574c023f65Toomas Soome	    v_array_elem;
575c023f65Toomas Soome	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
576c023f65Toomas Soome
577c023f65Toomas Soome	if (*key < src_offset) {
578c023f65Toomas Soome		return (-1);
579c023f65Toomas Soome	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
580c023f65Toomas Soome		return (0);
581c023f65Toomas Soome	} else {
582c023f65Toomas Soome		return (1);
583c023f65Toomas Soome	}
584c023f65Toomas Soome}
585c023f65Toomas Soome
586c023f65Toomas Soome/*
587c023f65Toomas Soome * Return array entry.
588c023f65Toomas Soome */
589c023f65Toomas Soomestatic vdev_indirect_mapping_entry_phys_t *
590c023f65Toomas Soomevdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
591c023f65Toomas Soome{
592c023f65Toomas Soome	uint64_t size;
593c023f65Toomas Soome	off_t offset = 0;
594c023f65Toomas Soome	int rc;
595c023f65Toomas Soome
596c023f65Toomas Soome	if (vim->vim_phys->vimp_num_entries == 0)
597c023f65Toomas Soome		return (NULL);
598c023f65Toomas Soome
599c023f65Toomas Soome	if (vim->vim_entries == NULL) {
600c023f65Toomas Soome		uint64_t bsize;
601c023f65Toomas Soome
602c023f65Toomas Soome		bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
603c023f65Toomas Soome		size = vim->vim_phys->vimp_num_entries *
604c023f65Toomas Soome		    sizeof (*vim->vim_entries);
605c023f65Toomas Soome		if (size > bsize) {
606c023f65Toomas Soome			size = bsize / sizeof (*vim->vim_entries);
607c023f65Toomas Soome			size *= sizeof (*vim->vim_entries);
608c023f65Toomas Soome		}
609c023f65Toomas Soome		vim->vim_entries = malloc(size);
610c023f65Toomas Soome		if (vim->vim_entries == NULL)
611c023f65Toomas Soome			return (NULL);
612c023f65Toomas Soome		vim->vim_num_entries = size / sizeof (*vim->vim_entries);
613c023f65Toomas Soome		offset = index * sizeof (*vim->vim_entries);
614c023f65Toomas Soome	}
615c023f65Toomas Soome
616c023f65Toomas Soome	/* We have data in vim_entries */
617c023f65Toomas Soome	if (offset == 0) {
618c023f65Toomas Soome		if (index >= vim->vim_entry_offset &&
619c023f65Toomas Soome		    index <= vim->vim_entry_offset + vim->vim_num_entries) {
620c023f65Toomas Soome			index -= vim->vim_entry_offset;
621c023f65Toomas Soome			return (&vim->vim_entries[index]);
622c023f65Toomas Soome		}
623c023f65Toomas Soome		offset = index * sizeof (*vim->vim_entries);
624c023f65Toomas Soome	}
625c023f65Toomas Soome
626c023f65Toomas Soome	vim->vim_entry_offset = index;
627c023f65Toomas Soome	size = vim->vim_num_entries * sizeof (*vim->vim_entries);
628c023f65Toomas Soome	rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
629c023f65Toomas Soome	    size);
630c023f65Toomas Soome	if (rc != 0) {
631c023f65Toomas Soome		/* Read error, invalidate vim_entries. */
632c023f65Toomas Soome		free(vim->vim_entries);
633c023f65Toomas Soome		vim->vim_entries = NULL;
634c023f65Toomas Soome		return (NULL);
635c023f65Toomas Soome	}
636c023f65Toomas Soome	index -= vim->vim_entry_offset;
637c023f65Toomas Soome	return (&vim->vim_entries[index]);
638c023f65Toomas Soome}
639c023f65Toomas Soome
640c023f65Toomas Soome/*
641c023f65Toomas Soome * Returns the mapping entry for the given offset.
642c023f65Toomas Soome *
643c023f65Toomas Soome * It's possible that the given offset will not be in the mapping table
644c023f65Toomas Soome * (i.e. no mapping entries contain this offset), in which case, the
645c023f65Toomas Soome * return value value depends on the "next_if_missing" parameter.
646c023f65Toomas Soome *
647c023f65Toomas Soome * If the offset is not found in the table and "next_if_missing" is
648c023f65Toomas Soome * B_FALSE, then NULL will always be returned. The behavior is intended
649c023f65Toomas Soome * to allow consumers to get the entry corresponding to the offset
650c023f65Toomas Soome * parameter, iff the offset overlaps with an entry in the table.
651c023f65Toomas Soome *
652c023f65Toomas Soome * If the offset is not found in the table and "next_if_missing" is
653c023f65Toomas Soome * B_TRUE, then the entry nearest to the given offset will be returned,
654c023f65Toomas Soome * such that the entry's source offset is greater than the offset
655c023f65Toomas Soome * passed in (i.e. the "next" mapping entry in the table is returned, if
656c023f65Toomas Soome * the offset is missing from the table). If there are no entries whose
657c023f65Toomas Soome * source offset is greater than the passed in offset, NULL is returned.
658c023f65Toomas Soome */
659c023f65Toomas Soomestatic vdev_indirect_mapping_entry_phys_t *
660c023f65Toomas Soomevdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
661c023f65Toomas Soome    uint64_t offset)
662c023f65Toomas Soome{
663c023f65Toomas Soome	ASSERT(vim->vim_phys->vimp_num_entries > 0);
664c023f65Toomas Soome
665c023f65Toomas Soome	vdev_indirect_mapping_entry_phys_t *entry;
666c023f65Toomas Soome
667c023f65Toomas Soome	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
668c023f65Toomas Soome	uint64_t base = 0;
669c023f65Toomas Soome
670c023f65Toomas Soome	/*
671c023f65Toomas Soome	 * We don't define these inside of the while loop because we use
672c023f65Toomas Soome	 * their value in the case that offset isn't in the mapping.
673c023f65Toomas Soome	 */
674c023f65Toomas Soome	uint64_t mid;
675c023f65Toomas Soome	int result;
676c023f65Toomas Soome
677c023f65Toomas Soome	while (last >= base) {
678c023f65Toomas Soome		mid = base + ((last - base) >> 1);
679c023f65Toomas Soome
680c023f65Toomas Soome		entry = vdev_indirect_mapping_entry(vim, mid);
681c023f65Toomas Soome		if (entry == NULL)
682c023f65Toomas Soome			break;
683c023f65Toomas Soome		result = dva_mapping_overlap_compare(&offset, entry);
684c023f65Toomas Soome
685c023f65Toomas Soome		if (result == 0) {
686c023f65Toomas Soome			break;
687c023f65Toomas Soome		} else if (result < 0) {
688c023f65Toomas Soome			last = mid - 1;
689c023f65Toomas Soome		} else {
690c023f65Toomas Soome			base = mid + 1;
691c023f65Toomas Soome		}
692c023f65Toomas Soome	}
693c023f65Toomas Soome	return (entry);
694c023f65Toomas Soome}
695c023f65Toomas Soome
696c023f65Toomas Soome/*
697c023f65Toomas Soome * Given an indirect vdev and an extent on that vdev, it duplicates the
698c023f65Toomas Soome * physical entries of the indirect mapping that correspond to the extent
699c023f65Toomas Soome * to a new array and returns a pointer to it. In addition, copied_entries
700c023f65Toomas Soome * is populated with the number of mapping entries that were duplicated.
701c023f65Toomas Soome *
702c023f65Toomas Soome * Finally, since we are doing an allocation, it is up to the caller to
703c023f65Toomas Soome * free the array allocated in this function.
704c023f65Toomas Soome */
705c023f65Toomas Soomevdev_indirect_mapping_entry_phys_t *
706c023f65Toomas Soomevdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
707c023f65Toomas Soome    uint64_t asize, uint64_t *copied_entries)
708c023f65Toomas Soome{
709c023f65Toomas Soome	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
710c023f65Toomas Soome	vdev_indirect_mapping_t *vim = vd->v_mapping;
711c023f65Toomas Soome	uint64_t entries = 0;
712c023f65Toomas Soome
713c023f65Toomas Soome	vdev_indirect_mapping_entry_phys_t *first_mapping =
714c023f65Toomas Soome	    vdev_indirect_mapping_entry_for_offset(vim, offset);
715c023f65Toomas Soome	ASSERT3P(first_mapping, !=, NULL);
716c023f65Toomas Soome
717c023f65Toomas Soome	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
718c023f65Toomas Soome	while (asize > 0) {
719c023f65Toomas Soome		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
720c023f65Toomas Soome		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
721c023f65Toomas Soome		uint64_t inner_size = MIN(asize, size - inner_offset);
722c023f65Toomas Soome
723c023f65Toomas Soome		offset += inner_size;
724c023f65Toomas Soome		asize -= inner_size;
725c023f65Toomas Soome		entries++;
726c023f65Toomas Soome		m++;
727c023f65Toomas Soome	}
728c023f65Toomas Soome
729c023f65Toomas Soome	size_t copy_length = entries * sizeof (*first_mapping);
730c023f65Toomas Soome	duplicate_mappings = malloc(copy_length);
731c023f65Toomas Soome	if (duplicate_mappings != NULL)
732c023f65Toomas Soome		bcopy(first_mapping, duplicate_mappings, copy_length);
733c023f65Toomas Soome	else
734c023f65Toomas Soome		entries = 0;
735c023f65Toomas Soome
736c023f65Toomas Soome	*copied_entries = entries;
737c023f65Toomas Soome
738c023f65Toomas Soome	return (duplicate_mappings);
739c023f65Toomas Soome}
740c023f65Toomas Soome
741c023f65Toomas Soomestatic vdev_t *
742c023f65Toomas Soomevdev_lookup_top(spa_t *spa, uint64_t vdev)
743c023f65Toomas Soome{
744c023f65Toomas Soome	vdev_t *rvd;
745da9bf00Toomas Soome	vdev_list_t *vlist;
746c023f65Toomas Soome
747da9bf00Toomas Soome	vlist = &spa->spa_root_vdev->v_children;
748da9bf00Toomas Soome	STAILQ_FOREACH(rvd, vlist, v_childlink)
749c023f65Toomas Soome		if (rvd->v_id == vdev)
750c023f65Toomas Soome			break;
751c023f65Toomas Soome
752c023f65Toomas Soome	return (rvd);
753c023f65Toomas Soome}
754c023f65Toomas Soome
755c023f65Toomas Soome/*
756c023f65Toomas Soome * This is a callback for vdev_indirect_remap() which allocates an
757c023f65Toomas Soome * indirect_split_t for each split segment and adds it to iv_splits.
758c023f65Toomas Soome */
759c023f65Toomas Soomestatic void
760c023f65Toomas Soomevdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
761c023f65Toomas Soome    uint64_t size, void *arg)
762c023f65Toomas Soome{
763c023f65Toomas Soome	int n = 1;
764c023f65Toomas Soome	zio_t *zio = arg;
765c023f65Toomas Soome	indirect_vsd_t *iv = zio->io_vsd;
766c023f65Toomas Soome
767c023f65Toomas Soome	if (vd->v_read == vdev_indirect_read)
768c023f65Toomas Soome		return;
769c023f65Toomas Soome
770c023f65Toomas Soome	if (vd->v_read == vdev_mirror_read)
771c023f65Toomas Soome		n = vd->v_nchildren;
772c023f65Toomas Soome
773c023f65Toomas Soome	indirect_split_t *is =
774c023f65Toomas Soome	    malloc(offsetof(indirect_split_t, is_child[n]));
775c023f65Toomas Soome	if (is == NULL) {
776c023f65Toomas Soome		zio->io_error = ENOMEM;
777c023f65Toomas Soome		return;
778c023f65Toomas Soome	}
779c023f65Toomas Soome	bzero(is, offsetof(indirect_split_t, is_child[n]));
780c023f65Toomas Soome
781c023f65Toomas Soome	is->is_children = n;
782c023f65Toomas Soome	is->is_size = size;
783c023f65Toomas Soome	is->is_split_offset = split_offset;
784c023f65Toomas Soome	is->is_target_offset = offset;
785c023f65Toomas Soome	is->is_vdev = vd;
786c023f65Toomas Soome
787c023f65Toomas Soome	/*
788c023f65Toomas Soome	 * Note that we only consider multiple copies of the data for
789c023f65Toomas Soome	 * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
790c023f65Toomas Soome	 * though they use the same ops as mirror, because there's only one
791c023f65Toomas Soome	 * "good" copy under the replacing/spare.
792c023f65Toomas Soome	 */
793c023f65Toomas Soome	if (vd->v_read == vdev_mirror_read) {
794c023f65Toomas Soome		int i = 0;
795c023f65Toomas Soome		vdev_t *kid;
796c023f65Toomas Soome
797c023f65Toomas Soome		STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
798c023f65Toomas Soome			is->is_child[i++].ic_vdev = kid;
799c023f65Toomas Soome		}
800c023f65Toomas Soome	} else {
801c023f65Toomas Soome		is->is_child[0].ic_vdev = vd;
802c023f65Toomas Soome	}
803c023f65Toomas Soome
804c023f65Toomas Soome	list_insert_tail(&iv->iv_splits, is);
805c023f65Toomas Soome}
806c023f65Toomas Soome
807c023f65Toomas Soomestatic void
808c023f65Toomas Soomevdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
809c023f65Toomas Soome{
810c023f65Toomas Soome	list_t stack;
811da9bf00Toomas Soome	spa_t *spa = vd->v_spa;
812c023f65Toomas Soome	zio_t *zio = arg;
813042b560Toomas Soome	remap_segment_t *rs;
814c023f65Toomas Soome
815c023f65Toomas Soome	list_create(&stack, sizeof (remap_segment_t),
816c023f65Toomas Soome	    offsetof(remap_segment_t, rs_node));
817c023f65Toomas Soome
818042b560Toomas Soome	rs = rs_alloc(vd, offset, asize, 0);
819042b560Toomas Soome	if (rs == NULL) {
820042b560Toomas Soome		printf("vdev_indirect_remap: out of memory.\n");
821042b560Toomas Soome		zio->io_error = ENOMEM;
822042b560Toomas Soome	}
8236fd7fa3Toomas Soome	for (; rs != NULL; rs = list_remove_head(&stack)) {
824c023f65Toomas Soome		vdev_t *v = rs->rs_vd;
825c023f65Toomas Soome		uint64_t num_entries = 0;
826c023f65Toomas Soome		/* vdev_indirect_mapping_t *vim = v->v_mapping; */
827c023f65Toomas Soome		vdev_indirect_mapping_entry_phys_t *mapping =
828c023f65Toomas Soome		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
829c023f65Toomas Soome		    rs->rs_offset, rs->rs_asize, &num_entries);
830c023f65Toomas Soome
831042b560Toomas Soome		if (num_entries == 0)
832042b560Toomas Soome			zio->io_error = ENOMEM;
833042b560Toomas Soome
834c023f65Toomas Soome		for (uint64_t i = 0; i < num_entries; i++) {
835c023f65Toomas Soome			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
836c023f65Toomas Soome			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
837c023f65Toomas Soome			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
838c023f65Toomas Soome			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
839c023f65Toomas Soome			uint64_t inner_offset = rs->rs_offset -
840c023f65Toomas Soome			    DVA_MAPPING_GET_SRC_OFFSET(m);
841c023f65Toomas Soome			uint64_t inner_size =
842c023f65Toomas Soome			    MIN(rs->rs_asize, size - inner_offset);
843c023f65Toomas Soome			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
844c023f65Toomas Soome
845c023f65Toomas Soome			if (dst_v->v_read == vdev_indirect_read) {
846042b560Toomas Soome				remap_segment_t *o;
847042b560Toomas Soome
848042b560Toomas Soome				o = rs_alloc(dst_v, dst_offset + inner_offset,
849042b560Toomas Soome				    inner_size, rs->rs_split_offset);
850042b560Toomas Soome				if (o == NULL) {
851042b560Toomas Soome					printf("vdev_indirect_remap: "
852042b560Toomas Soome					    "out of memory.\n");
853042b560Toomas Soome					zio->io_error = ENOMEM;
854042b560Toomas Soome					break;
855042b560Toomas Soome				}
856042b560Toomas Soome
857042b560Toomas Soome				list_insert_head(&stack, o);
858c023f65Toomas Soome			}
859c023f65Toomas Soome			vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
860c023f65Toomas Soome			    dst_offset + inner_offset,
861c023f65Toomas Soome			    inner_size, arg);
862c023f65Toomas Soome
863c023f65Toomas Soome			/*
864c023f65Toomas Soome			 * vdev_indirect_gather_splits can have memory
865c023f65Toomas Soome			 * allocation error, we can not recover from it.
866c023f65Toomas Soome			 */
867c023f65Toomas Soome			if (zio->io_error != 0)
868c023f65Toomas Soome				break;
869c023f65Toomas Soome			rs->rs_offset += inner_size;
870c023f65Toomas Soome			rs->rs_asize -= inner_size;
871c023f65Toomas Soome			rs->rs_split_offset += inner_size;
872c023f65Toomas Soome		}
873c023f65Toomas Soome
874c023f65Toomas Soome		free(mapping);
875c023f65Toomas Soome		free(rs);
876c023f65Toomas Soome		if (zio->io_error != 0)
877c023f65Toomas Soome			break;
878c023f65Toomas Soome	}
879c023f65Toomas Soome
880c023f65Toomas Soome	list_destroy(&stack);
881c023f65Toomas Soome}
882c023f65Toomas Soome
883c023f65Toomas Soomestatic void
884c023f65Toomas Soomevdev_indirect_map_free(zio_t *zio)
885c023f65Toomas Soome{
886c023f65Toomas Soome	indirect_vsd_t *iv = zio->io_vsd;
887c023f65Toomas Soome	indirect_split_t *is;
888c023f65Toomas Soome
889c023f65Toomas Soome	while ((is = list_head(&iv->iv_splits)) != NULL) {
890c023f65Toomas Soome		for (int c = 0; c < is->is_children; c++) {
891c023f65Toomas Soome			indirect_child_t *ic = &is->is_child[c];
892c023f65Toomas Soome			free(ic->ic_data);
893c023f65Toomas Soome		}
894c023f65Toomas Soome		list_remove(&iv->iv_splits, is);
895c023f65Toomas Soome		free(is);
896c023f65Toomas Soome	}
897c023f65Toomas Soome	free(iv);
898c023f65Toomas Soome}
899c023f65Toomas Soome
900c023f65Toomas Soomestatic int
901c023f65Toomas Soomevdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
902c023f65Toomas Soome    off_t offset, size_t bytes)
903c023f65Toomas Soome{
904da9bf00Toomas Soome	zio_t zio;
905da9bf00Toomas Soome	spa_t *spa = vdev->v_spa;
906da9bf00Toomas Soome	indirect_vsd_t *iv;
907c023f65Toomas Soome	indirect_split_t *first;
908c023f65Toomas Soome	int rc = EIO;
909c023f65Toomas Soome
910da9bf00Toomas Soome	iv = calloc(1, sizeof (*iv));
911c023f65Toomas Soome	if (iv == NULL)
912c023f65Toomas Soome		return (ENOMEM);
913c023f65Toomas Soome
914c023f65Toomas Soome	list_create(&iv->iv_splits,
915c023f65Toomas Soome	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
916c023f65Toomas Soome
917da9bf00Toomas Soome	bzero(&zio, sizeof (zio));
918c023f65Toomas Soome	zio.io_spa = spa;
919c023f65Toomas Soome	zio.io_bp = (blkptr_t *)bp;
920c023f65Toomas Soome	zio.io_data = buf;
921c023f65Toomas Soome	zio.io_size = bytes;
922c023f65Toomas Soome	zio.io_offset = offset;
923c023f65Toomas Soome	zio.io_vd = vdev;
924c023f65Toomas Soome	zio.io_vsd = iv;
925c023f65Toomas Soome
926c023f65Toomas Soome	if (vdev->v_mapping == NULL) {
927c023f65Toomas Soome		vdev_indirect_config_t *vic;
928c023f65Toomas Soome
929c023f65Toomas Soome		vic = &vdev->vdev_indirect_config;
930c023f65Toomas Soome		vdev->v_mapping = vdev_indirect_mapping_open(spa,
931c023f65Toomas Soome		    &spa->spa_mos, vic->vic_mapping_object);
932c023f65Toomas Soome	}
933c023f65Toomas Soome
934c023f65Toomas Soome	vdev_indirect_remap(vdev, offset, bytes, &zio);
935c023f65Toomas Soome	if (zio.io_error != 0)
936c023f65Toomas Soome		return (zio.io_error);
937c023f65Toomas Soome
938c023f65Toomas Soome	first = list_head(&iv->iv_splits);
939c023f65Toomas Soome	if (first->is_size == zio.io_size) {
940c023f65Toomas Soome		/*
941c023f65Toomas Soome		 * This is not a split block; we are pointing to the entire
942c023f65Toomas Soome		 * data, which will checksum the same as the original data.
943c023f65Toomas Soome		 * Pass the BP down so that the child i/o can verify the
944c023f65Toomas Soome		 * checksum, and try a different location if available
945c023f65Toomas Soome		 * (e.g. on a mirror).
946c023f65Toomas Soome		 *
947c023f65Toomas Soome		 * While this special case could be handled the same as the
948c023f65Toomas Soome		 * general (split block) case, doing it this way ensures
949c023f65Toomas Soome		 * that the vast majority of blocks on indirect vdevs
950c023f65Toomas Soome		 * (which are not split) are handled identically to blocks
951c023f65Toomas Soome		 * on non-indirect vdevs.  This allows us to be less strict
952c023f65Toomas Soome		 * about performance in the general (but rare) case.
953c023f65Toomas Soome		 */
954c023f65Toomas Soome		rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
955c023f65Toomas Soome		    zio.io_data, first->is_target_offset, bytes);
956c023f65Toomas Soome	} else {
957c023f65Toomas Soome		iv->iv_split_block = B_TRUE;
958c023f65Toomas Soome		/*
959c023f65Toomas Soome		 * Read one copy of each split segment, from the
960c023f65Toomas Soome		 * top-level vdev.  Since we don't know the
961c023f65Toomas Soome		 * checksum of each split individually, the child
962c023f65Toomas Soome		 * zio can't ensure that we get the right data.
963c023f65Toomas Soome		 * E.g. if it's a mirror, it will just read from a
964c023f65Toomas Soome		 * random (healthy) leaf vdev.  We have to verify
965c023f65Toomas Soome		 * the checksum in vdev_indirect_io_done().
966c023f65Toomas Soome		 */
967c023f65Toomas Soome		for (indirect_split_t *is = list_head(&iv->iv_splits);
968c023f65Toomas Soome		    is != NULL; is = list_next(&iv->iv_splits, is)) {
969c023f65Toomas Soome			char *ptr = zio.io_data;
970c023f65Toomas Soome
971c023f65Toomas Soome			rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
972c023f65Toomas Soome			    ptr + is->is_split_offset, is->is_target_offset,
973c023f65Toomas Soome			    is->is_size);
974c023f65Toomas Soome		}
975c023f65Toomas Soome		if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
976c023f65Toomas Soome			rc = ECKSUM;
977c023f65Toomas Soome		else
978c023f65Toomas Soome			rc = 0;
979c023f65Toomas Soome	}
980c023f65Toomas Soome
981c023f65Toomas Soome	vdev_indirect_map_free(&zio);
982c023f65Toomas Soome	if (rc == 0)
983c023f65Toomas Soome		rc = zio.io_error;
984c023f65Toomas Soome
985c023f65Toomas Soome	return (rc);
986c023f65Toomas Soome}
987c023f65Toomas Soome
988199767fToomas Soomestatic int
989199767fToomas Soomevdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
990199767fToomas Soome    off_t offset, size_t bytes)
991199767fToomas Soome{
992199767fToomas Soome
993199767fToomas Soome	return (vdev_read_phys(vdev, bp, buf,
9946fd7fa3Toomas Soome	    offset + VDEV_LABEL_START_SIZE, bytes));
995199767fToomas Soome}
996199767fToomas Soome
997199767fToomas Soome
998199767fToomas Soomestatic int
999199767fToomas Soomevdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
1000199767fToomas Soome    off_t offset, size_t bytes)
1001199767fToomas Soome{
1002199767fToomas Soome	vdev_t *kid;
1003199767fToomas Soome	int rc;
1004199767fToomas Soome
1005199767fToomas Soome	rc = EIO;
1006199767fToomas Soome	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1007199767fToomas Soome		if (kid->v_state != VDEV_STATE_HEALTHY)
1008199767fToomas Soome			continue;
1009199767fToomas Soome		rc = kid->v_read(kid, bp, buf, offset, bytes);
1010199767fToomas Soome		if (!rc)
1011199767fToomas Soome			return (0);
1012199767fToomas Soome	}
1013199767fToomas Soome
1014199767fToomas Soome	return (rc);
1015199767fToomas Soome}
1016199767fToomas Soome
1017199767fToomas Soomestatic int
1018199767fToomas Soomevdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
1019199767fToomas Soome    off_t offset, size_t bytes)
1020199767fToomas Soome{
1021199767fToomas Soome	vdev_t *kid;
1022199767fToomas Soome
1023199767fToomas Soome	/*
1024199767fToomas Soome	 * Here we should have two kids:
1025199767fToomas Soome	 * First one which is the one we are replacing and we can trust
1026199767fToomas Soome	 * only this one to have valid data, but it might not be present.
1027199767fToomas Soome	 * Second one is that one we are replacing with. It is most likely
1028199767fToomas Soome	 * healthy, but we can't trust it has needed data, so we won't use it.
1029199767fToomas Soome	 */
1030199767fToomas Soome	kid = STAILQ_FIRST(&vdev->v_children);
1031199767fToomas Soome	if (kid == NULL)
1032199767fToomas Soome		return (EIO);
1033199767fToomas Soome	if (kid->v_state != VDEV_STATE_HEALTHY)
1034199767fToomas Soome		return (EIO);
1035199767fToomas Soome	return (kid->v_read(kid, bp, buf, offset, bytes));
1036199767fToomas Soome}
1037199767fToomas Soome
1038199767fToomas Soomestatic vdev_t *
1039199767fToomas Soomevdev_find(uint64_t guid)
1040199767fToomas Soome{
1041199767fToomas Soome	vdev_t *vdev;
1042199767fToomas Soome
1043199767fToomas Soome	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
1044199767fToomas Soome		if (vdev->v_guid == guid)
1045199767fToomas Soome			return (vdev);
1046199767fToomas Soome
1047199767fToomas Soome	return (0);
1048199767fToomas Soome}
1049199767fToomas Soome
1050199767fToomas Soomestatic vdev_t *
1051199767fToomas Soomevdev_create(uint64_t guid, vdev_read_t *vdev_read)
1052199767fToomas Soome{
1053199767fToomas Soome	vdev_t *vdev;
1054c023f65Toomas Soome	vdev_indirect_config_t *vic;
1055199767fToomas Soome
1056da9bf00Toomas Soome	vdev = calloc(1, sizeof (vdev_t));
1057da9bf00Toomas Soome	if (vdev != NULL) {
1058da9bf00Toomas Soome		STAILQ_INIT(&vdev->v_children);
1059da9bf00Toomas Soome		vdev->v_guid = guid;
1060da9bf00Toomas Soome		vdev->v_read = vdev_read;
1061c023f65Toomas Soome
1062da9bf00Toomas Soome		/*
1063da9bf00Toomas Soome		 * root vdev has no read function, we use this fact to
1064da9bf00Toomas Soome		 * skip setting up data we do not need for root vdev.
1065da9bf00Toomas Soome		 * We only point root vdev from spa.
1066da9bf00Toomas Soome		 */
1067da9bf00Toomas Soome		if (vdev_read != NULL) {
1068da9bf00Toomas Soome			vic = &vdev->vdev_indirect_config;
1069da9bf00Toomas Soome			vic->vic_prev_indirect_vdev = UINT64_MAX;
1070da9bf00Toomas Soome			STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
1071da9bf00Toomas Soome		}
1072da9bf00Toomas Soome	}
1073199767fToomas Soome
1074199767fToomas Soome	return (vdev);
1075199767fToomas Soome}
1076199767fToomas Soome
1077da9bf00Toomas Soomestatic void
1078da9bf00Toomas Soomevdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist)
1079199767fToomas Soome{
1080199767fToomas Soome	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
108167806cdToomas Soome	uint64_t is_log;
1082199767fToomas Soome
1083da9bf00Toomas Soome	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
1084da9bf00Toomas Soome	is_log = 0;
1085da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
1086da9bf00Toomas Soome	    &is_offline);
1087da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
1088da9bf00Toomas Soome	    &is_removed);
1089da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
1090da9bf00Toomas Soome	    &is_faulted);
1091da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
1092da9bf00Toomas Soome	    NULL, &is_degraded);
1093da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
1094da9bf00Toomas Soome	    NULL, &isnt_present);
1095da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
1096da9bf00Toomas Soome	    &is_log);
1097da9bf00Toomas Soome
1098da9bf00Toomas Soome	if (is_offline != 0)
1099da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_OFFLINE;
1100da9bf00Toomas Soome	else if (is_removed != 0)
1101da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_REMOVED;
1102da9bf00Toomas Soome	else if (is_faulted != 0)
1103da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_FAULTED;
1104da9bf00Toomas Soome	else if (is_degraded != 0)
1105da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_DEGRADED;
1106da9bf00Toomas Soome	else if (isnt_present != 0)
1107da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_CANT_OPEN;
1108da9bf00Toomas Soome
1109da9bf00Toomas Soome	vdev->v_islog = is_log != 0;
1110da9bf00Toomas Soome}
1111da9bf00Toomas Soome
1112da9bf00Toomas Soomestatic int
1113da9bf00Toomas Soomevdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp)
1114da9bf00Toomas Soome{
1115da9bf00Toomas Soome	uint64_t id, ashift, asize, nparity;
1116da9bf00Toomas Soome	const char *path;
1117da9bf00Toomas Soome	const char *type;
1118da9bf00Toomas Soome	vdev_t *vdev;
1119da9bf00Toomas Soome
1120da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) ||
1121edb3504Toomas Soome	    nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
1122edb3504Toomas Soome	    NULL, &type)) {
1123199767fToomas Soome		return (ENOENT);
1124199767fToomas Soome	}
1125199767fToomas Soome
11266fd7fa3Toomas Soome	if (strcmp(type, VDEV_TYPE_MIRROR) != 0 &&
11276fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_DISK) != 0 &&
1128199767fToomas Soome#ifdef ZFS_TEST
11296fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_FILE) != 0 &&
1130199767fToomas Soome#endif
11316fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_RAIDZ) != 0 &&
11326fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_INDIRECT) != 0 &&
11336fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_REPLACING) != 0) {
11346fd7fa3Toomas Soome		printf("ZFS: can only boot from disk, mirror, raidz1, "
11356fd7fa3Toomas Soome		    "raidz2 and raidz3 vdevs\n");
1136199767fToomas Soome		return (EIO);
1137199767fToomas Soome	}
1138199767fToomas Soome
1139da9bf00Toomas Soome	if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
1140da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_mirror_read);
1141da9bf00Toomas Soome	else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
1142da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_raidz_read);
1143da9bf00Toomas Soome	else if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
1144da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_replacing_read);
1145da9bf00Toomas Soome	else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
1146da9bf00Toomas Soome		vdev_indirect_config_t *vic;
1147199767fToomas Soome
1148da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_indirect_read);
1149da9bf00Toomas Soome		if (vdev != NULL) {
1150c023f65Toomas Soome			vdev->v_state = VDEV_STATE_HEALTHY;
1151c023f65Toomas Soome			vic = &vdev->vdev_indirect_config;
1152c023f65Toomas Soome
1153c023f65Toomas Soome			nvlist_find(nvlist,
1154da9bf00Toomas Soome			    ZPOOL_CONFIG_INDIRECT_OBJECT,
1155da9bf00Toomas Soome			    DATA_TYPE_UINT64,
1156c023f65Toomas Soome			    NULL, &vic->vic_mapping_object);
1157c023f65Toomas Soome			nvlist_find(nvlist,
1158da9bf00Toomas Soome			    ZPOOL_CONFIG_INDIRECT_BIRTHS,
1159da9bf00Toomas Soome			    DATA_TYPE_UINT64,
1160c023f65Toomas Soome			    NULL, &vic->vic_births_object);
1161c023f65Toomas Soome			nvlist_find(nvlist,
1162da9bf00Toomas Soome			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
1163da9bf00Toomas Soome			    DATA_TYPE_UINT64,
1164c023f65Toomas Soome			    NULL, &vic->vic_prev_indirect_vdev);
11654c2b14fToomas Soome		}
1166da9bf00Toomas Soome	} else {
1167da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_disk_read);
1168da9bf00Toomas Soome	}
1169da9bf00Toomas Soome
1170da9bf00Toomas Soome	if (vdev == NULL)
1171da9bf00Toomas Soome		return (ENOMEM);
1172da9bf00Toomas Soome
1173da9bf00Toomas Soome	vdev_set_initial_state(vdev, nvlist);
1174da9bf00Toomas Soome	vdev->v_id = id;
1175da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
1176da9bf00Toomas Soome	    DATA_TYPE_UINT64, NULL, &ashift) == 0)
1177da9bf00Toomas Soome		vdev->v_ashift = ashift;
1178da9bf00Toomas Soome
1179da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
1180da9bf00Toomas Soome	    DATA_TYPE_UINT64, NULL, &asize) == 0) {
1181da9bf00Toomas Soome		vdev->v_psize = asize +
1182da9bf00Toomas Soome		    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
1183da9bf00Toomas Soome	}
1184da9bf00Toomas Soome
1185da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
1186da9bf00Toomas Soome	    DATA_TYPE_UINT64, NULL, &nparity) == 0)
1187da9bf00Toomas Soome		vdev->v_nparity = nparity;
1188da9bf00Toomas Soome
1189da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
1190da9bf00Toomas Soome	    DATA_TYPE_STRING, NULL, &path) == 0) {
1191da9bf00Toomas Soome		if (strncmp(path, "/dev/dsk/", 9) == 0)
1192da9bf00Toomas Soome			path += 9;
1193da9bf00Toomas Soome		vdev->v_name = strdup(path);
1194da9bf00Toomas Soome		if (nvlist_find(nvlist, ZPOOL_CONFIG_PHYS_PATH,
1195da9bf00Toomas Soome		    DATA_TYPE_STRING, NULL, &path) == 0) {
1196da9bf00Toomas Soome			vdev->v_phys_path = strdup(path);
1197edb3504Toomas Soome		} else {
1198da9bf00Toomas Soome			vdev->v_phys_path = NULL;
1199edb3504Toomas Soome		}
1200da9bf00Toomas Soome		if (nvlist_find(nvlist, ZPOOL_CONFIG_DEVID,
1201edb3504Toomas Soome		    DATA_TYPE_STRING, NULL, &path) == 0) {
1202da9bf00Toomas Soome			vdev->v_devid = strdup(path);
1203199767fToomas Soome		} else {
1204da9bf00Toomas Soome			vdev->v_devid = NULL;
1205199767fToomas Soome		}
1206199767fToomas Soome	} else {
1207da9bf00Toomas Soome		char *name;
1208da9bf00Toomas Soome
1209da9bf00Toomas Soome		name = NULL;
1210da9bf00Toomas Soome		if (strcmp(type, "raidz") == 0) {
1211da9bf00Toomas Soome			if (vdev->v_nparity < 1 ||
1212da9bf00Toomas Soome			    vdev->v_nparity > 3) {
1213da9bf00Toomas Soome				printf("ZFS: invalid raidz parity: %d\n",
1214da9bf00Toomas Soome				    vdev->v_nparity);
1215da9bf00Toomas Soome				return (EIO);
1216da9bf00Toomas Soome			}
1217da9bf00Toomas Soome			(void) asprintf(&name, "%s%d-%" PRIu64, type,
1218da9bf00Toomas Soome			    vdev->v_nparity, id);
1219da9bf00Toomas Soome		} else {
1220da9bf00Toomas Soome			(void) asprintf(&name, "%s-%" PRIu64, type, id);
1221da9bf00Toomas Soome		}
1222da9bf00Toomas Soome		vdev->v_name = name;
1223da9bf00Toomas Soome	}
1224da9bf00Toomas Soome	*vdevp = vdev;
1225da9bf00Toomas Soome	return (0);
1226da9bf00Toomas Soome}
1227da9bf00Toomas Soome
1228da9bf00Toomas Soome/*
1229da9bf00Toomas Soome * Find slot for vdev. We return either NULL to signal to use
1230da9bf00Toomas Soome * STAILQ_INSERT_HEAD, or we return link element to be used with
1231da9bf00Toomas Soome * STAILQ_INSERT_AFTER.
1232da9bf00Toomas Soome */
1233da9bf00Toomas Soomestatic vdev_t *
1234da9bf00Toomas Soomevdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
1235da9bf00Toomas Soome{
1236da9bf00Toomas Soome	vdev_t *v, *previous;
1237da9bf00Toomas Soome
1238da9bf00Toomas Soome	if (STAILQ_EMPTY(&top_vdev->v_children))
1239da9bf00Toomas Soome		return (NULL);
1240da9bf00Toomas Soome
1241da9bf00Toomas Soome	previous = NULL;
1242da9bf00Toomas Soome	STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
1243da9bf00Toomas Soome		if (v->v_id > vdev->v_id)
1244da9bf00Toomas Soome			return (previous);
1245da9bf00Toomas Soome
1246da9bf00Toomas Soome		if (v->v_id == vdev->v_id)
1247da9bf00Toomas Soome			return (v);
1248da9bf00Toomas Soome
1249da9bf00Toomas Soome		if (v->v_id < vdev->v_id)
1250da9bf00Toomas Soome			previous = v;
1251199767fToomas Soome	}
1252da9bf00Toomas Soome	return (previous);
1253da9bf00Toomas Soome}
1254da9bf00Toomas Soome
1255da9bf00Toomas Soomestatic size_t
1256da9bf00Toomas Soomevdev_child_count(vdev_t *vdev)
1257da9bf00Toomas Soome{
1258da9bf00Toomas Soome	vdev_t *v;
1259da9bf00Toomas Soome	size_t count;
1260da9bf00Toomas Soome
1261da9bf00Toomas Soome	count = 0;
1262da9bf00Toomas Soome	STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
1263da9bf00Toomas Soome		count++;
1264da9bf00Toomas Soome	}
1265da9bf00Toomas Soome	return (count);
1266da9bf00Toomas Soome}
1267da9bf00Toomas Soome
1268da9bf00Toomas Soome/*
1269da9bf00Toomas Soome * Insert vdev into top_vdev children list. List is ordered by v_id.
1270da9bf00Toomas Soome */
1271da9bf00Toomas Soomestatic void
1272da9bf00Toomas Soomevdev_insert(vdev_t *top_vdev, vdev_t *vdev)
1273da9bf00Toomas Soome{
1274da9bf00Toomas Soome	vdev_t *previous;
1275da9bf00Toomas Soome	size_t count;
1276da9bf00Toomas Soome
1277da9bf00Toomas Soome	/*
1278da9bf00Toomas Soome	 * The top level vdev can appear in random order, depending how
1279da9bf00Toomas Soome	 * the firmware is presenting the disk devices.
1280da9bf00Toomas Soome	 * However, we will insert vdev to create list ordered by v_id,
1281da9bf00Toomas Soome	 * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
1282da9bf00Toomas Soome	 * as STAILQ does not have insert before.
1283da9bf00Toomas Soome	 */
1284da9bf00Toomas Soome	previous = vdev_find_previous(top_vdev, vdev);
1285199767fToomas Soome
1286da9bf00Toomas Soome	if (previous == NULL) {
1287da9bf00Toomas Soome		STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
1288da9bf00Toomas Soome	} else if (previous->v_id == vdev->v_id) {
1289199767fToomas Soome		/*
1290da9bf00Toomas Soome		 * This vdev was configured from label config,
1291da9bf00Toomas Soome		 * do not insert duplicate.
1292199767fToomas Soome		 */
1293da9bf00Toomas Soome		return;
1294da9bf00Toomas Soome	} else {
1295da9bf00Toomas Soome		STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
1296da9bf00Toomas Soome		    v_childlink);
1297da9bf00Toomas Soome	}
1298da9bf00Toomas Soome
1299da9bf00Toomas Soome	count = vdev_child_count(top_vdev);
1300da9bf00Toomas Soome	if (top_vdev->v_nchildren < count)
1301da9bf00Toomas Soome		top_vdev->v_nchildren = count;
1302da9bf00Toomas Soome}
1303da9bf00Toomas Soome
1304da9bf00Toomas Soomestatic int
1305da9bf00Toomas Soomevdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist)
1306da9bf00Toomas Soome{
1307da9bf00Toomas Soome	vdev_t *top_vdev, *vdev;
1308da9bf00Toomas Soome	const unsigned char *kids;
1309da9bf00Toomas Soome	int rc, nkids;
1310da9bf00Toomas Soome
1311da9bf00Toomas Soome	/* Get top vdev. */
1312da9bf00Toomas Soome	top_vdev = vdev_find(top_guid);
1313da9bf00Toomas Soome	if (top_vdev == NULL) {
1314da9bf00Toomas Soome		rc = vdev_init(top_guid, nvlist, &top_vdev);
1315da9bf00Toomas Soome		if (rc != 0)
1316da9bf00Toomas Soome			return (rc);
1317da9bf00Toomas Soome		top_vdev->v_spa = spa;
1318da9bf00Toomas Soome		top_vdev->v_top = top_vdev;
1319da9bf00Toomas Soome		vdev_insert(spa->spa_root_vdev, top_vdev);
1320199767fToomas Soome	}
1321199767fToomas Soome
1322da9bf00Toomas Soome	/* Add children if there are any. */
1323edb3504Toomas Soome	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1324edb3504Toomas Soome	    &nkids, &kids);
1325199767fToomas Soome	if (rc == 0) {
1326da9bf00Toomas Soome		for (int i = 0; i < nkids; i++) {
1327da9bf00Toomas Soome			uint64_t guid;
1328da9bf00Toomas Soome
1329da9bf00Toomas Soome			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
1330da9bf00Toomas Soome			    DATA_TYPE_UINT64, NULL, &guid);
1331da9bf00Toomas Soome			if (rc != 0)
1332da9bf00Toomas Soome				return (rc);
1333da9bf00Toomas Soome			rc = vdev_init(guid, kids, &vdev);
1334da9bf00Toomas Soome			if (rc != 0)
1335199767fToomas Soome				return (rc);
1336da9bf00Toomas Soome
1337da9bf00Toomas Soome			vdev->v_spa = spa;
1338da9bf00Toomas Soome			vdev->v_top = top_vdev;
1339da9bf00Toomas Soome			vdev_insert(top_vdev, vdev);
1340da9bf00Toomas Soome
1341199767fToomas Soome			kids = nvlist_next(kids);
1342199767fToomas Soome		}
1343199767fToomas Soome	} else {
1344da9bf00Toomas Soome		/*
1345da9bf00Toomas Soome		 * When there are no children, nvlist_find() does return
1346da9bf00Toomas Soome		 * error, reset it because leaf devices have no children.
1347da9bf00Toomas Soome		 */
1348da9bf00Toomas Soome		rc = 0;
1349199767fToomas Soome	}
1350199767fToomas Soome
1351da9bf00Toomas Soome	return (rc);
1352da9bf00Toomas Soome}
1353da9bf00Toomas Soome
1354da9bf00Toomas Soomestatic int
1355da9bf00Toomas Soomevdev_init_from_label(spa_t *spa, const unsigned char *nvlist)
1356da9bf00Toomas Soome{
1357da9bf00Toomas Soome	uint64_t pool_guid, top_guid;
1358da9bf00Toomas Soome	const unsigned char *vdevs;
1359da9bf00Toomas Soome
1360da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1361da9bf00Toomas Soome	    NULL, &pool_guid) ||
1362da9bf00Toomas Soome	    nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
1363da9bf00Toomas Soome	    NULL, &top_guid) ||
1364da9bf00Toomas Soome	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1365da9bf00Toomas Soome	    NULL, &vdevs)) {
1366da9bf00Toomas Soome		printf("ZFS: can't find vdev details\n");
1367da9bf00Toomas Soome		return (ENOENT);
1368da9bf00Toomas Soome	}
1369da9bf00Toomas Soome
1370da9bf00Toomas Soome	return (vdev_from_nvlist(spa, top_guid, vdevs));
1371199767fToomas Soome}
1372199767fToomas Soome
1373199767fToomas Soomestatic void
1374199767fToomas Soomevdev_set_state(vdev_t *vdev)
1375199767fToomas Soome{
1376199767fToomas Soome	vdev_t *kid;
1377199767fToomas Soome	int good_kids;
1378199767fToomas Soome	int bad_kids;
1379199767fToomas Soome
1380da9bf00Toomas Soome	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1381da9bf00Toomas Soome		vdev_set_state(kid);
1382da9bf00Toomas Soome	}
1383da9bf00Toomas Soome
1384199767fToomas Soome	/*
1385199767fToomas Soome	 * A mirror or raidz is healthy if all its kids are healthy. A
1386199767fToomas Soome	 * mirror is degraded if any of its kids is healthy; a raidz
1387199767fToomas Soome	 * is degraded if at most nparity kids are offline.
1388199767fToomas Soome	 */
1389199767fToomas Soome	if (STAILQ_FIRST(&vdev->v_children)) {
1390199767fToomas Soome		good_kids = 0;
1391199767fToomas Soome		bad_kids = 0;
1392199767fToomas Soome		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1393199767fToomas Soome			if (kid->v_state == VDEV_STATE_HEALTHY)
1394199767fToomas Soome				good_kids++;
1395199767fToomas Soome			else
1396199767fToomas Soome				bad_kids++;
1397199767fToomas Soome		}
1398199767fToomas Soome		if (bad_kids == 0) {
1399199767fToomas Soome			vdev->v_state = VDEV_STATE_HEALTHY;
1400199767fToomas Soome		} else {
1401199767fToomas Soome			if (vdev->v_read == vdev_mirror_read) {
1402199767fToomas Soome				if (good_kids) {
1403199767fToomas Soome					vdev->v_state = VDEV_STATE_DEGRADED;
1404199767fToomas Soome				} else {
1405199767fToomas Soome					vdev->v_state = VDEV_STATE_OFFLINE;
1406199767fToomas Soome				}
1407199767fToomas Soome			} else if (vdev->v_read == vdev_raidz_read) {
1408199767fToomas Soome				if (bad_kids > vdev->v_nparity) {
1409199767fToomas Soome					vdev->v_state = VDEV_STATE_OFFLINE;
1410199767fToomas Soome				} else {
1411199767fToomas Soome					vdev->v_state = VDEV_STATE_DEGRADED;
1412199767fToomas Soome				}
1413199767fToomas Soome			}
1414199767fToomas Soome		}
1415199767fToomas Soome	}
1416199767fToomas Soome}
1417199767fToomas Soome
1418da9bf00Toomas Soomestatic int
1419da9bf00Toomas Soomevdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist)
1420da9bf00Toomas Soome{
1421da9bf00Toomas Soome	vdev_t *vdev;
1422da9bf00Toomas Soome	const unsigned char *kids;
1423da9bf00Toomas Soome	int rc, nkids;
1424da9bf00Toomas Soome
1425da9bf00Toomas Soome	/* Update top vdev. */
1426da9bf00Toomas Soome	vdev = vdev_find(top_guid);
1427da9bf00Toomas Soome	if (vdev != NULL)
1428da9bf00Toomas Soome		vdev_set_initial_state(vdev, nvlist);
1429da9bf00Toomas Soome
1430da9bf00Toomas Soome	/* Update children if there are any. */
1431da9bf00Toomas Soome	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1432da9bf00Toomas Soome	    &nkids, &kids);
1433da9bf00Toomas Soome	if (rc == 0) {
1434da9bf00Toomas Soome		for (int i = 0; i < nkids; i++) {
1435da9bf00Toomas Soome			uint64_t guid;
1436da9bf00Toomas Soome
1437da9bf00Toomas Soome			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
1438da9bf00Toomas Soome			    DATA_TYPE_UINT64, NULL, &guid);
1439da9bf00Toomas Soome			if (rc != 0)
1440da9bf00Toomas Soome				break;
1441da9bf00Toomas Soome
1442da9bf00Toomas Soome			vdev = vdev_find(guid);
1443da9bf00Toomas Soome			if (vdev != NULL)
1444da9bf00Toomas Soome				vdev_set_initial_state(vdev, kids);
1445da9bf00Toomas Soome
1446da9bf00Toomas Soome			kids = nvlist_next(kids);
1447da9bf00Toomas Soome		}
1448da9bf00Toomas Soome	} else {
1449da9bf00Toomas Soome		rc = 0;