1c023f65Toomas Soome/*
2199767fToomas Soome * Copyright (c) 2007 Doug Rabson
3199767fToomas Soome * All rights reserved.
4199767fToomas Soome *
5199767fToomas Soome * Redistribution and use in source and binary forms, with or without
6199767fToomas Soome * modification, are permitted provided that the following conditions
7199767fToomas Soome * are met:
8199767fToomas Soome * 1. Redistributions of source code must retain the above copyright
9199767fToomas Soome *    notice, this list of conditions and the following disclaimer.
10199767fToomas Soome * 2. Redistributions in binary form must reproduce the above copyright
11199767fToomas Soome *    notice, this list of conditions and the following disclaimer in the
12199767fToomas Soome *    documentation and/or other materials provided with the distribution.
13199767fToomas Soome *
24199767fToomas Soome * SUCH DAMAGE.
25199767fToomas Soome */
26199767fToomas Soome
27199767fToomas Soome#include <sys/cdefs.h>
28199767fToomas Soome
29199767fToomas Soome/*
30199767fToomas Soome *	Stand-alone ZFS file reader.
31199767fToomas Soome */
32199767fToomas Soome
3313a6e30Toomas Soome#include <sys/endian.h>
34199767fToomas Soome#include <sys/stat.h>
35199767fToomas Soome#include <sys/stdint.h>
36c023f65Toomas Soome#include <sys/list.h>
37c023f65Toomas Soome#include <inttypes.h>
38199767fToomas Soome
39199767fToomas Soome#include "zfsimpl.h"
40199767fToomas Soome#include "zfssubr.c"
41199767fToomas Soome
42199767fToomas Soome
/*
 * State kept for one mounted dataset: a pointer to the pool it belongs to,
 * a by-value copy of an objset_phys_t, and a 64-bit object number.
 * NOTE(review): rootobj is presumably the object number of the dataset's
 * root directory -- confirm against the mount code.
 */
struct zfsmount {
	const spa_t	*spa;		/* pool; never modified through this */
	objset_phys_t	objset;		/* embedded copy, not a pointer */
	uint64_t	rootobj;
};
48199767fToomas Soome
/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 *
 * Instances are embedded in indirect_split_t as the is_child[] array.
 */
typedef struct indirect_child {
	/* NOTE(review): presumably the buffer read from ic_vdev -- confirm */
	void *ic_data;
	vdev_t *ic_vdev;	/* vdev this copy is read from */
} indirect_child_t;
60c023f65Toomas Soome
/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size; /* size of this segment */
	int is_children; /* number of entries in is_child[] */

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	int is_good_child;

	/*
	 * Declared with one element but used as a variable-length trailing
	 * array; is_children gives the real element count.
	 * NOTE(review): whoever allocates this structure must size it for
	 * is_children entries -- confirm at the allocation site.
	 */
	indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;
89c023f65Toomas Soome
/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	/*
	 * NOTE(review): presumably set when the i/o maps to more than one
	 * indirect_split_t -- confirm against the code that fills it in.
	 */
	boolean_t iv_split_block;
	/*
	 * NOTE(review): presumably set when reconstruction from child copies
	 * is being attempted -- confirm against the code that fills it in.
	 */
	boolean_t iv_reconstruct;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;
100c023f65Toomas Soome
/*
 * List of all vdevs, chained through v_alllink.
 * Set to the empty list by zfs_init() with STAILQ_INIT().
 */
static vdev_list_t zfs_vdevs;
105199767fToomas Soome
/*
 * List of ZFS features supported for read.
 *
 * The array is terminated by a NULL entry; consumers iterate until they
 * reach it.  Both the strings and the array of pointers are constant.
 */
static const char * const features_for_read[] = {
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	"org.illumos:skein",
	"org.illumos:edonr",
	"org.zfsonlinux:large_dnode",
	"com.joyent:multi_vdev_crash_dump",
	"com.delphix:spacemap_histogram",
	"com.delphix:zpool_checkpoint",
	"com.delphix:spacemap_v2",
	"com.datto:encryption",
	"com.datto:bookmark_v2",
	"org.zfsonlinux:allocation_classes",
	"com.datto:resilver_defer",
	"com.delphix:device_removal",
	"com.delphix:obsolete_counts",
	NULL
};
131199767fToomas Soome
/*
 * List of all pools, chained through spa_link.
 */
static spa_list_t zfs_pools;

/*
 * Working buffers set up by zfs_init(): dnode_cache_buf and zap_scratch are
 * SPA_MAXBLOCKSIZE bytes each, and zfs_temp_buf is a TEMP_SIZE-byte arena.
 * zfs_temp_ptr is the arena's current allocation point and zfs_temp_end its
 * end; zfs_alloc() advances zfs_temp_ptr and zfs_free() moves it back.
 * NOTE(review): dnode_cache_obj and dnode_cache_bn presumably identify the
 * block currently held in dnode_cache_buf -- confirm against their users.
 */
static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;
static char *zap_scratch;
static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;

/* Size in bytes of the arena at zfs_temp_buf. */
#define	TEMP_SIZE	(1024 * 1024)
144199767fToomas Soome
/*
 * Forward declarations of file-local functions, so that they can be called
 * ahead of their definitions.
 */
static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
    dnode_phys_t *);
static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
    size_t);
static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
159199767fToomas Soome
160199767fToomas Soomestatic void
161199767fToomas Soomezfs_init(void)
162199767fToomas Soome{
163199767fToomas Soome	STAILQ_INIT(&zfs_vdevs);
164199767fToomas Soome	STAILQ_INIT(&zfs_pools);
165199767fToomas Soome
166199767fToomas Soome	zfs_temp_buf = malloc(TEMP_SIZE);
167199767fToomas Soome	zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
168199767fToomas Soome	zfs_temp_ptr = zfs_temp_buf;
169199767fToomas Soome	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
170199767fToomas Soome	zap_scratch = malloc(SPA_MAXBLOCKSIZE);
171199767fToomas Soome
172199767fToomas Soome	zfs_init_crc();
173199767fToomas Soome}
174199767fToomas Soome
175199767fToomas Soomestatic void *
176199767fToomas Soomezfs_alloc(size_t size)
177199767fToomas Soome{
178199767fToomas Soome	char *ptr;
179199767fToomas Soome
180199767fToomas Soome	if (zfs_temp_ptr + size > zfs_temp_end) {
181b5b5d63Toomas Soome		panic("ZFS: out of temporary buffer space");
182199767fToomas Soome	}
183199767fToomas Soome	ptr = zfs_temp_ptr;
184199767fToomas Soome	zfs_temp_ptr += size;
185199767fToomas Soome
186199767fToomas Soome	return (ptr);
187199767fToomas Soome}
188199767fToomas Soome
189199767fToomas Soomestatic void
190199767fToomas Soomezfs_free(void *ptr, size_t size)
191199767fToomas Soome{
192199767fToomas Soome
193199767fToomas Soome	zfs_temp_ptr -= size;
194199767fToomas Soome	if (zfs_temp_ptr != ptr) {
195b5b5d63Toomas Soome		panic("ZFS: zfs_alloc()/zfs_free() mismatch");
196199767fToomas Soome	}
197199767fToomas Soome}
198199767fToomas Soome
/*
 * Decode one big-endian 32-bit value at *xdr into *ip and advance *xdr
 * past it.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *src = *xdr;

	*ip = be32dec(src);
	*xdr = src + 4;
	return (0);
}
206199767fToomas Soome
207199767fToomas Soomestatic int
2086fd7fa3Toomas Soomexdr_u_int(const unsigned char **xdr, uint_t *ip)
209199767fToomas Soome{
21013a6e30Toomas Soome	*ip = be32dec(*xdr);
211199767fToomas Soome	(*xdr) += 4;
212199767fToomas Soome	return (0);
213199767fToomas Soome}
214199767fToomas Soome
215199767fToomas Soomestatic int
216199767fToomas Soomexdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
217199767fToomas Soome{
2186fd7fa3Toomas Soome	uint_t hi, lo;
219199767fToomas Soome
220199767fToomas Soome	xdr_u_int(xdr, &hi);
221199767fToomas Soome	xdr_u_int(xdr, &lo);
2226fd7fa3Toomas Soome	*lp = (((uint64_t)hi) << 32) | lo;
223199767fToomas Soome	return (0);
224199767fToomas Soome}
225199767fToomas Soome
226199767fToomas Soomestatic int
227199767fToomas Soomenvlist_find(const unsigned char *nvlist, const char *name, int type,
2286fd7fa3Toomas Soome    int *elementsp, void *valuep)
229199767fToomas Soome{
230199767fToomas Soome	const unsigned char *p, *pair;
231199767fToomas Soome	int junk;
232199767fToomas Soome	int encoded_size, decoded_size;
233199767fToomas Soome
234199767fToomas Soome	p = nvlist;
235199767fToomas Soome	xdr_int(&p, &junk);
236199767fToomas Soome	xdr_int(&p, &junk);
237199767fToomas Soome
238199767fToomas Soome	pair = p;
239199767fToomas Soome	xdr_int(&p, &encoded_size);
240199767fToomas Soome	xdr_int(&p, &decoded_size);
241199767fToomas Soome	while (encoded_size && decoded_size) {
242199767fToomas Soome		int namelen, pairtype, elements;
243199767fToomas Soome		const char *pairname;
244199767fToomas Soome
245199767fToomas Soome		xdr_int(&p, &namelen);
2466fd7fa3Toomas Soome		pairname = (const char *)p;
247199767fToomas Soome		p += roundup(namelen, 4);
248199767fToomas Soome		xdr_int(&p, &pairtype);
249199767fToomas Soome
2506fd7fa3Toomas Soome		if (memcmp(name, pairname, namelen) == 0 && type == pairtype) {
251199767fToomas Soome			xdr_int(&p, &elements);
252199767fToomas Soome			if (elementsp)
253199767fToomas Soome				*elementsp = elements;
254199767fToomas Soome			if (type == DATA_TYPE_UINT64) {
2556fd7fa3Toomas Soome				xdr_uint64_t(&p, (uint64_t *)valuep);
256199767fToomas Soome				return (0);
257199767fToomas Soome			} else if (type == DATA_TYPE_STRING) {
258199767fToomas Soome				int len;
259199767fToomas Soome				xdr_int(&p, &len);
2606fd7fa3Toomas Soome				(*(const char **)valuep) = (const char *)p;
261199767fToomas Soome				return (0);
2626fd7fa3Toomas Soome			} else if (type == DATA_TYPE_NVLIST ||
2636fd7fa3Toomas Soome			    type == DATA_TYPE_NVLIST_ARRAY) {
2646fd7fa3Toomas Soome				(*(const unsigned char **)valuep) =
2656fd7fa3Toomas Soome				    (const unsigned char *)p;
266199767fToomas Soome				return (0);
267199767fToomas Soome			} else {
268199767fToomas Soome				return (EIO);
269199767fToomas Soome			}
270199767fToomas Soome		} else {
271199767fToomas Soome			/*
2726fd7fa3Toomas Soome			 * Not the pair we are looking for, skip to the
2736fd7fa3Toomas Soome			 * next one.
274199767fToomas Soome			 */
275199767fToomas Soome			p = pair + encoded_size;
276199767fToomas Soome		}
277199767fToomas Soome
278199767fToomas Soome		pair = p;
279199767fToomas Soome		xdr_int(&p, &encoded_size);
280199767fToomas Soome		xdr_int(&p, &decoded_size);
281199767fToomas Soome	}
282199767fToomas Soome
283199767fToomas Soome	return (EIO);
284199767fToomas Soome}
285199767fToomas Soome
286199767fToomas Soomestatic int
287199767fToomas Soomenvlist_check_features_for_read(const unsigned char *nvlist)
288199767fToomas Soome{
289199767fToomas Soome	const unsigned char *p, *pair;
290199767fToomas Soome	int junk;
291199767fToomas Soome	int encoded_size, decoded_size;
292199767fToomas Soome	int rc;
293199767fToomas Soome
294199767fToomas Soome	rc = 0;
295199767fToomas Soome
296199767fToomas Soome	p = nvlist;
297199767fToomas Soome	xdr_int(&p, &junk);
298199767fToomas Soome	xdr_int(&p, &junk);
299199767fToomas Soome
300199767fToomas Soome	pair = p;
301199767fToomas Soome	xdr_int(&p, &encoded_size);
302199767fToomas Soome	xdr_int(&p, &decoded_size);
303199767fToomas Soome	while (encoded_size && decoded_size) {
304199767fToomas Soome		int namelen, pairtype;
305199767fToomas Soome		const char *pairname;
306199767fToomas Soome		int i, found;
307199767fToomas Soome
308199767fToomas Soome		found = 0;
309199767fToomas Soome
310199767fToomas Soome		xdr_int(&p, &namelen);
3116fd7fa3Toomas Soome		pairname = (const char *)p;
312199767fToomas Soome		p += roundup(namelen, 4);
313199767fToomas Soome		xdr_int(&p, &pairtype);
314199767fToomas Soome
315199767fToomas Soome		for (i = 0; features_for_read[i] != NULL; i++) {
3166fd7fa3Toomas Soome			if (memcmp(pairname, features_for_read[i],
3176fd7fa3Toomas Soome			    namelen) == 0) {
318199767fToomas Soome				found = 1;
319199767fToomas Soome				break;
320199767fToomas Soome			}
321199767fToomas Soome		}
322199767fToomas Soome
323199767fToomas Soome		if (!found) {
324199767fToomas Soome			printf("ZFS: unsupported feature: %s\n", pairname);
325199767fToomas Soome			rc = EIO;
326199767fToomas Soome		}
327199767fToomas Soome
328199767fToomas Soome		p = pair + encoded_size;
329199767fToomas Soome
330199767fToomas Soome		pair = p;
331199767fToomas Soome		xdr_int(&p, &encoded_size);
332199767fToomas Soome		xdr_int(&p, &decoded_size);
333199767fToomas Soome	}
334199767fToomas Soome
335199767fToomas Soome	return (rc);
336199767fToomas Soome}
337199767fToomas Soome
/*
 * Return the next nvlist in an nvlist array.
 *
 * The two leading 32-bit words of the list are skipped, then each pair is
 * stepped over using its encoded size until a pair header with a zero
 * encoded or decoded size is read.  The returned pointer is just past that
 * terminating header.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor = nvlist;
	int skipped;
	int enc_len, dec_len;

	xdr_int(&cursor, &skipped);
	xdr_int(&cursor, &skipped);

	for (;;) {
		const unsigned char *pair_start = cursor;

		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;

		cursor = pair_start + enc_len;
	}

	return (cursor);
}
365199767fToomas Soome
366199767fToomas Soome#ifdef TEST
367199767fToomas Soome
368199767fToomas Soomestatic const unsigned char *
369199767fToomas Soomenvlist_print(const unsigned char *nvlist, unsigned int indent)
370199767fToomas Soome{
3716fd7fa3Toomas Soome	static const char *typenames[] = {
372199767fToomas Soome		"DATA_TYPE_UNKNOWN",
373199767fToomas Soome		"DATA_TYPE_BOOLEAN",
374199767fToomas Soome		"DATA_TYPE_BYTE",
375199767fToomas Soome		"DATA_TYPE_INT16",
376199767fToomas Soome		"DATA_TYPE_UINT16",
377199767fToomas Soome		"DATA_TYPE_INT32",
378199767fToomas Soome		"DATA_TYPE_UINT32",
379199767fToomas Soome		"DATA_TYPE_INT64",
380199767fToomas Soome		"DATA_TYPE_UINT64",
381199767fToomas Soome		"DATA_TYPE_STRING",
382199767fToomas Soome		"DATA_TYPE_BYTE_ARRAY",
383199767fToomas Soome		"DATA_TYPE_INT16_ARRAY",
384199767fToomas Soome		"DATA_TYPE_UINT16_ARRAY",
385199767fToomas Soome		"DATA_TYPE_INT32_ARRAY",
386199767fToomas Soome		"DATA_TYPE_UINT32_ARRAY",
387199767fToomas Soome		"DATA_TYPE_INT64_ARRAY",
388199767fToomas Soome		"DATA_TYPE_UINT64_ARRAY",
389199767fToomas Soome		"DATA_TYPE_STRING_ARRAY",
390199767fToomas Soome		"DATA_TYPE_HRTIME",
391199767fToomas Soome		"DATA_TYPE_NVLIST",
392199767fToomas Soome		"DATA_TYPE_NVLIST_ARRAY",
393199767fToomas Soome		"DATA_TYPE_BOOLEAN_VALUE",
394199767fToomas Soome		"DATA_TYPE_INT8",
395199767fToomas Soome		"DATA_TYPE_UINT8",
396199767fToomas Soome		"DATA_TYPE_BOOLEAN_ARRAY",
397199767fToomas Soome		"DATA_TYPE_INT8_ARRAY",
398199767fToomas Soome		"DATA_TYPE_UINT8_ARRAY"
399199767fToomas Soome	};
400199767fToomas Soome
401199767fToomas Soome	unsigned int i, j;
402199767fToomas Soome	const unsigned char *p, *pair;
403199767fToomas Soome	int junk;
404199767fToomas Soome	int encoded_size, decoded_size;
405199767fToomas Soome
406199767fToomas Soome	p = nvlist;
407199767fToomas Soome	xdr_int(&p, &junk);
408199767fToomas Soome	xdr_int(&p, &junk);
409199767fToomas Soome
410199767fToomas Soome	pair = p;
411199767fToomas Soome	xdr_int(&p, &encoded_size);
412199767fToomas Soome	xdr_int(&p, &decoded_size);
413199767fToomas Soome	while (encoded_size && decoded_size) {
414199767fToomas Soome		int namelen, pairtype, elements;
415199767fToomas Soome		const char *pairname;
416199767fToomas Soome
417199767fToomas Soome		xdr_int(&p, &namelen);
4186fd7fa3Toomas Soome		pairname = (const char *)p;
419199767fToomas Soome		p += roundup(namelen, 4);
420199767fToomas Soome		xdr_int(&p, &pairtype);
421199767fToomas Soome
422199767fToomas Soome		for (i = 0; i < indent; i++)
423199767fToomas Soome			printf(" ");
424199767fToomas Soome		printf("%s %s", typenames[pairtype], pairname);
425199767fToomas Soome
426199767fToomas Soome		xdr_int(&p, &elements);
427199767fToomas Soome		switch (pairtype) {
428199767fToomas Soome		case DATA_TYPE_UINT64: {
429199767fToomas Soome			uint64_t val;
430199767fToomas Soome			xdr_uint64_t(&p, &val);
431199767fToomas Soome			printf(" = 0x%jx\n", (uintmax_t)val);
432199767fToomas Soome			break;
433199767fToomas Soome		}
434199767fToomas Soome
435199767fToomas Soome		case DATA_TYPE_STRING: {
436199767fToomas Soome			int len;
437199767fToomas Soome			xdr_int(&p, &len);
438199767fToomas Soome			printf(" = \"%s\"\n", p);
439199767fToomas Soome			break;
440199767fToomas Soome		}
441199767fToomas Soome
442199767fToomas Soome		case DATA_TYPE_NVLIST:
443199767fToomas Soome			printf("\n");
444199767fToomas Soome			nvlist_print(p, indent + 1);
445199767fToomas Soome			break;
446199767fToomas Soome
447199767fToomas Soome		case DATA_TYPE_NVLIST_ARRAY:
448199767fToomas Soome			for (j = 0; j < elements; j++) {
449199767fToomas Soome				printf("[%d]\n", j);
450199767fToomas Soome				p = nvlist_print(p, indent + 1);
451199767fToomas Soome				if (j != elements - 1) {
452199767fToomas Soome					for (i = 0; i < indent; i++)
453199767fToomas Soome						printf(" ");
4546fd7fa3Toomas Soome					printf("%s %s", typenames[pairtype],
4556fd7fa3Toomas Soome					    pairname);
456199767fToomas Soome				}
457199767fToomas Soome			}
458199767fToomas Soome			break;
459199767fToomas Soome
460199767fToomas Soome		default:
461199767fToomas Soome			printf("\n");
462199767fToomas Soome		}
463199767fToomas Soome
464199767fToomas Soome		p = pair + encoded_size;
465199767fToomas Soome
466199767fToomas Soome		pair = p;
467199767fToomas Soome		xdr_int(&p, &encoded_size);
468199767fToomas Soome		xdr_int(&p, &decoded_size);
469199767fToomas Soome	}
470199767fToomas Soome
4716fd7fa3Toomas Soome	return (p);
472199767fToomas Soome}
473199767fToomas Soome
474199767fToomas Soome#endif
475199767fToomas Soome
476199767fToomas Soomestatic int
477199767fToomas Soomevdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
478199767fToomas Soome    off_t offset, size_t size)
479199767fToomas Soome{
480199767fToomas Soome	size_t psize;
481199767fToomas Soome	int rc;
482199767fToomas Soome
483199767fToomas Soome	if (!vdev->v_phys_read)
484199767fToomas Soome		return (EIO);
485199767fToomas Soome
486199767fToomas Soome	if (bp) {
487199767fToomas Soome		psize = BP_GET_PSIZE(bp);
488199767fToomas Soome	} else {
489199767fToomas Soome		psize = size;
490199767fToomas Soome	}
491199767fToomas Soome
492199767fToomas Soome	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
493da9bf00Toomas Soome	if (rc == 0) {
494da9bf00Toomas Soome		if (bp != NULL)
495da9bf00Toomas Soome			rc = zio_checksum_verify(vdev->v_spa, bp, buf);
496da9bf00Toomas Soome	}
497199767fToomas Soome
498da9bf00Toomas Soome	return (rc);
499199767fToomas Soome}
500199767fToomas Soome
/*
 * One segment of a remap operation.  rs_alloc() fills in every field except
 * rs_node.
 * NOTE(review): field meanings below are taken from the names and from the
 * parallel fields of indirect_split_t -- confirm against the remap code.
 */
typedef struct remap_segment {
	vdev_t *rs_vd;			/* vdev the segment is on */
	uint64_t rs_offset;		/* offset on rs_vd */
	uint64_t rs_asize;		/* size of the segment */
	uint64_t rs_split_offset;	/* offset into the original i/o */
	list_node_t rs_node;		/* list linkage */
} remap_segment_t;
508c023f65Toomas Soome
509c023f65Toomas Soomestatic remap_segment_t *
510c023f65Toomas Soomers_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
511c023f65Toomas Soome{
512c023f65Toomas Soome	remap_segment_t *rs = malloc(sizeof (remap_segment_t));
513c023f65Toomas Soome
514c023f65Toomas Soome	if (rs != NULL) {
515c023f65Toomas Soome		rs->rs_vd = vd;
516c023f65Toomas Soome		rs->rs_offset = offset;
517c023f65Toomas Soome		rs->rs_asize = asize;
518c023f65Toomas Soome		rs->rs_split_offset = split_offset;
519c023f65Toomas Soome	}
520c023f65Toomas Soome
521c023f65Toomas Soome	return (rs);
522c023f65Toomas Soome}
523c023f65Toomas Soome
524c023f65Toomas Soomevdev_indirect_mapping_t *
525c023f65Toomas Soomevdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
526c023f65Toomas Soome    uint64_t mapping_object)
527c023f65Toomas Soome{
528c023f65Toomas Soome	vdev_indirect_mapping_t *vim;
529c023f65Toomas Soome	vdev_indirect_mapping_phys_t *vim_phys;
530c023f65Toomas Soome	int rc;
531c023f65Toomas Soome
532c023f65Toomas Soome	vim = calloc(1, sizeof (*vim));
533c023f65Toomas Soome	if (vim == NULL)
534c023f65Toomas Soome		return (NULL);
535c023f65Toomas Soome
536c023f65Toomas Soome	vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
537c023f65Toomas Soome	if (vim->vim_dn == NULL) {
538c023f65Toomas Soome		free(vim);
539c023f65Toomas Soome		return (NULL);
540c023f65Toomas Soome	}
541c023f65Toomas Soome
542c023f65Toomas Soome	rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
543c023f65Toomas Soome	if (rc != 0) {
544c023f65Toomas Soome		free(vim->vim_dn);
545c023f65Toomas Soome		free(vim);
546c023f65Toomas Soome		return (NULL);
547c023f65Toomas Soome	}
548c023f65Toomas Soome
549c023f65Toomas Soome	vim->vim_spa = spa;
550c023f65Toomas Soome	vim->vim_phys = malloc(sizeof (*vim->vim_phys));
551c023f65Toomas Soome	if (vim->vim_phys == NULL) {
552c023f65Toomas Soome		free(vim->vim_dn);
553c023f65Toomas Soome		free(vim);
554c023f65Toomas Soome		return (NULL);
555c023f65Toomas Soome	}
556c023f65Toomas Soome
557c023f65Toomas Soome	vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
558c023f65Toomas Soome	*vim->vim_phys = *vim_phys;
559c023f65Toomas Soome
560c023f65Toomas Soome	vim->vim_objset = os;
561c023f65Toomas Soome	vim->vim_object = mapping_object;
562c023f65Toomas Soome	vim->vim_entries = NULL;
563c023f65Toomas Soome
564c023f65Toomas Soome	vim->vim_havecounts =
565c023f65Toomas Soome	    (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);
566c023f65Toomas Soome
567c023f65Toomas Soome	return (vim);
568c023f65Toomas Soome}
569c023f65Toomas Soome
570c023f65Toomas Soome/*
571c023f65Toomas Soome * Compare an offset with an indirect mapping entry; there are three
572c023f65Toomas Soome * possible scenarios:
573c023f65Toomas Soome *
574c023f65Toomas Soome *     1. The offset is "less than" the mapping entry; meaning the
575c023f65Toomas Soome *        offset is less than the source offset of the mapping entry. In
576c023f65Toomas Soome *        this case, there is no overlap between the offset and the
577c023f65Toomas Soome *        mapping entry and -1 will be returned.
578c023f65Toomas Soome *
579c023f65Toomas Soome *     2. The offset is "greater than" the mapping entry; meaning the
580c023f65Toomas Soome *        offset is greater than the mapping entry's source offset plus
581c023f65Toomas Soome *        the entry's size. In this case, there is no overlap between
582c023f65Toomas Soome *        the offset and the mapping entry and 1 will be returned.
583c023f65Toomas Soome *
584c023f65Toomas Soome *        NOTE: If the offset is actually equal to the entry's offset
585c023f65Toomas Soome *        plus size, this is considered to be "greater" than the entry,
586c023f65Toomas Soome *        and this case applies (i.e. 1 will be returned). Thus, the
587c023f65Toomas Soome *        entry's "range" can be considered to be inclusive at its
588c023f65Toomas Soome *        start, but exclusive at its end: e.g. [src, src + size).
589c023f65Toomas Soome *
590c023f65Toomas Soome *     3. The last case to consider is if the offset actually falls
591c023f65Toomas Soome *        within the mapping entry's range. If this is the case, the
592c023f65Toomas Soome *        offset is considered to be "equal to" the mapping entry and
593c023f65Toomas Soome *        0 will be returned.
594c023f65Toomas Soome *
595c023f65Toomas Soome *        NOTE: If the offset is equal to the entry's source offset,
596c023f65Toomas Soome *        this case applies and 0 will be returned. If the offset is
597c023f65Toomas Soome *        equal to the entry's source plus its size, this case does
598c023f65Toomas Soome *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
599c023f65Toomas Soome *        returned.
600c023f65Toomas Soome */
601c023f65Toomas Soomestatic int
602c023f65Toomas Soomedva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
603c023f65Toomas Soome{
604c023f65Toomas Soome	const uint64_t *key = v_key;
605c023f65Toomas Soome	const vdev_indirect_mapping_entry_phys_t *array_elem =
606c023f65Toomas Soome	    v_array_elem;
607c023f65Toomas Soome	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
608c023f65Toomas Soome
609c023f65Toomas Soome	if (*key < src_offset) {
610c023f65Toomas Soome		return (-1);
611c023f65Toomas Soome	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
612c023f65Toomas Soome		return (0);
613c023f65Toomas Soome	} else {
614c023f65Toomas Soome		return (1);
615c023f65Toomas Soome	}
616c023f65Toomas Soome}
617c023f65Toomas Soome
618c023f65Toomas Soome/*
619c023f65Toomas Soome * Return array entry.
620c023f65Toomas Soome */
621c023f65Toomas Soomestatic vdev_indirect_mapping_entry_phys_t *
622c023f65Toomas Soomevdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
623c023f65Toomas Soome{
624c023f65Toomas Soome	uint64_t size;
625c023f65Toomas Soome	off_t offset = 0;
626c023f65Toomas Soome	int rc;
627c023f65Toomas Soome
628c023f65Toomas Soome	if (vim->vim_phys->vimp_num_entries == 0)
629c023f65Toomas Soome		return (NULL);
630c023f65Toomas Soome
631c023f65Toomas Soome	if (vim->vim_entries == NULL) {
632c023f65Toomas Soome		uint64_t bsize;
633c023f65Toomas Soome
634c023f65Toomas Soome		bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
635c023f65Toomas Soome		size = vim->vim_phys->vimp_num_entries *
636c023f65Toomas Soome		    sizeof (*vim->vim_entries);
637c023f65Toomas Soome		if (size > bsize) {
638c023f65Toomas Soome			size = bsize / sizeof (*vim->vim_entries);
639c023f65Toomas Soome			size *= sizeof (*vim->vim_entries);
640c023f65Toomas Soome		}
641c023f65Toomas Soome		vim->vim_entries = malloc(size);
642c023f65Toomas Soome		if (vim->vim_entries == NULL)
643c023f65Toomas Soome			return (NULL);
644c023f65Toomas Soome		vim->vim_num_entries = size / sizeof (*vim->vim_entries);
645c023f65Toomas Soome		offset = index * sizeof (*vim->vim_entries);
646c023f65Toomas Soome	}
647c023f65Toomas Soome
648c023f65Toomas Soome	/* We have data in vim_entries */
649c023f65Toomas Soome	if (offset == 0) {
650c023f65Toomas Soome		if (index >= vim->vim_entry_offset &&
651c023f65Toomas Soome		    index <= vim->vim_entry_offset + vim->vim_num_entries) {
652c023f65Toomas Soome			index -= vim->vim_entry_offset;
653c023f65Toomas Soome			return (&vim->vim_entries[index]);
654c023f65Toomas Soome		}
655c023f65Toomas Soome		offset = index * sizeof (*vim->vim_entries);
656c023f65Toomas Soome	}
657c023f65Toomas Soome
658c023f65Toomas Soome	vim->vim_entry_offset = index;
659c023f65Toomas Soome	size = vim->vim_num_entries * sizeof (*vim->vim_entries);
660c023f65Toomas Soome	rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
661c023f65Toomas Soome	    size);
662c023f65Toomas Soome	if (rc != 0) {
663c023f65Toomas Soome		/* Read error, invalidate vim_entries. */
664c023f65Toomas Soome		free(vim->vim_entries);
665c023f65Toomas Soome		vim->vim_entries = NULL;
666c023f65Toomas Soome		return (NULL);
667c023f65Toomas Soome	}
668c023f65Toomas Soome	index -= vim->vim_entry_offset;
669c023f65Toomas Soome	return (&vim->vim_entries[index]);
670c023f65Toomas Soome}
671c023f65Toomas Soome
672c023f65Toomas Soome/*
673c023f65Toomas Soome * Returns the mapping entry for the given offset.
674c023f65Toomas Soome *
675c023f65Toomas Soome * It's possible that the given offset will not be in the mapping table
676c023f65Toomas Soome * (i.e. no mapping entries contain this offset), in which case, the
677c023f65Toomas Soome * return value value depends on the "next_if_missing" parameter.
678c023f65Toomas Soome *
679c023f65Toomas Soome * If the offset is not found in the table and "next_if_missing" is
680c023f65Toomas Soome * B_FALSE, then NULL will always be returned. The behavior is intended
681c023f65Toomas Soome * to allow consumers to get the entry corresponding to the offset
682c023f65Toomas Soome * parameter, iff the offset overlaps with an entry in the table.
683c023f65Toomas Soome *
684c023f65Toomas Soome * If the offset is not found in the table and "next_if_missing" is
685c023f65Toomas Soome * B_TRUE, then the entry nearest to the given offset will be returned,
686c023f65Toomas Soome * such that the entry's source offset is greater than the offset
687c023f65Toomas Soome * passed in (i.e. the "next" mapping entry in the table is returned, if
688c023f65Toomas Soome * the offset is missing from the table). If there are no entries whose
689c023f65Toomas Soome * source offset is greater than the passed in offset, NULL is returned.
690c023f65Toomas Soome */
691c023f65Toomas Soomestatic vdev_indirect_mapping_entry_phys_t *
692c023f65Toomas Soomevdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
693c023f65Toomas Soome    uint64_t offset)
694c023f65Toomas Soome{
695c023f65Toomas Soome	ASSERT(vim->vim_phys->vimp_num_entries > 0);
696c023f65Toomas Soome
697c023f65Toomas Soome	vdev_indirect_mapping_entry_phys_t *entry;
698c023f65Toomas Soome
699c023f65Toomas Soome	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
700c023f65Toomas Soome	uint64_t base = 0;
701c023f65Toomas Soome
702c023f65Toomas Soome	/*
703c023f65Toomas Soome	 * We don't define these inside of the while loop because we use
704c023f65Toomas Soome	 * their value in the case that offset isn't in the mapping.
705c023f65Toomas Soome	 */
706c023f65Toomas Soome	uint64_t mid;
707c023f65Toomas Soome	int result;
708c023f65Toomas Soome
709c023f65Toomas Soome	while (last >= base) {
710c023f65Toomas Soome		mid = base + ((last - base) >> 1);
711c023f65Toomas Soome
712c023f65Toomas Soome		entry = vdev_indirect_mapping_entry(vim, mid);
713c023f65Toomas Soome		if (entry == NULL)
714c023f65Toomas Soome			break;
715c023f65Toomas Soome		result = dva_mapping_overlap_compare(&offset, entry);
716c023f65Toomas Soome
717c023f65Toomas Soome		if (result == 0) {
718c023f65Toomas Soome			break;
719c023f65Toomas Soome		} else if (result < 0) {
720c023f65Toomas Soome			last = mid - 1;
721c023f65Toomas Soome		} else {
722c023f65Toomas Soome			base = mid + 1;
723c023f65Toomas Soome		}
724c023f65Toomas Soome	}
725c023f65Toomas Soome	return (entry);
726c023f65Toomas Soome}
727c023f65Toomas Soome
728c023f65Toomas Soome/*
729c023f65Toomas Soome * Given an indirect vdev and an extent on that vdev, it duplicates the
730c023f65Toomas Soome * physical entries of the indirect mapping that correspond to the extent
731c023f65Toomas Soome * to a new array and returns a pointer to it. In addition, copied_entries
732c023f65Toomas Soome * is populated with the number of mapping entries that were duplicated.
733c023f65Toomas Soome *
734c023f65Toomas Soome * Finally, since we are doing an allocation, it is up to the caller to
735c023f65Toomas Soome * free the array allocated in this function.
736c023f65Toomas Soome */
737c023f65Toomas Soomevdev_indirect_mapping_entry_phys_t *
738c023f65Toomas Soomevdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
739c023f65Toomas Soome    uint64_t asize, uint64_t *copied_entries)
740c023f65Toomas Soome{
741c023f65Toomas Soome	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
742c023f65Toomas Soome	vdev_indirect_mapping_t *vim = vd->v_mapping;
743c023f65Toomas Soome	uint64_t entries = 0;
744c023f65Toomas Soome
745c023f65Toomas Soome	vdev_indirect_mapping_entry_phys_t *first_mapping =
746c023f65Toomas Soome	    vdev_indirect_mapping_entry_for_offset(vim, offset);
747c023f65Toomas Soome	ASSERT3P(first_mapping, !=, NULL);
748c023f65Toomas Soome
749c023f65Toomas Soome	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
750c023f65Toomas Soome	while (asize > 0) {
751c023f65Toomas Soome		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
752c023f65Toomas Soome		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
753c023f65Toomas Soome		uint64_t inner_size = MIN(asize, size - inner_offset);
754c023f65Toomas Soome
755c023f65Toomas Soome		offset += inner_size;
756c023f65Toomas Soome		asize -= inner_size;
757c023f65Toomas Soome		entries++;
758c023f65Toomas Soome		m++;
759c023f65Toomas Soome	}
760c023f65Toomas Soome
761c023f65Toomas Soome	size_t copy_length = entries * sizeof (*first_mapping);
762c023f65Toomas Soome	duplicate_mappings = malloc(copy_length);
763c023f65Toomas Soome	if (duplicate_mappings != NULL)
764c023f65Toomas Soome		bcopy(first_mapping, duplicate_mappings, copy_length);
765c023f65Toomas Soome	else
766c023f65Toomas Soome		entries = 0;
767c023f65Toomas Soome
768c023f65Toomas Soome	*copied_entries = entries;
769c023f65Toomas Soome
770c023f65Toomas Soome	return (duplicate_mappings);
771c023f65Toomas Soome}
772c023f65Toomas Soome
773c023f65Toomas Soomestatic vdev_t *
774c023f65Toomas Soomevdev_lookup_top(spa_t *spa, uint64_t vdev)
775c023f65Toomas Soome{
776c023f65Toomas Soome	vdev_t *rvd;
777da9bf00Toomas Soome	vdev_list_t *vlist;
778c023f65Toomas Soome
779da9bf00Toomas Soome	vlist = &spa->spa_root_vdev->v_children;
780da9bf00Toomas Soome	STAILQ_FOREACH(rvd, vlist, v_childlink)
781c023f65Toomas Soome		if (rvd->v_id == vdev)
782c023f65Toomas Soome			break;
783c023f65Toomas Soome
784c023f65Toomas Soome	return (rvd);
785c023f65Toomas Soome}
786c023f65Toomas Soome
787c023f65Toomas Soome/*
788c023f65Toomas Soome * This is a callback for vdev_indirect_remap() which allocates an
789c023f65Toomas Soome * indirect_split_t for each split segment and adds it to iv_splits.
790c023f65Toomas Soome */
791c023f65Toomas Soomestatic void
792c023f65Toomas Soomevdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
793c023f65Toomas Soome    uint64_t size, void *arg)
794c023f65Toomas Soome{
795c023f65Toomas Soome	int n = 1;
796c023f65Toomas Soome	zio_t *zio = arg;
797c023f65Toomas Soome	indirect_vsd_t *iv = zio->io_vsd;
798c023f65Toomas Soome
799c023f65Toomas Soome	if (vd->v_read == vdev_indirect_read)
800c023f65Toomas Soome		return;
801c023f65Toomas Soome
802c023f65Toomas Soome	if (vd->v_read == vdev_mirror_read)
803c023f65Toomas Soome		n = vd->v_nchildren;
804c023f65Toomas Soome
805c023f65Toomas Soome	indirect_split_t *is =
806c023f65Toomas Soome	    malloc(offsetof(indirect_split_t, is_child[n]));
807c023f65Toomas Soome	if (is == NULL) {
808c023f65Toomas Soome		zio->io_error = ENOMEM;
809c023f65Toomas Soome		return;
810c023f65Toomas Soome	}
811c023f65Toomas Soome	bzero(is, offsetof(indirect_split_t, is_child[n]));
812c023f65Toomas Soome
813c023f65Toomas Soome	is->is_children = n;
814c023f65Toomas Soome	is->is_size = size;
815c023f65Toomas Soome	is->is_split_offset = split_offset;
816c023f65Toomas Soome	is->is_target_offset = offset;
817c023f65Toomas Soome	is->is_vdev = vd;
818c023f65Toomas Soome
819c023f65Toomas Soome	/*
820c023f65Toomas Soome	 * Note that we only consider multiple copies of the data for
821c023f65Toomas Soome	 * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
822c023f65Toomas Soome	 * though they use the same ops as mirror, because there's only one
823c023f65Toomas Soome	 * "good" copy under the replacing/spare.
824c023f65Toomas Soome	 */
825c023f65Toomas Soome	if (vd->v_read == vdev_mirror_read) {
826c023f65Toomas Soome		int i = 0;
827c023f65Toomas Soome		vdev_t *kid;
828c023f65Toomas Soome
829c023f65Toomas Soome		STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
830c023f65Toomas Soome			is->is_child[i++].ic_vdev = kid;
831c023f65Toomas Soome		}
832c023f65Toomas Soome	} else {
833c023f65Toomas Soome		is->is_child[0].ic_vdev = vd;
834c023f65Toomas Soome	}
835c023f65Toomas Soome
836c023f65Toomas Soome	list_insert_tail(&iv->iv_splits, is);
837c023f65Toomas Soome}
838c023f65Toomas Soome
839c023f65Toomas Soomestatic void
840c023f65Toomas Soomevdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
841c023f65Toomas Soome{
842c023f65Toomas Soome	list_t stack;
843da9bf00Toomas Soome	spa_t *spa = vd->v_spa;
844c023f65Toomas Soome	zio_t *zio = arg;
845042b560Toomas Soome	remap_segment_t *rs;
846c023f65Toomas Soome
847c023f65Toomas Soome	list_create(&stack, sizeof (remap_segment_t),
848c023f65Toomas Soome	    offsetof(remap_segment_t, rs_node));
849c023f65Toomas Soome
850042b560Toomas Soome	rs = rs_alloc(vd, offset, asize, 0);
851042b560Toomas Soome	if (rs == NULL) {
852042b560Toomas Soome		printf("vdev_indirect_remap: out of memory.\n");
853042b560Toomas Soome		zio->io_error = ENOMEM;
854042b560Toomas Soome	}
8556fd7fa3Toomas Soome	for (; rs != NULL; rs = list_remove_head(&stack)) {
856c023f65Toomas Soome		vdev_t *v = rs->rs_vd;
857c023f65Toomas Soome		uint64_t num_entries = 0;
858c023f65Toomas Soome		/* vdev_indirect_mapping_t *vim = v->v_mapping; */
859c023f65Toomas Soome		vdev_indirect_mapping_entry_phys_t *mapping =
860c023f65Toomas Soome		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
861c023f65Toomas Soome		    rs->rs_offset, rs->rs_asize, &num_entries);
862c023f65Toomas Soome
863042b560Toomas Soome		if (num_entries == 0)
864042b560Toomas Soome			zio->io_error = ENOMEM;
865042b560Toomas Soome
866c023f65Toomas Soome		for (uint64_t i = 0; i < num_entries; i++) {
867c023f65Toomas Soome			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
868c023f65Toomas Soome			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
869c023f65Toomas Soome			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
870c023f65Toomas Soome			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
871c023f65Toomas Soome			uint64_t inner_offset = rs->rs_offset -
872c023f65Toomas Soome			    DVA_MAPPING_GET_SRC_OFFSET(m);
873c023f65Toomas Soome			uint64_t inner_size =
874c023f65Toomas Soome			    MIN(rs->rs_asize, size - inner_offset);
875c023f65Toomas Soome			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
876c023f65Toomas Soome
877c023f65Toomas Soome			if (dst_v->v_read == vdev_indirect_read) {
878042b560Toomas Soome				remap_segment_t *o;
879042b560Toomas Soome
880042b560Toomas Soome				o = rs_alloc(dst_v, dst_offset + inner_offset,
881042b560Toomas Soome				    inner_size, rs->rs_split_offset);
882042b560Toomas Soome				if (o == NULL) {
883042b560Toomas Soome					printf("vdev_indirect_remap: "
884042b560Toomas Soome					    "out of memory.\n");
885042b560Toomas Soome					zio->io_error = ENOMEM;
886042b560Toomas Soome					break;
887042b560Toomas Soome				}
888042b560Toomas Soome
889042b560Toomas Soome				list_insert_head(&stack, o);
890c023f65Toomas Soome			}
891c023f65Toomas Soome			vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
892c023f65Toomas Soome			    dst_offset + inner_offset,
893c023f65Toomas Soome			    inner_size, arg);
894c023f65Toomas Soome
895c023f65Toomas Soome			/*
896c023f65Toomas Soome			 * vdev_indirect_gather_splits can have memory
897c023f65Toomas Soome			 * allocation error, we can not recover from it.
898c023f65Toomas Soome			 */
899c023f65Toomas Soome			if (zio->io_error != 0)
900c023f65Toomas Soome				break;
901c023f65Toomas Soome			rs->rs_offset += inner_size;
902c023f65Toomas Soome			rs->rs_asize -= inner_size;
903c023f65Toomas Soome			rs->rs_split_offset += inner_size;
904c023f65Toomas Soome		}
905c023f65Toomas Soome
906c023f65Toomas Soome		free(mapping);
907c023f65Toomas Soome		free(rs);
908c023f65Toomas Soome		if (zio->io_error != 0)
909c023f65Toomas Soome			break;
910c023f65Toomas Soome	}
911c023f65Toomas Soome
912c023f65Toomas Soome	list_destroy(&stack);
913c023f65Toomas Soome}
914c023f65Toomas Soome
915c023f65Toomas Soomestatic void
916c023f65Toomas Soomevdev_indirect_map_free(zio_t *zio)
917c023f65Toomas Soome{
918c023f65Toomas Soome	indirect_vsd_t *iv = zio->io_vsd;
919c023f65Toomas Soome	indirect_split_t *is;
920c023f65Toomas Soome
921c023f65Toomas Soome	while ((is = list_head(&iv->iv_splits)) != NULL) {
922c023f65Toomas Soome		for (int c = 0; c < is->is_children; c++) {
923c023f65Toomas Soome			indirect_child_t *ic = &is->is_child[c];
924c023f65Toomas Soome			free(ic->ic_data);
925c023f65Toomas Soome		}
926c023f65Toomas Soome		list_remove(&iv->iv_splits, is);
927c023f65Toomas Soome		free(is);
928c023f65Toomas Soome	}
929c023f65Toomas Soome	free(iv);
930c023f65Toomas Soome}
931c023f65Toomas Soome
932c023f65Toomas Soomestatic int
933c023f65Toomas Soomevdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
934c023f65Toomas Soome    off_t offset, size_t bytes)
935c023f65Toomas Soome{
936da9bf00Toomas Soome	zio_t zio;
937da9bf00Toomas Soome	spa_t *spa = vdev->v_spa;
938da9bf00Toomas Soome	indirect_vsd_t *iv;
939c023f65Toomas Soome	indirect_split_t *first;
940c023f65Toomas Soome	int rc = EIO;
941c023f65Toomas Soome
942da9bf00Toomas Soome	iv = calloc(1, sizeof (*iv));
943c023f65Toomas Soome	if (iv == NULL)
944c023f65Toomas Soome		return (ENOMEM);
945c023f65Toomas Soome
946c023f65Toomas Soome	list_create(&iv->iv_splits,
947c023f65Toomas Soome	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
948c023f65Toomas Soome
949da9bf00Toomas Soome	bzero(&zio, sizeof (zio));
950c023f65Toomas Soome	zio.io_spa = spa;
951c023f65Toomas Soome	zio.io_bp = (blkptr_t *)bp;
952c023f65Toomas Soome	zio.io_data = buf;
953c023f65Toomas Soome	zio.io_size = bytes;
954c023f65Toomas Soome	zio.io_offset = offset;
955c023f65Toomas Soome	zio.io_vd = vdev;
956c023f65Toomas Soome	zio.io_vsd = iv;
957c023f65Toomas Soome
958c023f65Toomas Soome	if (vdev->v_mapping == NULL) {
959c023f65Toomas Soome		vdev_indirect_config_t *vic;
960c023f65Toomas Soome
961c023f65Toomas Soome		vic = &vdev->vdev_indirect_config;
962c023f65Toomas Soome		vdev->v_mapping = vdev_indirect_mapping_open(spa,
963c023f65Toomas Soome		    &spa->spa_mos, vic->vic_mapping_object);
964c023f65Toomas Soome	}
965c023f65Toomas Soome
966c023f65Toomas Soome	vdev_indirect_remap(vdev, offset, bytes, &zio);
967c023f65Toomas Soome	if (zio.io_error != 0)
968c023f65Toomas Soome		return (zio.io_error);
969c023f65Toomas Soome
970c023f65Toomas Soome	first = list_head(&iv->iv_splits);
971c023f65Toomas Soome	if (first->is_size == zio.io_size) {
972c023f65Toomas Soome		/*
973c023f65Toomas Soome		 * This is not a split block; we are pointing to the entire
974c023f65Toomas Soome		 * data, which will checksum the same as the original data.
975c023f65Toomas Soome		 * Pass the BP down so that the child i/o can verify the
976c023f65Toomas Soome		 * checksum, and try a different location if available
977c023f65Toomas Soome		 * (e.g. on a mirror).
978c023f65Toomas Soome		 *
979c023f65Toomas Soome		 * While this special case could be handled the same as the
980c023f65Toomas Soome		 * general (split block) case, doing it this way ensures
981c023f65Toomas Soome		 * that the vast majority of blocks on indirect vdevs
982c023f65Toomas Soome		 * (which are not split) are handled identically to blocks
983c023f65Toomas Soome		 * on non-indirect vdevs.  This allows us to be less strict
984c023f65Toomas Soome		 * about performance in the general (but rare) case.
985c023f65Toomas Soome		 */
986c023f65Toomas Soome		rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
987c023f65Toomas Soome		    zio.io_data, first->is_target_offset, bytes);
988c023f65Toomas Soome	} else {
989c023f65Toomas Soome		iv->iv_split_block = B_TRUE;
990c023f65Toomas Soome		/*
991c023f65Toomas Soome		 * Read one copy of each split segment, from the
992c023f65Toomas Soome		 * top-level vdev.  Since we don't know the
993c023f65Toomas Soome		 * checksum of each split individually, the child
994c023f65Toomas Soome		 * zio can't ensure that we get the right data.
995c023f65Toomas Soome		 * E.g. if it's a mirror, it will just read from a
996c023f65Toomas Soome		 * random (healthy) leaf vdev.  We have to verify
997c023f65Toomas Soome		 * the checksum in vdev_indirect_io_done().
998c023f65Toomas Soome		 */
999c023f65Toomas Soome		for (indirect_split_t *is = list_head(&iv->iv_splits);
1000c023f65Toomas Soome		    is != NULL; is = list_next(&iv->iv_splits, is)) {
1001c023f65Toomas Soome			char *ptr = zio.io_data;
1002c023f65Toomas Soome
1003c023f65Toomas Soome			rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
1004c023f65Toomas Soome			    ptr + is->is_split_offset, is->is_target_offset,
1005c023f65Toomas Soome			    is->is_size);
1006c023f65Toomas Soome		}
1007c023f65Toomas Soome		if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
1008c023f65Toomas Soome			rc = ECKSUM;
1009c023f65Toomas Soome		else
1010c023f65Toomas Soome			rc = 0;
1011c023f65Toomas Soome	}
1012c023f65Toomas Soome
1013c023f65Toomas Soome	vdev_indirect_map_free(&zio);
1014c023f65Toomas Soome	if (rc == 0)
1015c023f65Toomas Soome		rc = zio.io_error;
1016c023f65Toomas Soome
1017c023f65Toomas Soome	return (rc);
1018c023f65Toomas Soome}
1019c023f65Toomas Soome
1020199767fToomas Soomestatic int
1021199767fToomas Soomevdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
1022199767fToomas Soome    off_t offset, size_t bytes)
1023199767fToomas Soome{
1024199767fToomas Soome
1025199767fToomas Soome	return (vdev_read_phys(vdev, bp, buf,
10266fd7fa3Toomas Soome	    offset + VDEV_LABEL_START_SIZE, bytes));
1027199767fToomas Soome}
1028199767fToomas Soome
1029199767fToomas Soome
1030199767fToomas Soomestatic int
1031199767fToomas Soomevdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
1032199767fToomas Soome    off_t offset, size_t bytes)
1033199767fToomas Soome{
1034199767fToomas Soome	vdev_t *kid;
1035199767fToomas Soome	int rc;
1036199767fToomas Soome
1037199767fToomas Soome	rc = EIO;
1038199767fToomas Soome	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1039199767fToomas Soome		if (kid->v_state != VDEV_STATE_HEALTHY)
1040199767fToomas Soome			continue;
1041199767fToomas Soome		rc = kid->v_read(kid, bp, buf, offset, bytes);
1042199767fToomas Soome		if (!rc)
1043199767fToomas Soome			return (0);
1044199767fToomas Soome	}
1045199767fToomas Soome
1046199767fToomas Soome	return (rc);
1047199767fToomas Soome}
1048199767fToomas Soome
1049199767fToomas Soomestatic int
1050199767fToomas Soomevdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
1051199767fToomas Soome    off_t offset, size_t bytes)
1052199767fToomas Soome{
1053199767fToomas Soome	vdev_t *kid;
1054199767fToomas Soome
1055199767fToomas Soome	/*
1056199767fToomas Soome	 * Here we should have two kids:
1057199767fToomas Soome	 * First one which is the one we are replacing and we can trust
1058199767fToomas Soome	 * only this one to have valid data, but it might not be present.
1059199767fToomas Soome	 * Second one is that one we are replacing with. It is most likely
1060199767fToomas Soome	 * healthy, but we can't trust it has needed data, so we won't use it.
1061199767fToomas Soome	 */
1062199767fToomas Soome	kid = STAILQ_FIRST(&vdev->v_children);
1063199767fToomas Soome	if (kid == NULL)
1064199767fToomas Soome		return (EIO);
1065199767fToomas Soome	if (kid->v_state != VDEV_STATE_HEALTHY)
1066199767fToomas Soome		return (EIO);
1067199767fToomas Soome	return (kid->v_read(kid, bp, buf, offset, bytes));
1068199767fToomas Soome}
1069199767fToomas Soome
1070199767fToomas Soomestatic vdev_t *
1071199767fToomas Soomevdev_find(uint64_t guid)
1072199767fToomas Soome{
1073199767fToomas Soome	vdev_t *vdev;
1074199767fToomas Soome
1075199767fToomas Soome	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
1076199767fToomas Soome		if (vdev->v_guid == guid)
1077199767fToomas Soome			return (vdev);
1078199767fToomas Soome
1079199767fToomas Soome	return (0);
1080199767fToomas Soome}
1081199767fToomas Soome
1082199767fToomas Soomestatic vdev_t *
1083199767fToomas Soomevdev_create(uint64_t guid, vdev_read_t *vdev_read)
1084199767fToomas Soome{
1085199767fToomas Soome	vdev_t *vdev;
1086c023f65Toomas Soome	vdev_indirect_config_t *vic;
1087199767fToomas Soome
1088da9bf00Toomas Soome	vdev = calloc(1, sizeof (vdev_t));
1089da9bf00Toomas Soome	if (vdev != NULL) {
1090da9bf00Toomas Soome		STAILQ_INIT(&vdev->v_children);
1091da9bf00Toomas Soome		vdev->v_guid = guid;
1092da9bf00Toomas Soome		vdev->v_read = vdev_read;
1093c023f65Toomas Soome
1094da9bf00Toomas Soome		/*
1095da9bf00Toomas Soome		 * root vdev has no read function, we use this fact to
1096da9bf00Toomas Soome		 * skip setting up data we do not need for root vdev.
1097da9bf00Toomas Soome		 * We only point root vdev from spa.
1098da9bf00Toomas Soome		 */
1099da9bf00Toomas Soome		if (vdev_read != NULL) {
1100da9bf00Toomas Soome			vic = &vdev->vdev_indirect_config;
1101da9bf00Toomas Soome			vic->vic_prev_indirect_vdev = UINT64_MAX;
1102da9bf00Toomas Soome			STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
1103da9bf00Toomas Soome		}
1104da9bf00Toomas Soome	}
1105199767fToomas Soome
1106199767fToomas Soome	return (vdev);
1107199767fToomas Soome}
1108199767fToomas Soome
1109da9bf00Toomas Soomestatic void
1110da9bf00Toomas Soomevdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist)
1111199767fToomas Soome{
1112199767fToomas Soome	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
111367806cdToomas Soome	uint64_t is_log;
1114199767fToomas Soome
1115da9bf00Toomas Soome	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
1116da9bf00Toomas Soome	is_log = 0;
1117da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
1118da9bf00Toomas Soome	    &is_offline);
1119da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
1120da9bf00Toomas Soome	    &is_removed);
1121da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
1122da9bf00Toomas Soome	    &is_faulted);
1123da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
1124da9bf00Toomas Soome	    NULL, &is_degraded);
1125da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
1126da9bf00Toomas Soome	    NULL, &isnt_present);
1127da9bf00Toomas Soome	(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
1128da9bf00Toomas Soome	    &is_log);
1129da9bf00Toomas Soome
1130da9bf00Toomas Soome	if (is_offline != 0)
1131da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_OFFLINE;
1132da9bf00Toomas Soome	else if (is_removed != 0)
1133da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_REMOVED;
1134da9bf00Toomas Soome	else if (is_faulted != 0)
1135da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_FAULTED;
1136da9bf00Toomas Soome	else if (is_degraded != 0)
1137da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_DEGRADED;
1138da9bf00Toomas Soome	else if (isnt_present != 0)
1139da9bf00Toomas Soome		vdev->v_state = VDEV_STATE_CANT_OPEN;
1140da9bf00Toomas Soome
1141da9bf00Toomas Soome	vdev->v_islog = is_log != 0;
1142da9bf00Toomas Soome}
1143da9bf00Toomas Soome
1144da9bf00Toomas Soomestatic int
1145da9bf00Toomas Soomevdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp)
1146da9bf00Toomas Soome{
1147da9bf00Toomas Soome	uint64_t id, ashift, asize, nparity;
1148da9bf00Toomas Soome	const char *path;
1149da9bf00Toomas Soome	const char *type;
1150da9bf00Toomas Soome	vdev_t *vdev;
1151da9bf00Toomas Soome
1152da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) ||
1153edb3504Toomas Soome	    nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
1154edb3504Toomas Soome	    NULL, &type)) {
1155199767fToomas Soome		return (ENOENT);
1156199767fToomas Soome	}
1157199767fToomas Soome
11586fd7fa3Toomas Soome	if (strcmp(type, VDEV_TYPE_MIRROR) != 0 &&
11596fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_DISK) != 0 &&
1160199767fToomas Soome#ifdef ZFS_TEST
11616fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_FILE) != 0 &&
1162199767fToomas Soome#endif
11636fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_RAIDZ) != 0 &&
11646fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_INDIRECT) != 0 &&
11656fd7fa3Toomas Soome	    strcmp(type, VDEV_TYPE_REPLACING) != 0) {
11666fd7fa3Toomas Soome		printf("ZFS: can only boot from disk, mirror, raidz1, "
11676fd7fa3Toomas Soome		    "raidz2 and raidz3 vdevs\n");
1168199767fToomas Soome		return (EIO);
1169199767fToomas Soome	}
1170199767fToomas Soome
1171da9bf00Toomas Soome	if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
1172da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_mirror_read);
1173da9bf00Toomas Soome	else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
1174da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_raidz_read);
1175da9bf00Toomas Soome	else if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
1176da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_replacing_read);
1177da9bf00Toomas Soome	else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
1178da9bf00Toomas Soome		vdev_indirect_config_t *vic;
1179199767fToomas Soome
1180da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_indirect_read);
1181da9bf00Toomas Soome		if (vdev != NULL) {
1182c023f65Toomas Soome			vdev->v_state = VDEV_STATE_HEALTHY;
1183c023f65Toomas Soome			vic = &vdev->vdev_indirect_config;
1184c023f65Toomas Soome
1185c023f65Toomas Soome			nvlist_find(nvlist,
1186da9bf00Toomas Soome			    ZPOOL_CONFIG_INDIRECT_OBJECT,
1187da9bf00Toomas Soome			    DATA_TYPE_UINT64,
1188c023f65Toomas Soome			    NULL, &vic->vic_mapping_object);
1189c023f65Toomas Soome			nvlist_find(nvlist,
1190da9bf00Toomas Soome			    ZPOOL_CONFIG_INDIRECT_BIRTHS,
1191da9bf00Toomas Soome			    DATA_TYPE_UINT64,
1192c023f65Toomas Soome			    NULL, &vic->vic_births_object);
1193c023f65Toomas Soome			nvlist_find(nvlist,
1194da9bf00Toomas Soome			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
1195da9bf00Toomas Soome			    DATA_TYPE_UINT64,
1196c023f65Toomas Soome			    NULL, &vic->vic_prev_indirect_vdev);
11974c2b14fToomas Soome		}
1198da9bf00Toomas Soome	} else {
1199da9bf00Toomas Soome		vdev = vdev_create(guid, vdev_disk_read);
1200da9bf00Toomas Soome	}
1201da9bf00Toomas Soome
1202da9bf00Toomas Soome	if (vdev == NULL)
1203da9bf00Toomas Soome		return (ENOMEM);
1204da9bf00Toomas Soome
1205da9bf00Toomas Soome	vdev_set_initial_state(vdev, nvlist);
1206da9bf00Toomas Soome	vdev->v_id = id;
1207da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
1208da9bf00Toomas Soome	    DATA_TYPE_UINT64, NULL, &ashift) == 0)
1209da9bf00Toomas Soome		vdev->v_ashift = ashift;
1210da9bf00Toomas Soome
1211da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
1212da9bf00Toomas Soome	    DATA_TYPE_UINT64, NULL, &asize) == 0) {
1213da9bf00Toomas Soome		vdev->v_psize = asize +
1214da9bf00Toomas Soome		    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
1215da9bf00Toomas Soome	}
1216da9bf00Toomas Soome
1217da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
1218da9bf00Toomas Soome	    DATA_TYPE_UINT64, NULL, &nparity) == 0)
1219da9bf00Toomas Soome		vdev->v_nparity = nparity;
1220da9bf00Toomas Soome
1221da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
1222da9bf00Toomas Soome	    DATA_TYPE_STRING, NULL, &path) == 0) {
1223da9bf00Toomas Soome		if (strncmp(path, "/dev/dsk/", 9) == 0)
1224da9bf00Toomas Soome			path += 9;
1225da9bf00Toomas Soome		vdev->v_name = strdup(path);
1226da9bf00Toomas Soome		if (nvlist_find(nvlist, ZPOOL_CONFIG_PHYS_PATH,
1227da9bf00Toomas Soome		    DATA_TYPE_STRING, NULL, &path) == 0) {
1228da9bf00Toomas Soome			vdev->v_phys_path = strdup(path);
1229edb3504Toomas Soome		} else {
1230da9bf00Toomas Soome			vdev->v_phys_path = NULL;
1231edb3504Toomas Soome		}
1232da9bf00Toomas Soome		if (nvlist_find(nvlist, ZPOOL_CONFIG_DEVID,
1233edb3504Toomas Soome		    DATA_TYPE_STRING, NULL, &path) == 0) {
1234da9bf00Toomas Soome			vdev->v_devid = strdup(path);
1235199767fToomas Soome		} else {
1236da9bf00Toomas Soome			vdev->v_devid = NULL;
1237199767fToomas Soome		}
1238199767fToomas Soome	} else {
1239da9bf00Toomas Soome		char *name;
1240da9bf00Toomas Soome
1241da9bf00Toomas Soome		name = NULL;
1242da9bf00Toomas Soome		if (strcmp(type, "raidz") == 0) {
1243da9bf00Toomas Soome			if (vdev->v_nparity < 1 ||
1244da9bf00Toomas Soome			    vdev->v_nparity > 3) {
1245da9bf00Toomas Soome				printf("ZFS: invalid raidz parity: %d\n",
1246da9bf00Toomas Soome				    vdev->v_nparity);
1247da9bf00Toomas Soome				return (EIO);
1248da9bf00Toomas Soome			}
1249da9bf00Toomas Soome			(void) asprintf(&name, "%s%d-%" PRIu64, type,
1250da9bf00Toomas Soome			    vdev->v_nparity, id);
1251da9bf00Toomas Soome		} else {
1252da9bf00Toomas Soome			(void) asprintf(&name, "%s-%" PRIu64, type, id);
1253da9bf00Toomas Soome		}
1254da9bf00Toomas Soome		vdev->v_name = name;
1255da9bf00Toomas Soome	}
1256da9bf00Toomas Soome	*vdevp = vdev;
1257da9bf00Toomas Soome	return (0);
1258da9bf00Toomas Soome}
1259da9bf00Toomas Soome
1260da9bf00Toomas Soome/*
1261da9bf00Toomas Soome * Find slot for vdev. We return either NULL to signal to use
1262da9bf00Toomas Soome * STAILQ_INSERT_HEAD, or we return link element to be used with
1263da9bf00Toomas Soome * STAILQ_INSERT_AFTER.
1264da9bf00Toomas Soome */
1265da9bf00Toomas Soomestatic vdev_t *
1266da9bf00Toomas Soomevdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
1267da9bf00Toomas Soome{
1268da9bf00Toomas Soome	vdev_t *v, *previous;
1269da9bf00Toomas Soome
1270da9bf00Toomas Soome	if (STAILQ_EMPTY(&top_vdev->v_children))
1271da9bf00Toomas Soome		return (NULL);
1272da9bf00Toomas Soome
1273da9bf00Toomas Soome	previous = NULL;
1274da9bf00Toomas Soome	STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
1275da9bf00Toomas Soome		if (v->v_id > vdev->v_id)
1276da9bf00Toomas Soome			return (previous);
1277da9bf00Toomas Soome
1278da9bf00Toomas Soome		if (v->v_id == vdev->v_id)
1279da9bf00Toomas Soome			return (v);
1280da9bf00Toomas Soome
1281da9bf00Toomas Soome		if (v->v_id < vdev->v_id)
1282da9bf00Toomas Soome			previous = v;
1283199767fToomas Soome	}
1284da9bf00Toomas Soome	return (previous);
1285da9bf00Toomas Soome}
1286da9bf00Toomas Soome
1287da9bf00Toomas Soomestatic size_t
1288da9bf00Toomas Soomevdev_child_count(vdev_t *vdev)
1289da9bf00Toomas Soome{
1290da9bf00Toomas Soome	vdev_t *v;
1291da9bf00Toomas Soome	size_t count;
1292da9bf00Toomas Soome
1293da9bf00Toomas Soome	count = 0;
1294da9bf00Toomas Soome	STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
1295da9bf00Toomas Soome		count++;
1296da9bf00Toomas Soome	}
1297da9bf00Toomas Soome	return (count);
1298da9bf00Toomas Soome}
1299da9bf00Toomas Soome
1300da9bf00Toomas Soome/*
1301da9bf00Toomas Soome * Insert vdev into top_vdev children list. List is ordered by v_id.
1302da9bf00Toomas Soome */
1303da9bf00Toomas Soomestatic void
1304da9bf00Toomas Soomevdev_insert(vdev_t *top_vdev, vdev_t *vdev)
1305da9bf00Toomas Soome{
1306da9bf00Toomas Soome	vdev_t *previous;
1307da9bf00Toomas Soome	size_t count;
1308da9bf00Toomas Soome
1309da9bf00Toomas Soome	/*
1310da9bf00Toomas Soome	 * The top level vdev can appear in random order, depending how
1311da9bf00Toomas Soome	 * the firmware is presenting the disk devices.
1312da9bf00Toomas Soome	 * However, we will insert vdev to create list ordered by v_id,
1313da9bf00Toomas Soome	 * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
1314da9bf00Toomas Soome	 * as STAILQ does not have insert before.
1315da9bf00Toomas Soome	 */
1316da9bf00Toomas Soome	previous = vdev_find_previous(top_vdev, vdev);
1317199767fToomas Soome
1318da9bf00Toomas Soome	if (previous == NULL) {
1319da9bf00Toomas Soome		STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
1320da9bf00Toomas Soome	} else if (previous->v_id == vdev->v_id) {
1321199767fToomas Soome		/*
1322da9bf00Toomas Soome		 * This vdev was configured from label config,
1323da9bf00Toomas Soome		 * do not insert duplicate.
1324199767fToomas Soome		 */
1325da9bf00Toomas Soome		return;
1326da9bf00Toomas Soome	} else {
1327da9bf00Toomas Soome		STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
1328da9bf00Toomas Soome		    v_childlink);
1329da9bf00Toomas Soome	}
1330da9bf00Toomas Soome
1331da9bf00Toomas Soome	count = vdev_child_count(top_vdev);
1332da9bf00Toomas Soome	if (top_vdev->v_nchildren < count)
1333da9bf00Toomas Soome		top_vdev->v_nchildren = count;
1334da9bf00Toomas Soome}
1335da9bf00Toomas Soome
1336da9bf00Toomas Soomestatic int
1337da9bf00Toomas Soomevdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist)
1338da9bf00Toomas Soome{
1339da9bf00Toomas Soome	vdev_t *top_vdev, *vdev;
1340da9bf00Toomas Soome	const unsigned char *kids;
1341da9bf00Toomas Soome	int rc, nkids;
1342da9bf00Toomas Soome
1343da9bf00Toomas Soome	/* Get top vdev. */
1344da9bf00Toomas Soome	top_vdev = vdev_find(top_guid);
1345da9bf00Toomas Soome	if (top_vdev == NULL) {
1346da9bf00Toomas Soome		rc = vdev_init(top_guid, nvlist, &top_vdev);
1347da9bf00Toomas Soome		if (rc != 0)
1348da9bf00Toomas Soome			return (rc);
1349da9bf00Toomas Soome		top_vdev->v_spa = spa;
1350da9bf00Toomas Soome		top_vdev->v_top = top_vdev;
1351da9bf00Toomas Soome		vdev_insert(spa->spa_root_vdev, top_vdev);
1352199767fToomas Soome	}
1353199767fToomas Soome
1354da9bf00Toomas Soome	/* Add children if there are any. */
1355edb3504Toomas Soome	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1356edb3504Toomas Soome	    &nkids, &kids);
1357199767fToomas Soome	if (rc == 0) {
1358da9bf00Toomas Soome		for (int i = 0; i < nkids; i++) {
1359da9bf00Toomas Soome			uint64_t guid;
1360da9bf00Toomas Soome
1361da9bf00Toomas Soome			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
1362da9bf00Toomas Soome			    DATA_TYPE_UINT64, NULL, &guid);
1363da9bf00Toomas Soome			if (rc != 0)
1364da9bf00Toomas Soome				return (rc);
1365da9bf00Toomas Soome			rc = vdev_init(guid, kids, &vdev);
1366da9bf00Toomas Soome			if (rc != 0)
1367199767fToomas Soome				return (rc);
1368da9bf00Toomas Soome
1369da9bf00Toomas Soome			vdev->v_spa = spa;
1370da9bf00Toomas Soome			vdev->v_top = top_vdev;
1371da9bf00Toomas Soome			vdev_insert(top_vdev, vdev);
1372da9bf00Toomas Soome
1373199767fToomas Soome			kids = nvlist_next(kids);
1374199767fToomas Soome		}
1375199767fToomas Soome	} else {
1376da9bf00Toomas Soome		/*
1377da9bf00Toomas Soome		 * When there are no children, nvlist_find() does return
1378da9bf00Toomas Soome		 * error, reset it because leaf devices have no children.
1379da9bf00Toomas Soome		 */
1380da9bf00Toomas Soome		rc = 0;
1381199767fToomas Soome	}
1382199767fToomas Soome
1383da9bf00Toomas Soome	return (rc);
1384da9bf00Toomas Soome}
1385da9bf00Toomas Soome
1386da9bf00Toomas Soomestatic int
1387da9bf00Toomas Soomevdev_init_from_label(spa_t *spa, const unsigned char *nvlist)
1388da9bf00Toomas Soome{
1389da9bf00Toomas Soome	uint64_t pool_guid, top_guid;
1390da9bf00Toomas Soome	const unsigned char *vdevs;
1391da9bf00Toomas Soome
1392da9bf00Toomas Soome	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1393da9bf00Toomas Soome	    NULL, &pool_guid) ||
1394da9bf00Toomas Soome	    nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
1395da9bf00Toomas Soome	    NULL, &top_guid) ||
1396da9bf00Toomas Soome	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1397da9bf00Toomas Soome	    NULL, &vdevs)) {
1398da9bf00Toomas Soome		printf("ZFS: can't find vdev details\n");
1399da9bf00Toomas Soome		return (ENOENT);
1400da9bf00Toomas Soome	}
1401da9bf00Toomas Soome
1402da9bf00Toomas Soome	return (vdev_from_nvlist(spa, top_guid, vdevs));
1403199767fToomas Soome}
1404199767fToomas Soome
1405199767fToomas Soomestatic void
1406199767fToomas Soomevdev_set_state(vdev_t *vdev)
1407199767fToomas Soome{
1408199767fToomas Soome	vdev_t *kid;
1409199767fToomas Soome	int good_kids;
1410199767fToomas Soome	int bad_kids;
1411199767fToomas Soome
1412da9bf00Toomas Soome	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1413da9bf00Toomas Soome		vdev_set_state(kid);
1414da9bf00Toomas Soome	}
1415da9bf00Toomas Soome
1416199767fToomas Soome	/*
1417199767fToomas Soome	 * A mirror or raidz is healthy if all its kids are healthy. A
1418199767fToomas Soome	 * mirror is degraded if any of its kids is healthy; a raidz
1419199767fToomas Soome	 * is degraded if at most nparity kids are offline.
1420199767fToomas Soome	 */
1421199767fToomas Soome	if (STAILQ_FIRST(&vdev->v_children)) {
1422199767fToomas Soome		good_kids = 0;
1423199767fToomas Soome		bad_kids = 0;
1424199767fToomas Soome		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
1425199767fToomas Soome			if (kid->v_state == VDEV_STATE_HEALTHY)
1426199767fToomas Soome				good_kids++;
1427199767fToomas Soome			else
1428199767fToomas Soome				bad_kids++;
1429199767fToomas Soome		}
1430199767fToomas Soome		if (bad_kids == 0) {
1431199767fToomas Soome			vdev->v_state = VDEV_STATE_HEALTHY;
1432199767fToomas Soome		} else {
1433199767fToomas Soome			if (vdev->v_read == vdev_mirror_read) {
1434199767fToomas Soome				if (good_kids) {
1435199767fToomas Soome					vdev->v_state = VDEV_STATE_DEGRADED;
1436199767fToomas Soome				} else {
1437199767fToomas Soome					vdev->v_state = VDEV_STATE_OFFLINE;
1438199767fToomas Soome				}
1439199767fToomas Soome			} else if (vdev->v_read == vdev_raidz_read) {
1440199767fToomas Soome				if (bad_kids > vdev->v_nparity) {
1441199767fToomas Soome					vdev->v_state = VDEV_STATE_OFFLINE;
1442199767fToomas Soome				} else {
1443199767fToomas Soome					vdev->v_state = VDEV_STATE_DEGRADED;
1444199767fToomas Soome				}
1445199767fToomas Soome			}
1446199767fToomas Soome		}
1447199767fToomas Soome	}
1448199767fToomas Soome}
1449199767fToomas Soome
1450da9bf00Toomas Soomestatic int
1451da9bf00Toomas Soomevdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist)
1452da9bf00Toomas Soome{
1453da9bf00Toomas Soome	vdev_t *vdev;
1454da9bf00Toomas Soome	const unsigned char *kids;
1455da9bf00Toomas Soome	int rc, nkids;
1456da9bf00Toomas Soome
1457da9bf00Toomas Soome	/* Update top vdev. */
1458da9bf00Toomas Soome	vdev = vdev_find(top_guid);
1459da9bf00Toomas Soome	if (vdev != NULL)
1460da9bf00Toomas Soome		vdev_set_initial_state(vdev, nvlist);
1461da9bf00Toomas Soome
1462da9bf00Toomas Soome	/* Update children if there are any. */
1463da9bf00Toomas Soome	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
1464da9bf00Toomas Soome	    &nkids, &kids);
1465da9bf00Toomas Soome	if (rc == 0) {
1466da9bf00Toomas Soome		for (int i = 0; i < nkids; i++) {