1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5441d80alling * Common Development and Distribution License (the "License").
6441d80alling * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21ad135b5Christopher Siden
22fa9e406ahrens/*
233f9d6adLin Ling * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
248671400Serapheim Dimitropoulos * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
25663207aDon Brady * Copyright (c) 2017, Intel Corporation.
26e0f1c0aOlaf Faaland * Copyright 2019 Joyent, Inc.
27fa9e406ahrens */
28fa9e406ahrens
29fa9e406ahrens/*
30fa9e406ahrens * Virtual Device Labels
31fa9e406ahrens * ---------------------
32fa9e406ahrens *
33fa9e406ahrens * The vdev label serves several distinct purposes:
34fa9e406ahrens *
35fa9e406ahrens *	1. Uniquely identify this device as part of a ZFS pool and confirm its
36fa9e406ahrens *	   identity within the pool.
37fa9e406ahrens *
38cfd63e1Matthew Ahrens *	2. Verify that all the devices given in a configuration are present
39fa9e406ahrens *         within the pool.
40fa9e406ahrens *
41cfd63e1Matthew Ahrens *	3. Determine the uberblock for the pool.
42fa9e406ahrens *
43cfd63e1Matthew Ahrens *	4. In case of an import operation, determine the configuration of the
44fa9e406ahrens *         toplevel vdev of which it is a part.
45fa9e406ahrens *
46cfd63e1Matthew Ahrens *	5. If an import operation cannot find all the devices in the pool,
47fa9e406ahrens *         provide enough information to the administrator to determine which
48fa9e406ahrens *         devices are missing.
49fa9e406ahrens *
50fa9e406ahrens * It is important to note that while the kernel is responsible for writing the
51fa9e406ahrens * label, it only consumes the information in the first three cases.  The
52fa9e406ahrens * latter information is only consumed in userland when determining the
53fa9e406ahrens * configuration to import a pool.
54fa9e406ahrens *
55fa9e406ahrens *
56fa9e406ahrens * Label Organization
57fa9e406ahrens * ------------------
58fa9e406ahrens *
59fa9e406ahrens * Before describing the contents of the label, it's important to understand how
60fa9e406ahrens * the labels are written and updated with respect to the uberblock.
61fa9e406ahrens *
62fa9e406ahrens * When the pool configuration is altered, either because it was newly created
63fa9e406ahrens * or a device was added, we want to update all the labels such that we can deal
64fa9e406ahrens * with fatal failure at any point.  To this end, each disk has two labels which
65fa9e406ahrens * are updated before and after the uberblock is synced.  Assuming we have
663d7072feschrock * labels and an uberblock with the following transaction groups:
67fa9e406ahrens *
68fa9e406ahrens *              L1          UB          L2
69fa9e406ahrens *           +------+    +------+    +------+
70fa9e406ahrens *           |      |    |      |    |      |
71fa9e406ahrens *           | t10  |    | t10  |    | t10  |
72fa9e406ahrens *           |      |    |      |    |      |
73fa9e406ahrens *           +------+    +------+    +------+
74fa9e406ahrens *
75fa9e406ahrens * In this stable state, the labels and the uberblock were all updated within
76fa9e406ahrens * the same transaction group (10).  Each label is mirrored and checksummed, so
77fa9e406ahrens * that we can detect when we fail partway through writing the label.
78fa9e406ahrens *
79fa9e406ahrens * In order to identify which labels are valid, the labels are written in the
80fa9e406ahrens * following manner:
81fa9e406ahrens *
82cfd63e1Matthew Ahrens *	1. For each vdev, update 'L1' to the new label
83cfd63e1Matthew Ahrens *	2. Update the uberblock
84cfd63e1Matthew Ahrens *	3. For each vdev, update 'L2' to the new label
85fa9e406ahrens *
86fa9e406ahrens * Given arbitrary failure, we can determine the correct label to use based on
87fa9e406ahrens * the transaction group.  If we fail after updating L1 but before updating the
88fa9e406ahrens * UB, we will notice that L1's transaction group is greater than the uberblock,
89fa9e406ahrens * so L2 must be valid.  If we fail after writing the uberblock but before
90fa9e406ahrens * writing L2, we will notice that L2's transaction group is less than L1, and
91fa9e406ahrens * therefore L1 is valid.
92fa9e406ahrens *
93fa9e406ahrens * Another added complexity is that not every label is updated when the config
94fa9e406ahrens * is synced.  If we add a single device, we do not want to have to re-write
95fa9e406ahrens * every label for every device in the pool.  This means that both L1 and L2 may
96fa9e406ahrens * be older than the pool uberblock, because the necessary information is stored
97fa9e406ahrens * on another vdev.
98fa9e406ahrens *
99fa9e406ahrens *
100fa9e406ahrens * On-disk Format
101fa9e406ahrens * --------------
102fa9e406ahrens *
103fa9e406ahrens * The vdev label consists of two distinct parts, and is wrapped within the
104fa9e406ahrens * vdev_label_t structure.  The label includes 8k of padding to permit legacy
105fa9e406ahrens * VTOC disk labels, but is otherwise ignored.
106fa9e406ahrens *
107fa9e406ahrens * The first half of the label is a packed nvlist which contains pool wide
108fa9e406ahrens * properties, per-vdev properties, and configuration information.  It is
109fa9e406ahrens * described in more detail below.
110fa9e406ahrens *
111fa9e406ahrens * The latter half of the label consists of a redundant array of uberblocks.
112fa9e406ahrens * These uberblocks are updated whenever a transaction group is committed,
113fa9e406ahrens * or when the configuration is updated.  When a pool is loaded, we scan each
114fa9e406ahrens * vdev for the 'best' uberblock.
115fa9e406ahrens *
116fa9e406ahrens *
117fa9e406ahrens * Configuration Information
118fa9e406ahrens * -------------------------
119fa9e406ahrens *
120fa9e406ahrens * The nvlist describing the pool and vdev contains the following elements:
121fa9e406ahrens *
122cfd63e1Matthew Ahrens *	version		ZFS on-disk version
123cfd63e1Matthew Ahrens *	name		Pool name
124cfd63e1Matthew Ahrens *	state		Pool state
125cfd63e1Matthew Ahrens *	txg		Transaction group in which this label was written
126cfd63e1Matthew Ahrens *	pool_guid	Unique identifier for this pool
127cfd63e1Matthew Ahrens *	vdev_tree	An nvlist describing vdev tree.
128ad135b5Christopher Siden *	features_for_read
129ad135b5Christopher Siden *			An nvlist of the features necessary for reading the MOS.
130fa9e406ahrens *
131fa9e406ahrens * Each leaf device label also contains the following:
132fa9e406ahrens *
133cfd63e1Matthew Ahrens *	top_guid	Unique ID for top-level vdev in which this is contained
134cfd63e1Matthew Ahrens *	guid		Unique ID for the leaf vdev
135fa9e406ahrens *
136fa9e406ahrens * The 'vs' configuration follows the format described in 'spa_config.c'.
137fa9e406ahrens */
138fa9e406ahrens
139fa9e406ahrens#include <sys/zfs_context.h>
140fa9e406ahrens#include <sys/spa.h>
141fa9e406ahrens#include <sys/spa_impl.h>
142fa9e406ahrens#include <sys/dmu.h>
143fa9e406ahrens#include <sys/zap.h>
144fa9e406ahrens#include <sys/vdev.h>
145fa9e406ahrens#include <sys/vdev_impl.h>
146fa9e406ahrens#include <sys/uberblock_impl.h>
147fa9e406ahrens#include <sys/metaslab.h>
1485cabbc6Prashanth Sreenivasa#include <sys/metaslab_impl.h>
149fa9e406ahrens#include <sys/zio.h>
1503f9d6adLin Ling#include <sys/dsl_scan.h>
151770499eDan Kimmel#include <sys/abd.h>
152fa9e406ahrens#include <sys/fs/zfs.h>
153fa9e406ahrens
154fa9e406ahrens/*
155fa9e406ahrens * Basic routines to read and write from a vdev label.
156fa9e406ahrens * Used throughout the rest of this file.
157fa9e406ahrens */
158fa9e406ahrensuint64_t
159fa9e406ahrensvdev_label_offset(uint64_t psize, int l, uint64_t offset)
160fa9e406ahrens{
161ecc2d60bonwick	ASSERT(offset < sizeof (vdev_label_t));
162e743726ahrens	ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
163ecc2d60bonwick
164fa9e406ahrens	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
165fa9e406ahrens	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
166fa9e406ahrens}
167fa9e406ahrens
16821bf64agw/*
16921bf64agw * Returns back the vdev label associated with the passed in offset.
17021bf64agw */
17121bf64agwint
17221bf64agwvdev_label_number(uint64_t psize, uint64_t offset)
17321bf64agw{
17421bf64agw	int l;
17521bf64agw
17621bf64agw	if (offset >= psize - VDEV_LABEL_END_SIZE) {
17721bf64agw		offset -= psize - VDEV_LABEL_END_SIZE;
17821bf64agw		offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
17921bf64agw	}
18021bf64agw	l = offset / sizeof (vdev_label_t);
18121bf64agw	return (l < VDEV_LABELS ? l : -1);
18221bf64agw}
18321bf64agw
184fa9e406ahrensstatic void
185770499eDan Kimmelvdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
1869a686fbPaul Dagnelie    uint64_t size, zio_done_func_t *done, void *private, int flags)
187fa9e406ahrens{
18858447f6Olaf Faaland	ASSERT(
18958447f6Olaf Faaland	    spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
19058447f6Olaf Faaland	    spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
191e14bb32Jeff Bonwick	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
192fa9e406ahrens
193fa9e406ahrens	zio_nowait(zio_read_phys(zio, vd,
194fa9e406ahrens	    vdev_label_offset(vd->vdev_psize, l, offset),
195fa9e406ahrens	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
196e14bb32Jeff Bonwick	    ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
197fa9e406ahrens}
198fa9e406ahrens
199e0f1c0aOlaf Faalandvoid
200770499eDan Kimmelvdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
2019a686fbPaul Dagnelie    uint64_t size, zio_done_func_t *done, void *private, int flags)
202fa9e406ahrens{
20358447f6Olaf Faaland	ASSERT(
20458447f6Olaf Faaland	    spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
20558447f6Olaf Faaland	    spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
206e14bb32Jeff Bonwick	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
207fa9e406ahrens
208fa9e406ahrens	zio_nowait(zio_write_phys(zio, vd,
209fa9e406ahrens	    vdev_label_offset(vd->vdev_psize, l, offset),
210fa9e406ahrens	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
21117f17c2bonwick	    ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
212fa9e406ahrens}
213fa9e406ahrens
2148671400Serapheim Dimitropoulosstatic void
2158671400Serapheim Dimitropoulosroot_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
2168671400Serapheim Dimitropoulos{
2178671400Serapheim Dimitropoulos	spa_t *spa = vd->vdev_spa;
2188671400Serapheim Dimitropoulos
2198671400Serapheim Dimitropoulos	if (vd != spa->spa_root_vdev)
2208671400Serapheim Dimitropoulos		return;
2218671400Serapheim Dimitropoulos
2228671400Serapheim Dimitropoulos	/* provide either current or previous scan information */
2238671400Serapheim Dimitropoulos	pool_scan_stat_t ps;
2248671400Serapheim Dimitropoulos	if (spa_scan_get_stats(spa, &ps) == 0) {
2258671400Serapheim Dimitropoulos		fnvlist_add_uint64_array(nvl,
2268671400Serapheim Dimitropoulos		    ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
2278671400Serapheim Dimitropoulos		    sizeof (pool_scan_stat_t) / sizeof (uint64_t));
2288671400Serapheim Dimitropoulos	}
2298671400Serapheim Dimitropoulos
2308671400Serapheim Dimitropoulos	pool_removal_stat_t prs;
2318671400Serapheim Dimitropoulos	if (spa_removal_get_stats(spa, &prs) == 0) {
2328671400Serapheim Dimitropoulos		fnvlist_add_uint64_array(nvl,
2338671400Serapheim Dimitropoulos		    ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
2348671400Serapheim Dimitropoulos		    sizeof (prs) / sizeof (uint64_t));
2358671400Serapheim Dimitropoulos	}
2368671400Serapheim Dimitropoulos
2378671400Serapheim Dimitropoulos	pool_checkpoint_stat_t pcs;
2388671400Serapheim Dimitropoulos	if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
2398671400Serapheim Dimitropoulos		fnvlist_add_uint64_array(nvl,
2408671400Serapheim Dimitropoulos		    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
2418671400Serapheim Dimitropoulos		    sizeof (pcs) / sizeof (uint64_t));
2428671400Serapheim Dimitropoulos	}
2438671400Serapheim Dimitropoulos}
2448671400Serapheim Dimitropoulos
245fa9e406ahrens/*
246fa9e406ahrens * Generate the nvlist representing this vdev's config.
247fa9e406ahrens */
248fa9e406ahrensnvlist_t *
24999653d4eschrockvdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
2503f9d6adLin Ling    vdev_config_flag_t flags)
251fa9e406ahrens{
252fa9e406ahrens	nvlist_t *nv = NULL;
2535cabbc6Prashanth Sreenivasa	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
254fa9e406ahrens
255b4952e1George Wilson	nv = fnvlist_alloc();
256fa9e406ahrens
257b4952e1George Wilson	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
2583f9d6adLin Ling	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
259b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
260b4952e1George Wilson	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
261fa9e406ahrens
262fa9e406ahrens	if (vd->vdev_path != NULL)
263b4952e1George Wilson		fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
264fa9e406ahrens
265fa9e406ahrens	if (vd->vdev_devid != NULL)
266b4952e1George Wilson		fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
267fa9e406ahrens
2683d7072feschrock	if (vd->vdev_physpath != NULL)
269b4952e1George Wilson		fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
270b4952e1George Wilson		    vd->vdev_physpath);
2713d7072feschrock
2726809eb4Eric Schrock	if (vd->vdev_fru != NULL)
273b4952e1George Wilson		fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
2746809eb4Eric Schrock
27599653d4eschrock	if (vd->vdev_nparity != 0) {
27699653d4eschrock		ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
27799653d4eschrock		    VDEV_TYPE_RAIDZ) == 0);
27899653d4eschrock
27999653d4eschrock		/*
28099653d4eschrock		 * Make sure someone hasn't managed to sneak a fancy new vdev
28199653d4eschrock		 * into a crufty old storage pool.
28299653d4eschrock		 */
28399653d4eschrock		ASSERT(vd->vdev_nparity == 1 ||
284f94275cAdam Leventhal		    (vd->vdev_nparity <= 2 &&
285f94275cAdam Leventhal		    spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
286f94275cAdam Leventhal		    (vd->vdev_nparity <= 3 &&
287f94275cAdam Leventhal		    spa_version(spa) >= SPA_VERSION_RAIDZ3));
28899653d4eschrock
28999653d4eschrock		/*
29099653d4eschrock		 * Note that we'll add the nparity tag even on storage pools
29199653d4eschrock		 * that only support a single parity device -- older software
29299653d4eschrock		 * will just ignore it.
29399653d4eschrock		 */
294b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
29599653d4eschrock	}
29699653d4eschrock
297afefbcdeschrock	if (vd->vdev_wholedisk != -1ULL)
298b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
299b4952e1George Wilson		    vd->vdev_wholedisk);
300afefbcdeschrock
3016f79381Pavel Zakharov	if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
302b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
303ea8dc4beschrock
30499653d4eschrock	if (vd->vdev_isspare)
305b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
30699653d4eschrock
3073f9d6adLin Ling	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
3083f9d6adLin Ling	    vd == vd->vdev_top) {
309b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
310b4952e1George Wilson		    vd->vdev_ms_array);
311b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
312b4952e1George Wilson		    vd->vdev_ms_shift);
313b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
314b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
315b4952e1George Wilson		    vd->vdev_asize);
316b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
3175cabbc6Prashanth Sreenivasa		if (vd->vdev_removing) {
318b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
319b4952e1George Wilson			    vd->vdev_removing);
3205cabbc6Prashanth Sreenivasa		}
321663207aDon Brady
322663207aDon Brady		/* zpool command expects alloc class data */
323663207aDon Brady		if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
324663207aDon Brady			const char *bias = NULL;
325663207aDon Brady
326663207aDon Brady			switch (vd->vdev_alloc_bias) {
327663207aDon Brady			case VDEV_BIAS_LOG:
328663207aDon Brady				bias = VDEV_ALLOC_BIAS_LOG;
329663207aDon Brady				break;
330663207aDon Brady			case VDEV_BIAS_SPECIAL:
331663207aDon Brady				bias = VDEV_ALLOC_BIAS_SPECIAL;
332663207aDon Brady				break;
333663207aDon Brady			case VDEV_BIAS_DEDUP:
334663207aDon Brady				bias = VDEV_ALLOC_BIAS_DEDUP;
335663207aDon Brady				break;
336663207aDon Brady			default:
337663207aDon Brady				ASSERT3U(vd->vdev_alloc_bias, ==,
338663207aDon Brady				    VDEV_BIAS_NONE);
339663207aDon Brady			}
340663207aDon Brady			fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
341663207aDon Brady			    bias);
342663207aDon Brady		}
343fa9e406ahrens	}
344fa9e406ahrens
3450713e23George Wilson	if (vd->vdev_dtl_sm != NULL) {
346b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
3470713e23George Wilson		    space_map_object(vd->vdev_dtl_sm));
3480713e23George Wilson	}
349fa9e406ahrens
3505cabbc6Prashanth Sreenivasa	if (vic->vic_mapping_object != 0) {
3515cabbc6Prashanth Sreenivasa		fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
3525cabbc6Prashanth Sreenivasa		    vic->vic_mapping_object);
3535cabbc6Prashanth Sreenivasa	}
3545cabbc6Prashanth Sreenivasa
3555cabbc6Prashanth Sreenivasa	if (vic->vic_births_object != 0) {
3565cabbc6Prashanth Sreenivasa		fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
3575cabbc6Prashanth Sreenivasa		    vic->vic_births_object);
3585cabbc6Prashanth Sreenivasa	}
3595cabbc6Prashanth Sreenivasa
3605cabbc6Prashanth Sreenivasa	if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
3615cabbc6Prashanth Sreenivasa		fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
3625cabbc6Prashanth Sreenivasa		    vic->vic_prev_indirect_vdev);
3635cabbc6Prashanth Sreenivasa	}
3645cabbc6Prashanth Sreenivasa
36588ecc94George Wilson	if (vd->vdev_crtxg)
366b4952e1George Wilson		fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
36788ecc94George Wilson
368215198aJoe Stein	if (flags & VDEV_CONFIG_MOS) {
369215198aJoe Stein		if (vd->vdev_leaf_zap != 0) {
370215198aJoe Stein			ASSERT(vd->vdev_ops->vdev_op_leaf);
371215198aJoe Stein			fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
372215198aJoe Stein			    vd->vdev_leaf_zap);
373215198aJoe Stein		}
374215198aJoe Stein
375215198aJoe Stein		if (vd->vdev_top_zap != 0) {
376215198aJoe Stein			ASSERT(vd == vd->vdev_top);
377215198aJoe Stein			fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
378215198aJoe Stein			    vd->vdev_top_zap);
379215198aJoe Stein		}
380e4c795bTom Caputi
381e4c795bTom Caputi		if (vd->vdev_resilver_deferred) {
382e4c795bTom Caputi			ASSERT(vd->vdev_ops->vdev_op_leaf);
383e4c795bTom Caputi			ASSERT(spa->spa_resilver_deferred);
384e4c795bTom Caputi			fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER);
385e4c795bTom Caputi		}
386215198aJoe Stein	}
387215198aJoe Stein
388fa9e406ahrens	if (getstats) {
389fa9e406ahrens		vdev_stat_t vs;
3903f9d6adLin Ling
391fa9e406ahrens		vdev_get_stats(vd, &vs);
392b4952e1George Wilson		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
393b4952e1George Wilson		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
3943f9d6adLin Ling
3958671400Serapheim Dimitropoulos		root_vdev_actions_getprogress(vd, nv);
3965cabbc6Prashanth Sreenivasa
3975cabbc6Prashanth Sreenivasa		/*
3985cabbc6Prashanth Sreenivasa		 * Note: this can be called from open context
3995cabbc6Prashanth Sreenivasa		 * (spa_get_stats()), so we need the rwlock to prevent
4005cabbc6Prashanth Sreenivasa		 * the mapping from being changed by condensing.
4015cabbc6Prashanth Sreenivasa		 */
4025cabbc6Prashanth Sreenivasa		rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
4035cabbc6Prashanth Sreenivasa		if (vd->vdev_indirect_mapping != NULL) {
4045cabbc6Prashanth Sreenivasa			ASSERT(vd->vdev_indirect_births != NULL);
4055cabbc6Prashanth Sreenivasa			vdev_indirect_mapping_t *vim =
4065cabbc6Prashanth Sreenivasa			    vd->vdev_indirect_mapping;
4075cabbc6Prashanth Sreenivasa			fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
4085cabbc6Prashanth Sreenivasa			    vdev_indirect_mapping_size(vim));
4095cabbc6Prashanth Sreenivasa		}
4105cabbc6Prashanth Sreenivasa		rw_exit(&vd->vdev_indirect_rwlock);
4115cabbc6Prashanth Sreenivasa		if (vd->vdev_mg != NULL &&
4125cabbc6Prashanth Sreenivasa		    vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
4135cabbc6Prashanth Sreenivasa			/*
4145cabbc6Prashanth Sreenivasa			 * Compute approximately how much memory would be used
4155cabbc6Prashanth Sreenivasa			 * for the indirect mapping if this device were to
4165cabbc6Prashanth Sreenivasa			 * be removed.
4175cabbc6Prashanth Sreenivasa			 *
4185cabbc6Prashanth Sreenivasa			 * Note: If the frag metric is invalid, then not
4195cabbc6Prashanth Sreenivasa			 * enough metaslabs have been converted to have
4205cabbc6Prashanth Sreenivasa			 * histograms.
4215cabbc6Prashanth Sreenivasa			 */
4225cabbc6Prashanth Sreenivasa			uint64_t seg_count = 0;
423cfd63e1Matthew Ahrens			uint64_t to_alloc = vd->vdev_stat.vs_alloc;
4245cabbc6Prashanth Sreenivasa
4255cabbc6Prashanth Sreenivasa			/*
4265cabbc6Prashanth Sreenivasa			 * There are the same number of allocated segments
4275cabbc6Prashanth Sreenivasa			 * as free segments, so we will have at least one
428cfd63e1Matthew Ahrens			 * entry per free segment.  However, small free
429cfd63e1Matthew Ahrens			 * segments (smaller than vdev_removal_max_span)
430cfd63e1Matthew Ahrens			 * will be combined with adjacent allocated segments
431cfd63e1Matthew Ahrens			 * as a single mapping.
4325cabbc6Prashanth Sreenivasa			 */
4335cabbc6Prashanth Sreenivasa			for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
434cfd63e1Matthew Ahrens				if (1ULL << (i + 1) < vdev_removal_max_span) {
435cfd63e1Matthew Ahrens					to_alloc +=
436cfd63e1Matthew Ahrens					    vd->vdev_mg->mg_histogram[i] <<
437cfd63e1Matthew Ahrens					    i + 1;
438cfd63e1Matthew Ahrens				} else {
439cfd63e1Matthew Ahrens					seg_count +=
440cfd63e1Matthew Ahrens					    vd->vdev_mg->mg_histogram[i];
441cfd63e1Matthew Ahrens				}
4425cabbc6Prashanth Sreenivasa			}
4435cabbc6Prashanth Sreenivasa
4445cabbc6Prashanth Sreenivasa			/*
445cfd63e1Matthew Ahrens			 * The maximum length of a mapping is
446cfd63e1Matthew Ahrens			 * zfs_remove_max_segment, so we need at least one entry
447cfd63e1Matthew Ahrens			 * per zfs_remove_max_segment of allocated data.
4485cabbc6Prashanth Sreenivasa			 */
449cfd63e1Matthew Ahrens			seg_count += to_alloc / zfs_remove_max_segment;
4505cabbc6Prashanth Sreenivasa
4515cabbc6Prashanth Sreenivasa			fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
4525cabbc6Prashanth Sreenivasa			    seg_count *
4535cabbc6Prashanth Sreenivasa			    sizeof (vdev_indirect_mapping_entry_phys_t));
4545cabbc6Prashanth Sreenivasa		}
455fa9e406ahrens	}
456fa9e406ahrens
457fa9e406ahrens	if (!vd->vdev_ops->vdev_op_leaf) {
458fa9e406ahrens		nvlist_t **child;
4593f9d6adLin Ling		int c, idx;
460fa9e406ahrens
46188ecc94George Wilson		ASSERT(!vd->vdev_ishole);
46288ecc94George Wilson
463fa9e406ahrens		child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
464fa9e406ahrens		    KM_SLEEP);
465fa9e406ahrens
4663f9d6adLin Ling		for (c = 0, idx = 0; c < vd->vdev_children; c++) {
4673f9d6adLin Ling			vdev_t *cvd = vd->vdev_child[c];
4683f9d6adLin Ling
4693f9d6adLin Ling			/*
4703f9d6adLin Ling			 * If we're generating an nvlist of removing
4713f9d6adLin Ling			 * vdevs then skip over any device which is
4723f9d6adLin Ling			 * not being removed.
4733f9d6adLin Ling			 */
4743f9d6adLin Ling			if ((flags & VDEV_CONFIG_REMOVING) &&
4753f9d6adLin Ling			    !cvd->vdev_removing)
4763f9d6adLin Ling				continue;
477fa9e406ahrens
4783f9d6adLin Ling			child[idx++] = vdev_config_generate(spa, cvd,
4793f9d6adLin Ling			    getstats, flags);
4803f9d6adLin Ling		}
4813f9d6adLin Ling
4823f9d6adLin Ling		if (idx) {
483b4952e1George Wilson			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
484b4952e1George Wilson			    child, idx);
4853f9d6adLin Ling		}
486fa9e406ahrens
4873f9d6adLin Ling		for (c = 0; c < idx; c++)
488fa9e406ahrens			nvlist_free(child[c]);
489fa9e406ahrens
490fa9e406ahrens		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
491441d80alling
492441d80alling	} else {
493069f55eEric Schrock		const char *aux = NULL;
494069f55eEric Schrock
495ecc2d60bonwick		if (vd->vdev_offline && !vd->vdev_tmpoffline)
496b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
497b4952e1George Wilson		if (vd->vdev_resilver_txg != 0)
498b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
499b4952e1George Wilson			    vd->vdev_resilver_txg);
5003d7072feschrock		if (vd->vdev_faulted)
501b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
5023d7072feschrock		if (vd->vdev_degraded)
503b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
5043d7072feschrock		if (vd->vdev_removed)
505b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
5063d7072feschrock		if (vd->vdev_unspare)
507b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
50888ecc94George Wilson		if (vd->vdev_ishole)
509b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
510069f55eEric Schrock
511069f55eEric Schrock		switch (vd->vdev_stat.vs_aux) {
512069f55eEric Schrock		case VDEV_AUX_ERR_EXCEEDED:
513069f55eEric Schrock			aux = "err_exceeded";
514069f55eEric Schrock			break;
515069f55eEric Schrock
516069f55eEric Schrock		case VDEV_AUX_EXTERNAL:
517069f55eEric Schrock			aux = "external";
518069f55eEric Schrock			break;
519069f55eEric Schrock		}
520069f55eEric Schrock
521069f55eEric Schrock		if (aux != NULL)
522b4952e1George Wilson			fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
5231195e68Mark J Musante
5241195e68Mark J Musante		if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
525b4952e1George Wilson			fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
526b4952e1George Wilson			    vd->vdev_orig_guid);
5271195e68Mark J Musante		}
528fa9e406ahrens	}
529fa9e406ahrens
530fa9e406ahrens	return (nv);
531fa9e406ahrens}
532fa9e406ahrens
53388ecc94George Wilson/*
53488ecc94George Wilson * Generate a view of the top-level vdevs.  If we currently have holes
53588ecc94George Wilson * in the namespace, then generate an array which contains a list of holey
53688ecc94George Wilson * vdevs.  Additionally, add the number of top-level children that currently
53788ecc94George Wilson * exist.
53888ecc94George Wilson */
53988ecc94George Wilsonvoid
54088ecc94George Wilsonvdev_top_config_generate(spa_t *spa, nvlist_t *config)
54188ecc94George Wilson{
54288ecc94George Wilson	vdev_t *rvd = spa->spa_root_vdev;
54388ecc94George Wilson	uint64_t *array;
5443f9d6adLin Ling	uint_t c, idx;
54588ecc94George Wilson
54688ecc94George Wilson	array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
54788ecc94George Wilson
5483f9d6adLin Ling	for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
54988ecc94George Wilson		vdev_t *tvd = rvd->vdev_child[c];
55088ecc94George Wilson
5515cabbc6Prashanth Sreenivasa		if (tvd->vdev_ishole) {
55288ecc94George Wilson			array[idx++] = c;
5535cabbc6Prashanth Sreenivasa		}
55488ecc94George Wilson	}
55588ecc94George Wilson
556312c6e1George Wilson	if (idx) {
557312c6e1George Wilson		VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
558312c6e1George Wilson		    array, idx) == 0);
559312c6e1George Wilson	}
560312c6e1George Wilson
56188ecc94George Wilson	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
56288ecc94George Wilson	    rvd->vdev_children) == 0);
56388ecc94George Wilson
56488ecc94George Wilson	kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
56588ecc94George Wilson}
56688ecc94George Wilson
567ad135b5Christopher Siden/*
568dfbb943George Wilson * Returns the configuration from the label of the given vdev. For vdevs
569dfbb943George Wilson * which don't have a txg value stored on their label (i.e. spares/cache)
570dfbb943George Wilson * or have not been completely initialized (txg = 0) just return
571dfbb943George Wilson * the configuration from the first valid label we find. Otherwise,
572dfbb943George Wilson * find the most up-to-date label that does not exceed the specified
573dfbb943George Wilson * 'txg' value.
574ad135b5Christopher Siden */
575fa9e406ahrensnvlist_t *
576dfbb943George Wilsonvdev_label_read_config(vdev_t *vd, uint64_t txg)
577fa9e406ahrens{
5780373e76bonwick	spa_t *spa = vd->vdev_spa;
579fa9e406ahrens	nvlist_t *config = NULL;
580fa9e406ahrens	vdev_phys_t *vp;
581770499eDan Kimmel	abd_t *vp_abd;
582fa9e406ahrens	zio_t *zio;
583dfbb943George Wilson	uint64_t best_txg = 0;
584b6bf6e1Pavel Zakharov	uint64_t label_txg = 0;
585dfbb943George Wilson	int error = 0;
5868956713Eric Schrock	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
5878956713Eric Schrock	    ZIO_FLAG_SPECULATIVE;
588fa9e406ahrens
589e14bb32Jeff Bonwick	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
5900373e76bonwick
5910a4e951gw	if (!vdev_readable(vd))
592fa9e406ahrens		return (NULL);
593fa9e406ahrens
594770499eDan Kimmel	vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
595770499eDan Kimmel	vp = abd_to_buf(vp_abd);
596fa9e406ahrens
5978956713Eric Schrockretry:
598e14bb32Jeff Bonwick	for (int l = 0; l < VDEV_LABELS; l++) {
599dfbb943George Wilson		nvlist_t *label = NULL;
600fa9e406ahrens
601e14bb32Jeff Bonwick		zio = zio_root(spa, NULL, NULL, flags);
602fa9e406ahrens
603770499eDan Kimmel		vdev_label_read(zio, vd, l, vp_abd,
604fa9e406ahrens		    offsetof(vdev_label_t, vl_vdev_phys),
605e14bb32Jeff Bonwick		    sizeof (vdev_phys_t), NULL, NULL, flags);
606fa9e406ahrens
607fa9e406ahrens		if (zio_wait(zio) == 0 &&
608fa9e406ahrens		    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
609dfbb943George Wilson		    &label, 0) == 0) {
610dfbb943George Wilson			/*
611dfbb943George Wilson			 * Auxiliary vdevs won't have txg values in their
612dfbb943George Wilson			 * labels and newly added vdevs may not have been
613dfbb943George Wilson			 * completely initialized so just return the
614dfbb943George Wilson			 * configuration from the first valid label we
615dfbb943George Wilson			 * encounter.
616dfbb943George Wilson			 */
617dfbb943George Wilson			error = nvlist_lookup_uint64(label,
618dfbb943George Wilson			    ZPOOL_CONFIG_POOL_TXG, &label_txg);
619dfbb943George Wilson			if ((error || label_txg == 0) && !config) {
620dfbb943George Wilson				config = label;
621dfbb943George Wilson				break;
622dfbb943George Wilson			} else if (label_txg <= txg && label_txg > best_txg) {
623dfbb943George Wilson				best_txg = label_txg;
624dfbb943George Wilson				nvlist_free(config);
625dfbb943George Wilson				config = fnvlist_dup(label);
626dfbb943George Wilson			}
627dfbb943George Wilson		}
628fa9e406ahrens
629dfbb943George Wilson		if (label != NULL) {
630dfbb943George Wilson			nvlist_free(label);
631dfbb943George Wilson			label = NULL;
632fa9e406ahrens		}
633fa9e406ahrens	}
634fa9e406ahrens
6358956713Eric Schrock	if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
6368956713Eric Schrock		flags |= ZIO_FLAG_TRYHARD;
6378956713Eric Schrock		goto retry;
6388956713Eric Schrock	}
6398956713Eric Schrock
640b6bf6e1Pavel Zakharov	/*
641b6bf6e1Pavel Zakharov	 * We found a valid label but it didn't pass txg restrictions.
642b6bf6e1Pavel Zakharov	 */
643b6bf6e1Pavel Zakharov	if (config == NULL && label_txg != 0) {
644b6bf6e1Pavel Zakharov		vdev_dbgmsg(vd, "label discarded as txg is too large "
645b6bf6e1Pavel Zakharov		    "(%llu > %llu)", (u_longlong_t)label_txg,
646b6bf6e1Pavel Zakharov		    (u_longlong_t)txg);
647b6bf6e1Pavel Zakharov	}
648b6bf6e1Pavel Zakharov
649770499eDan Kimmel	abd_free(vp_abd);
650fa9e406ahrens
651fa9e406ahrens	return (config);
652fa9e406ahrens}
653fa9e406ahrens
65439c2341eschrock/*
65539c2341eschrock * Determine if a device is in use.  The 'spare_guid' parameter will be filled
65639c2341eschrock * in with the device guid if this spare is active elsewhere on the system.
65739c2341eschrock */
65839c2341eschrockstatic boolean_t
65939c2341eschrockvdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
660fa94a07brendan    uint64_t *spare_guid, uint64_t *l2cache_guid)
66139c2341eschrock{
66239c2341eschrock	spa_t *spa = vd->vdev_spa;
66339c2341eschrock	uint64_t state, pool_guid, device_guid, txg, spare_pool;
66439c2341eschrock	uint64_t vdtxg = 0;
66539c2341eschrock	nvlist_t *label;
66639c2341eschrock
66739c2341eschrock	if (spare_guid)
66839c2341eschrock		*spare_guid = 0ULL;
669fa94a07brendan	if (l2cache_guid)
670fa94a07brendan		*l2cache_guid = 0ULL;
67139c2341eschrock
67239c2341eschrock	/*
67339c2341eschrock	 * Read the label, if any, and perform some basic sanity checks.
67439c2341eschrock	 */
675dfbb943George Wilson	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
67639c2341eschrock		return (B_FALSE);
67739c2341eschrock
67839c2341eschrock	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
67939c2341eschrock	    &vdtxg);
68039c2341eschrock
68139c2341eschrock	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
68239c2341eschrock	    &state) != 0 ||
68339c2341eschrock	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
68439c2341eschrock	    &device_guid) != 0) {
68539c2341eschrock		nvlist_free(label);
68639c2341eschrock		return (B_FALSE);
68739c2341eschrock	}
68839c2341eschrock
689fa94a07brendan	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
69039c2341eschrock	    (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
69139c2341eschrock	    &pool_guid) != 0 ||
69239c2341eschrock	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
69339c2341eschrock	    &txg) != 0)) {
69439c2341eschrock		nvlist_free(label);
69539c2341eschrock		return (B_FALSE);
69639c2341eschrock	}
69739c2341eschrock
69839c2341eschrock	nvlist_free(label);
69939c2341eschrock
70039c2341eschrock	/*
70139c2341eschrock	 * Check to see if this device indeed belongs to the pool it claims to
70239c2341eschrock	 * be a part of.  If not, it is not in use unless it is a known hot
70339c2341eschrock	 * spare (which we check for later on) or l2cache device.
70439c2341eschrock	 */
705fa94a07brendan	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
70639c2341eschrock	    !spa_guid_exists(pool_guid, device_guid) &&
70789a89eblling	    !spa_spare_exists(device_guid, NULL, NULL) &&
708fa94a07brendan	    !spa_l2cache_exists(device_guid, NULL))
70939c2341eschrock		return (B_FALSE);
71039c2341eschrock
71139c2341eschrock	/*
71239c2341eschrock	 * If the transaction group is zero, then this is an initialized (but
71339c2341eschrock	 * unused) label.  This is only an error if the create transaction
71439c2341eschrock	 * on-disk is the same as the one we're using now, in which case the
71539c2341eschrock	 * user has attempted to add the same vdev multiple times in the same
71639c2341eschrock	 * transaction.
717