2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5441d80alling * Common Development and Distribution License (the "License").
6441d80alling * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
2398d1cbfGeorge Wilson * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
245cabbc6Prashanth Sreenivasa * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
255f368aeYuri Pankov * Copyright 2017 Nexenta Systems, Inc.
26c3d26abMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
27c8811bdToomas Soome * Copyright 2016 Toomas Soome <tsoome@me.com>
28c1064fdJerry Jelinek * Copyright 2019 Joyent, Inc.
29663207aDon Brady * Copyright (c) 2017, Intel Corporation.
30fa9e406ahrens */
32fa9e406ahrens#include <sys/zfs_context.h>
33ea8dc4beschrock#include <sys/fm/fs/zfs.h>
34fa9e406ahrens#include <sys/spa.h>
35fa9e406ahrens#include <sys/spa_impl.h>
365cabbc6Prashanth Sreenivasa#include <sys/bpobj.h>
37fa9e406ahrens#include <sys/dmu.h>
38fa9e406ahrens#include <sys/dmu_tx.h>
395cabbc6Prashanth Sreenivasa#include <sys/dsl_dir.h>
40fa9e406ahrens#include <sys/vdev_impl.h>
41fa9e406ahrens#include <sys/uberblock_impl.h>
42fa9e406ahrens#include <sys/metaslab.h>
43fa9e406ahrens#include <sys/metaslab_impl.h>
44fa9e406ahrens#include <sys/space_map.h>
450713e23George Wilson#include <sys/space_reftree.h>
46fa9e406ahrens#include <sys/zio.h>
47fa9e406ahrens#include <sys/zap.h>
48fa9e406ahrens#include <sys/fs/zfs.h>
49c5904d1eschrock#include <sys/arc.h>
50e6ca193George Wilson#include <sys/zil.h>
513f9d6adLin Ling#include <sys/dsl_scan.h>
52770499eDan Kimmel#include <sys/abd.h>
53094e47eGeorge Wilson#include <sys/vdev_initialize.h>
54084fd14Brian Behlendorf#include <sys/vdev_trim.h>
/*
 * Virtual device management.
 */
60fa9e406ahrensstatic vdev_ops_t *vdev_ops_table[] = {
61fa9e406ahrens	&vdev_root_ops,
62fa9e406ahrens	&vdev_raidz_ops,
63fa9e406ahrens	&vdev_mirror_ops,
64fa9e406ahrens	&vdev_replacing_ops,
6599653d4eschrock	&vdev_spare_ops,
66fa9e406ahrens	&vdev_disk_ops,
67fa9e406ahrens	&vdev_file_ops,
68fa9e406ahrens	&vdev_missing_ops,
6988ecc94George Wilson	&vdev_hole_ops,
705cabbc6Prashanth Sreenivasa	&vdev_indirect_ops,
71fa9e406ahrens	NULL
74088f389ahrens/* maximum scrub/resilver I/O queue per leaf vdev */
75088f389ahrensint zfs_scrub_limit = 10;
77a0b03b1Serapheim Dimitropoulos/* default target for number of metaslabs per top-level vdev */
78a0b03b1Serapheim Dimitropoulosint zfs_vdev_default_ms_count = 200;
798671400Serapheim Dimitropoulos
80b4bf0cfDon Brady/* minimum number of metaslabs per top-level vdev */
81a0b03b1Serapheim Dimitropoulosint zfs_vdev_min_ms_count = 16;
828671400Serapheim Dimitropoulos
83b4bf0cfDon Brady/* practical upper limit of total metaslabs per top-level vdev */
84a0b03b1Serapheim Dimitropoulosint zfs_vdev_ms_count_limit = 1ULL << 17;
85b4bf0cfDon Brady
86b4bf0cfDon Brady/* lower limit for metaslab size (512M) */
87a0b03b1Serapheim Dimitropoulosint zfs_vdev_default_ms_shift = 29;
888671400Serapheim Dimitropoulos
89a0b03b1Serapheim Dimitropoulos/* upper limit for metaslab size (16G) */
90a0b03b1Serapheim Dimitropoulosint zfs_vdev_max_ms_shift = 34;
91b4bf0cfDon Brady
928671400Serapheim Dimitropoulosboolean_t vdev_validate_skip = B_FALSE;
938671400Serapheim Dimitropoulos
958671400Serapheim Dimitropoulos * Since the DTL space map of a vdev is not expected to have a lot of
968671400Serapheim Dimitropoulos * entries, we default its block size to 4K.
97bf3e216Matthew Ahrens */
98814dcd4Serapheim Dimitropoulosint zfs_vdev_dtl_sm_blksz = (1 << 12);
99bf3e216Matthew Ahrens
1008671400Serapheim Dimitropoulos/*
1018671400Serapheim Dimitropoulos * vdev-wide space maps that have lots of entries written to them at
1028671400Serapheim Dimitropoulos * the end of each transaction can benefit from a higher I/O bandwidth
1038671400Serapheim Dimitropoulos * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
1048671400Serapheim Dimitropoulos */
105814dcd4Serapheim Dimitropoulosint zfs_vdev_standard_sm_blksz = (1 << 17);
1066f79381Pavel Zakharov
10793a1902Matthew Ahrensint zfs_ashift_min;
10893a1902Matthew Ahrens
1093ee8c80Pavel Zakharov/*PRINTFLIKE2*/
1103ee8c80Pavel Zakharovvoid
1113ee8c80Pavel Zakharovvdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
1123ee8c80Pavel Zakharov{
1133ee8c80Pavel Zakharov	va_list adx;
1143ee8c80Pavel Zakharov	char buf[256];
1153ee8c80Pavel Zakharov
1163ee8c80Pavel Zakharov	va_start(adx, fmt);
1173ee8c80Pavel Zakharov	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
1183ee8c80Pavel Zakharov	va_end(adx);
1193ee8c80Pavel Zakharov
1203ee8c80Pavel Zakharov	if (vd->vdev_path != NULL) {
1213ee8c80Pavel Zakharov		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
1223ee8c80Pavel Zakharov		    vd->vdev_path, buf);
1233ee8c80Pavel Zakharov	} else {
1243ee8c80Pavel Zakharov		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
1253ee8c80Pavel Zakharov		    vd->vdev_ops->vdev_op_type,
1263ee8c80Pavel Zakharov		    (u_longlong_t)vd->vdev_id,
1273ee8c80Pavel Zakharov		    (u_longlong_t)vd->vdev_guid, buf);
1283ee8c80Pavel Zakharov	}
1293ee8c80Pavel Zakharov}
1303ee8c80Pavel Zakharov
1316f79381Pavel Zakharovvoid
1326f79381Pavel Zakharovvdev_dbgmsg_print_tree(vdev_t *vd, int indent)
1336f79381Pavel Zakharov{
1346f79381Pavel Zakharov	char state[20];
1356f79381Pavel Zakharov
1366f79381Pavel Zakharov	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
1376f79381Pavel Zakharov		zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
1386f79381Pavel Zakharov		    vd->vdev_ops->vdev_op_type);
1396f79381Pavel Zakharov		return;
1406f79381Pavel Zakharov	}
1416f79381Pavel Zakharov
1426f79381Pavel Zakharov	switch (vd->vdev_state) {
1436f79381Pavel Zakharov	case VDEV_STATE_UNKNOWN:
1446f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "unknown");
1456f79381Pavel Zakharov		break;
1466f79381Pavel Zakharov	case VDEV_STATE_CLOSED:
1476f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "closed");
1486f79381Pavel Zakharov		break;
1496f79381Pavel Zakharov	case VDEV_STATE_OFFLINE:
1506f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "offline");
1516f79381Pavel Zakharov		break;
1526f79381Pavel Zakharov	case VDEV_STATE_REMOVED:
1536f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "removed");
1546f79381Pavel Zakharov		break;
1556f79381Pavel Zakharov	case VDEV_STATE_CANT_OPEN:
1566f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "can't open");
1576f79381Pavel Zakharov		break;
1586f79381Pavel Zakharov	case VDEV_STATE_FAULTED:
1596f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "faulted");
1606f79381Pavel Zakharov		break;
1616f79381Pavel Zakharov	case VDEV_STATE_DEGRADED:
1626f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "degraded");
1636f79381Pavel Zakharov		break;
1646f79381Pavel Zakharov	case VDEV_STATE_HEALTHY:
1656f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "healthy");
1666f79381Pavel Zakharov		break;
1676f79381Pavel Zakharov	default:
1686f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "<state %u>",
1696f79381Pavel Zakharov		    (uint_t)vd->vdev_state);
1706f79381Pavel Zakharov	}
1716f79381Pavel Zakharov
1726f79381Pavel Zakharov	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
173c7a7b2fAndriy Gapon	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
1746f79381Pavel Zakharov	    vd->vdev_islog ? " (log)" : "",
1756f79381Pavel Zakharov	    (u_longlong_t)vd->vdev_guid,
1766f79381Pavel Zakharov	    vd->vdev_path ? vd->vdev_path : "N/A", state);
1776f79381Pavel Zakharov
1786f79381Pavel Zakharov	for (uint64_t i = 0; i < vd->vdev_children; i++)
1796f79381Pavel Zakharov		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
1806f79381Pavel Zakharov}
1816f79381Pavel Zakharov
182bf3e216Matthew Ahrens/*
183fa9e406ahrens * Given a vdev type, return the appropriate ops vector.
184fa9e406ahrens */
185fa9e406ahrensstatic vdev_ops_t *
186fa9e406ahrensvdev_getops(const char *type)
188fa9e406ahrens	vdev_ops_t *ops, **opspp;
190fa9e406ahrens	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
191fa9e406ahrens		if (strcmp(ops->vdev_op_type, type) == 0)
192fa9e406ahrens			break;
194fa9e406ahrens	return (ops);
197663207aDon Brady/*
198663207aDon Brady * Derive the enumerated alloction bias from string input.
199663207aDon Brady * String origin is either the per-vdev zap or zpool(1M).
200663207aDon Brady */
201663207aDon Bradystatic vdev_alloc_bias_t
202663207aDon Bradyvdev_derive_alloc_bias(const char *bias)
203663207aDon Brady{
204663207aDon Brady	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
205663207aDon Brady
206663207aDon Brady	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
207663207aDon Brady		alloc_bias = VDEV_BIAS_LOG;
208663207aDon Brady	else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
209663207aDon Brady		alloc_bias = VDEV_BIAS_SPECIAL;
210663207aDon Brady	else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
211663207aDon Brady		alloc_bias = VDEV_BIAS_DEDUP;
212663207aDon Brady
213663207aDon Brady	return (alloc_bias);
214663207aDon Brady}
215663207aDon Brady
216094e47eGeorge Wilson/* ARGSUSED */
217094e47eGeorge Wilsonvoid
218094e47eGeorge Wilsonvdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
219094e47eGeorge Wilson{
220094e47eGeorge Wilson	res->rs_start = in->rs_start;
221094e47eGeorge Wilson	res->rs_end = in->rs_end;
222094e47eGeorge Wilson}
223094e47eGeorge Wilson
225fa9e406ahrens * Default asize function: return the MAX of psize with the asize of
226fa9e406ahrens * all children.  This is what's used by anything other than RAID-Z.
227fa9e406ahrens */
229fa9e406ahrensvdev_default_asize(vdev_t *vd, uint64_t psize)
231ecc2d60bonwick	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
232fa9e406ahrens	uint64_t csize;
234573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++) {
235fa9e406ahrens		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
236fa9e406ahrens		asize = MAX(asize, csize);
237fa9e406ahrens	}
239fa9e406ahrens	return (asize);
243573ca77George Wilson * Get the minimum allocatable size. We define the allocatable size as
244573ca77George Wilson * the vdev's asize rounded to the nearest metaslab. This allows us to
245573ca77George Wilson * replace or attach devices which don't have the same physical size but
246573ca77George Wilson * can still satisfy the same number of allocations.
2472a79c5flling */
249573ca77George Wilsonvdev_get_min_asize(vdev_t *vd)
251573ca77George Wilson	vdev_t *pvd = vd->vdev_parent;
253573ca77George Wilson	/*
2544263d13George Wilson	 * If our parent is NULL (inactive spare or cache) or is the root,
255573ca77George Wilson	 * just return our own asize.
256573ca77George Wilson	 */
257573ca77George Wilson	if (pvd == NULL)
258573ca77George Wilson		return (vd->vdev_asize);
2602a79c5flling	/*
261573ca77George Wilson	 * The top-level vdev just returns the allocatable size rounded
262573ca77George Wilson	 * to the nearest metaslab.
2632a79c5flling	 */
264573ca77George Wilson	if (vd == vd->vdev_top)
265573ca77George Wilson		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
267573ca77George Wilson	/*
268573ca77George Wilson	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
269573ca77George Wilson	 * so each child must provide at least 1/Nth of its asize.
270573ca77George Wilson	 */
271573ca77George Wilson	if (pvd->vdev_ops == &vdev_raidz_ops)
272c040c10Steven Hartland		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
273c040c10Steven Hartland		    pvd->vdev_children);
275573ca77George Wilson	return (pvd->vdev_min_asize);
276573ca77George Wilson}
278573ca77George Wilsonvoid
279573ca77George Wilsonvdev_set_min_asize(vdev_t *vd)
280573ca77George Wilson{
281573ca77George Wilson	vd->vdev_min_asize = vdev_get_min_asize(vd);
282573ca77George Wilson
283573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++)
284573ca77George Wilson		vdev_set_min_asize(vd->vdev_child[c]);
287fa9e406ahrensvdev_t *
288fa9e406ahrensvdev_lookup_top(spa_t *spa, uint64_t vdev)
290fa9e406ahrens	vdev_t *rvd = spa->spa_root_vdev;
292e14bb32Jeff Bonwick	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
294088f389ahrens	if (vdev < rvd->vdev_children) {
295088f389ahrens		ASSERT(rvd->vdev_child[vdev] != NULL);
296fa9e406ahrens		return (rvd->vdev_child[vdev]);
297088f389ahrens	}
299fa9e406ahrens	return (NULL);
302fa9e406ahrensvdev_t *
303fa9e406ahrensvdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
305fa9e406ahrens	vdev_t *mvd;
3070e34b6abonwick	if (vd->vdev_guid == guid)
308fa9e406ahrens		return (vd);
310573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++)
311fa9e406ahrens		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
312fa9e406ahrens		    NULL)
313fa9e406ahrens			return (mvd);
315fa9e406ahrens	return (NULL);
31812380e1Arne Jansenstatic int
31912380e1Arne Jansenvdev_count_leaves_impl(vdev_t *vd)
32012380e1Arne Jansen{
32112380e1Arne Jansen	int n = 0;
32212380e1Arne Jansen
32312380e1Arne Jansen	if (vd->vdev_ops->vdev_op_leaf)
32412380e1Arne Jansen		return (1);
32512380e1Arne Jansen
32612380e1Arne Jansen	for (int c = 0; c < vd->vdev_children; c++)
32712380e1Arne Jansen		n += vdev_count_leaves_impl(vd->vdev_child[c]);
32812380e1Arne Jansen
32912380e1Arne Jansen	return (n);
33012380e1Arne Jansen}
33112380e1Arne Jansen
33212380e1Arne Jansenint
33312380e1Arne Jansenvdev_count_leaves(spa_t *spa)
33412380e1Arne Jansen{
33512380e1Arne Jansen	return (vdev_count_leaves_impl(spa->spa_root_vdev));
33612380e1Arne Jansen}
33712380e1Arne Jansen
339fa9e406ahrensvdev_add_child(vdev_t *pvd, vdev_t *cvd)
341fa9e406ahrens	size_t oldsize, newsize;
342fa9e406ahrens	uint64_t id = cvd->vdev_id;
343fa9e406ahrens	vdev_t **newchild;
34481cd5c5Matthew Ahrens	spa_t *spa = cvd->vdev_spa;
34681cd5c5Matthew Ahrens	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
347fa9e406ahrens	ASSERT(cvd->vdev_parent == NULL);
349fa9e406ahrens	cvd->vdev_parent = pvd;
351fa9e406ahrens	if (pvd == NULL)
352fa9e406ahrens		return;
354fa9e406ahrens	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
356fa9e406ahrens	oldsize = pvd->vdev_children * sizeof (vdev_t *);
357fa9e406ahrens	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
358fa9e406ahrens	newsize = pvd->vdev_children * sizeof (vdev_t *);
360fa9e406ahrens	newchild = kmem_zalloc(newsize, KM_SLEEP);
361fa9e406ahrens	if (pvd->vdev_child != NULL) {
362fa9e406ahrens		bcopy(pvd->vdev_child, newchild, oldsize);
363fa9e406ahrens		kmem_free(pvd->vdev_child, oldsize);
364fa9e406ahrens	}
366fa9e406ahrens	pvd->vdev_child = newchild;
367fa9e406ahrens	pvd->vdev_child[id] = cvd;
369fa9e406ahrens	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
370fa9e406ahrens	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
372fa9e406ahrens	/*
373fa9e406ahrens	 * Walk up all ancestors to update guid sum.
374fa9e406ahrens	 */
375fa9e406ahrens	for (; pvd != NULL; pvd = pvd->vdev_parent)
376fa9e406ahrens		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
377e0f1c0aOlaf Faaland
378e0f1c0aOlaf Faaland	if (cvd->vdev_ops->vdev_op_leaf) {
379e0f1c0aOlaf Faaland		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
380e0f1c0aOlaf Faaland		cvd->vdev_spa->spa_leaf_list_gen++;
381e0f1c0aOlaf Faaland	}
385fa9e406ahrensvdev_remove_child(vdev_t *pvd, vdev_t *cvd)
387fa9e406ahrens	int c;
388fa9e406ahrens	uint_t id = cvd->vdev_id;
390fa9e406ahrens	ASSERT(cvd->vdev_parent == pvd);
392fa9e406ahrens	if (pvd == NULL)
393fa9e406ahrens		return;
395fa9e406ahrens	ASSERT(id < pvd->vdev_children);
396fa9e406ahrens	ASSERT(pvd->vdev_child[id] == cvd);
398fa9e406ahrens	pvd->vdev_child[id] = NULL;
399fa9e406ahrens	cvd->vdev_parent = NULL;
401fa9e406ahrens	for (c = 0; c < pvd->vdev_children; c++)
402fa9e406ahrens		if (pvd->vdev_child[c])
403fa9e406ahrens			break;
405fa9e406ahrens	if (c == pvd->vdev_children) {
406fa9e406ahrens		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
407fa9e406ahrens		pvd->vdev_child = NULL;
408fa9e406ahrens		pvd->vdev_children = 0;
409fa9e406ahrens	}
411e0f1c0aOlaf Faaland	if (cvd->vdev_ops->vdev_op_leaf) {
412e0f1c0aOlaf Faaland		spa_t *spa = cvd->vdev_spa;
413e0f1c0aOlaf Faaland		list_remove(&spa->spa_leaf_list, cvd);
414e0f1c0aOlaf Faaland		spa->spa_leaf_list_gen++;
415e0f1c0aOlaf Faaland	}
416e0f1c0aOlaf Faaland
417fa9e406ahrens	/*
418fa9e406ahrens	 * Walk up all ancestors to update guid sum.
419fa9e406ahrens	 */
420fa9e406ahrens	for (; pvd != NULL; pvd = pvd->vdev_parent)
421fa9e406ahrens		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
425fa9e406ahrens * Remove any holes in the child array.
426fa9e406ahrens */
428fa9e406ahrensvdev_compact_children(vdev_t *pvd)
430fa9e406ahrens	vdev_t **newchild, *cvd;
431fa9e406ahrens	int oldc = pvd->vdev_children;
432573ca77George Wilson	int newc;
434e14bb32Jeff Bonwick	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
436573ca77George Wilson	for (int c = newc = 0; c < oldc; c++)
437fa9e406ahrens		if (pvd->vdev_child[c])
438fa9e406ahrens			newc++;
440fa9e406ahrens	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
442573ca77George Wilson	for (int c = newc = 0; c < oldc; c++) {
443fa9e406ahrens		if ((cvd = pvd->vdev_child[c]) != NULL) {
444fa9e406ahrens			newchild[newc] = cvd;
445fa9e406ahrens			cvd->vdev_id = newc++;
446fa9e406ahrens		}
447fa9e406ahrens	}
449fa9e406ahrens	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
450fa9e406ahrens	pvd->vdev_child = newchild;
451fa9e406ahrens	pvd->vdev_children = newc;
455fa9e406ahrens * Allocate and minimally initialize a vdev_t.
456fa9e406ahrens */
45788ecc94George Wilsonvdev_t *
458fa9e406ahrensvdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
460fa9e406ahrens	vdev_t *vd;
4615cabbc6Prashanth Sreenivasa	vdev_indirect_config_t *vic;
463fa9e406ahrens	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
4645cabbc6Prashanth Sreenivasa	vic = &vd->vdev_indirect_config;
4660e34b6abonwick	if (spa->spa_root_vdev == NULL) {
4670e34b6abonwick		ASSERT(ops == &vdev_root_ops);
4680e34b6abonwick		spa->spa_root_vdev = vd;
469e9103aaGarrett D'Amore		spa->spa_load_guid = spa_generate_guid(NULL);
4700e34b6abonwick	}
47288ecc94George Wilson	if (guid == 0 && ops != &vdev_hole_ops) {
4730e34b6abonwick		if (spa->spa_root_vdev == vd) {
4740e34b6abonwick			/*
4750e34b6abonwick			 * The root vdev's guid will also be the pool guid,
4760e34b6abonwick			 * which must be unique among all pools.
4770e34b6abonwick			 */
4781195e68Mark J Musante			guid = spa_generate_guid(NULL);
4790e34b6abonwick		} else {
4800e34b6abonwick			/*
4810e34b6abonwick			 * Any other vdev's guid must be unique within the pool.
4820e34b6abonwick			 */
4831195e68Mark J Musante			guid = spa_generate_guid(spa);
4840e34b6abonwick		}
4850e34b6abonwick		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
4860e34b6abonwick	}
488fa9e406ahrens	vd->vdev_spa = spa;
489fa9e406ahrens	vd->vdev_id = id;
490fa9e406ahrens	vd->vdev_guid = guid;
491fa9e406ahrens	vd->vdev_guid_sum = guid;
492fa9e406ahrens	vd->vdev_ops = ops;
493fa9e406ahrens	vd->vdev_state = VDEV_STATE_CLOSED;
49488ecc94George Wilson	vd->vdev_ishole = (ops == &vdev_hole_ops);
4955cabbc6Prashanth Sreenivasa	vic->vic_prev_indirect_vdev = UINT64_MAX;
4965cabbc6Prashanth Sreenivasa
4975cabbc6Prashanth Sreenivasa	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
4985cabbc6Prashanth Sreenivasa	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
4995cabbc6Prashanth Sreenivasa	vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
501084fd14Brian Behlendorf	list_link_init(&vd->vdev_initialize_node);
502e0f1c0aOlaf Faaland	list_link_init(&vd->vdev_leaf_node);
503084fd14Brian Behlendorf	list_link_init(&vd->vdev_trim_node);
504fa9e406ahrens	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
5055ad8204nd	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
506e14bb32Jeff Bonwick	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
507a3874b8Toomas Soome	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
508094e47eGeorge Wilson	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
509094e47eGeorge Wilson	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
510094e47eGeorge Wilson	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
511094e47eGeorge Wilson	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
512084fd14Brian Behlendorf	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
513084fd14Brian Behlendorf	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
514084fd14Brian Behlendorf	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
515084fd14Brian Behlendorf	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
516084fd14Brian Behlendorf	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
517084fd14Brian Behlendorf	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
518094e47eGeorge Wilson
5198ad4d6dJeff Bonwick	for (int t = 0; t < DTL_TYPES; t++) {
5205cabbc6Prashanth Sreenivasa		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
5218ad4d6dJeff Bonwick	}
522b7b2590Matthew Ahrens	txg_list_create(&vd->vdev_ms_list, spa,
523fa9e406ahrens	    offsetof(struct metaslab, ms_txg_node));
524b7b2590Matthew Ahrens	txg_list_create(&vd->vdev_dtl_list, spa,
525fa9e406ahrens	    offsetof(struct vdev, vdev_dtl_node));
526fa9e406ahrens	vd->vdev_stat.vs_timestamp = gethrtime();
5273d7072feschrock	vdev_queue_init(vd);
5283d7072feschrock	vdev_cache_init(vd);
530fa9e406ahrens	return (vd);
/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
53999653d4eschrockvdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
54099653d4eschrock    int alloctype)
542fa9e406ahrens	vdev_ops_t *ops;
543fa9e406ahrens	char *type;
5448654d02perrin	uint64_t guid = 0, islog, nparity;
545fa9e406ahrens	vdev_t *vd;
5465cabbc6Prashanth Sreenivasa	vdev_indirect_config_t *vic;
547663207aDon Brady	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
548663207aDon Brady	boolean_t top_level = (parent && !parent->vdev_parent);
550e14bb32Jeff Bonwick	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
552fa9e406ahrens	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
553be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
555fa9e406ahrens	if ((ops = vdev_getops(type)) == NULL)
556be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
558fa9e406ahrens	/*
559fa9e406ahrens	 * If this is a load, get the vdev guid from the nvlist.
560fa9e406ahrens	 * Otherwise, vdev_alloc_common() will generate one for us.
561fa9e406ahrens	 */
562fa9e406ahrens	if (alloctype == VDEV_ALLOC_LOAD) {
563fa9e406ahrens		uint64_t label_id;
565fa9e406ahrens		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
566fa9e406ahrens		    label_id != id)
567be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
569fa9e406ahrens		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
570be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
57199653d4eschrock	} else if (alloctype == VDEV_ALLOC_SPARE) {
57299653d4eschrock		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
573be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
574fa94a07brendan	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
575fa94a07brendan		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
576be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
57721ecdf6Lin Ling	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
57821ecdf6Lin Ling		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
579be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
580fa9e406ahrens	}
58299653d4eschrock	/*
58399653d4eschrock	 * The first allocated vdev must be of type 'root'.
58499653d4eschrock	 */
58599653d4eschrock	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
586be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
5888654d02perrin	/*
5898654d02perrin	 * Determine whether we're a log vdev.
5908654d02perrin	 */
5918654d02perrin	islog = 0;
5928654d02perrin	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
593990b485lling	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
594be6fd75Matthew Ahrens		return (SET_ERROR(ENOTSUP));
59688ecc94George Wilson	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
597be6fd75Matthew Ahrens		return (SET_ERROR(ENOTSUP));
59888ecc94George Wilson
599fa9e406ahrens	/*
6008654d02perrin	 * Set the nparity property for RAID-Z vdevs.
60199653d4eschrock	 */
6028654d02perrin	nparity = -1ULL;
60399653d4eschrock	if (ops == &vdev_raidz_ops) {
60499653d4eschrock		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
6058654d02perrin		    &nparity) == 0) {
606b24ab67Jeff Bonwick			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
607be6fd75Matthew Ahrens				return (SET_ERROR(EINVAL));
60899653d4eschrock			/*
609f94275cAdam Leventhal			 * Previous versions could only support 1 or 2 parity
610f94275cAdam Leventhal			 * device.
61199653d4eschrock			 */
612f94275cAdam Leventhal			if (nparity > 1 &&
613f94275cAdam Leventhal			    spa_version(spa) < SPA_VERSION_RAIDZ2)
614be6fd75Matthew Ahrens				return (SET_ERROR(ENOTSUP));
615f94275cAdam Leventhal			if (nparity > 2 &&
616f94275cAdam Leventhal			    spa_version(spa) < SPA_VERSION_RAIDZ3)
617be6fd75Matthew Ahrens				return (SET_ERROR(ENOTSUP));
61899653d4eschrock		} else {
61999653d4eschrock			/*
62099653d4eschrock			 * We require the parity to be specified for SPAs that
62199653d4eschrock			 * support multiple parity levels.
62299653d4eschrock			 */
623f94275cAdam Leventhal			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
624be6fd75Matthew Ahrens				return (SET_ERROR(EINVAL));
62599653d4eschrock			/*
62699653d4eschrock			 * Otherwise, we default to 1 parity device for RAID-Z.
62799653d4eschrock			 */
6288654d02perrin			nparity = 1;
62999653d4eschrock		}
63099653d4eschrock	} else {
6318654d02perrin		nparity = 0;
63299653d4eschrock	}
6338654d02perrin	ASSERT(nparity != -1ULL);
635663207aDon Brady	/*
636663207aDon Brady	 * If creating a top-level vdev, check for allocation classes input
637663207aDon Brady	 */
638663207aDon Brady	if (top_level && alloctype == VDEV_ALLOC_ADD) {
639663207aDon Brady		char *bias;
640663207aDon Brady
641663207aDon Brady		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
642663207aDon Brady		    &bias) == 0) {
643663207aDon Brady			alloc_bias = vdev_derive_alloc_bias(bias);
644663207aDon Brady
645663207aDon Brady			/* spa_vdev_add() expects feature to be enabled */
646c1064fdJerry Jelinek			if (alloc_bias != VDEV_BIAS_LOG &&
647c1064fdJerry Jelinek			    spa->spa_load_state != SPA_LOAD_CREATE &&
648663207aDon Brady			    !spa_feature_is_enabled(spa,
649663207aDon Brady			    SPA_FEATURE_ALLOCATION_CLASSES)) {
650663207aDon Brady				return (SET_ERROR(ENOTSUP));
651663207aDon Brady			}
652663207aDon Brady		}
653663207aDon Brady	}
654663207aDon Brady
6558654d02perrin	vd = vdev_alloc_common(spa, id, guid, ops);
6565cabbc6Prashanth Sreenivasa	vic = &vd->vdev_indirect_config;
6588654d02perrin	vd->vdev_islog = islog;
6598654d02perrin	vd->vdev_nparity = nparity;
660663207aDon Brady	if (top_level && alloc_bias != VDEV_BIAS_NONE)
661663207aDon Brady		vd->vdev_alloc_bias = alloc_bias;
6638654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
6648654d02perrin		vd->vdev_path = spa_strdup(vd->vdev_path);
6658654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
6668654d02perrin		vd->vdev_devid = spa_strdup(vd->vdev_devid);
6678654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
6688654d02perrin	    &vd->vdev_physpath) == 0)
6698654d02perrin		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
6706809eb4Eric Schrock	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
6716809eb4Eric Schrock		vd->vdev_fru = spa_strdup(vd->vdev_fru);
67399653d4eschrock	/*
674afefbcdeschrock	 * Set the whole_disk property.  If it's not specified, leave the value
675afefbcdeschrock	 * as -1.
676afefbcdeschrock	 */
677afefbcdeschrock	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
678afefbcdeschrock	    &vd->vdev_wholedisk) != 0)
679afefbcdeschrock		vd->vdev_wholedisk = -1ULL;
6815cabbc6Prashanth Sreenivasa	ASSERT0(vic->vic_mapping_object);
6825cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
6835cabbc6Prashanth Sreenivasa	    &vic->vic_mapping_object);
6845cabbc6Prashanth Sreenivasa	ASSERT0(vic->vic_births_object);
6855cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
6865cabbc6Prashanth Sreenivasa	    &vic->vic_births_object);
6875cabbc6Prashanth Sreenivasa	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
6885cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
6895cabbc6Prashanth Sreenivasa	    &vic->vic_prev_indirect_vdev);
6905cabbc6Prashanth Sreenivasa
691afefbcdeschrock	/*
692ea8dc4beschrock	 * Look for the 'not present' flag.  This will only be set if the device
693ea8dc4beschrock	 * was not present at the time of import.
694ea8dc4beschrock	 */
6956809eb4Eric Schrock	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
6966809eb4Eric Schrock	    &vd->vdev_not_present);
698ea8dc4beschrock	/*
699ecc2d60bonwick	 * Get the alignment requirement.
700ecc2d60bonwick	 */
701ecc2d60bonwick	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
703ecc2d60bonwick	/*
70488ecc94George Wilson	 * Retrieve the vdev creation time.
70588ecc94George Wilson	 */
70688ecc94George Wilson	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
70788ecc94George Wilson	    &vd->vdev_crtxg);
70888ecc94George Wilson
70988ecc94George Wilson	/*
710fa9e406ahrens	 * If we're a top-level vdev, try to load the allocation parameters.
711fa9e406ahrens	 */
712663207aDon Brady	if (top_level &&
7131195e68Mark J Musante	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
714fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
715fa9e406ahrens		    &vd->vdev_ms_array);
716fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
717fa9e406ahrens		    &vd->vdev_ms_shift);
718fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
719fa9e406ahrens		    &vd->vdev_asize);
7203f9d6adLin Ling		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
7213f9d6adLin Ling		    &vd->vdev_removing);
722215198aJoe Stein		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
723215198aJoe Stein		    &vd->vdev_top_zap);
724215198aJoe Stein	} else {
725215198aJoe Stein		ASSERT0(vd->vdev_top_zap);
726fa9e406ahrens	}
728663207aDon Brady	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
729a152156Jeff Bonwick		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
7309f4ab4dGeorge Wilson		    alloctype == VDEV_ALLOC_ADD ||
7311195e68Mark J Musante		    alloctype == VDEV_ALLOC_SPLIT ||
7329f4ab4dGeorge Wilson		    alloctype == VDEV_ALLOC_ROOTPOOL);
733663207aDon Brady		/* Note: metaslab_group_create() is now deferred */
734a152156Jeff Bonwick	}
735a152156Jeff Bonwick
736215198aJoe Stein	if (vd->vdev_ops->vdev_op_leaf &&
737215198aJoe Stein	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
738215198aJoe Stein		(void) nvlist_lookup_uint64(nv,
739215198aJoe Stein		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
740215198aJoe Stein	} else {
741215198aJoe Stein		ASSERT0(vd->vdev_leaf_zap);
742215198aJoe Stein	}
743215198aJoe Stein
744fa9e406ahrens	/*
7453d7072feschrock	 * If we're a leaf vdev, try to load the DTL object and other state.
746fa9e406ahrens	 */
747215198aJoe Stein
748c5904d1eschrock	if (vd->vdev_ops->vdev_op_leaf &&
74921ecdf6Lin Ling	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
75021ecdf6Lin Ling	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
751c5904d1eschrock		if (alloctype == VDEV_ALLOC_LOAD) {
752c5904d1eschrock			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
7530713e23George Wilson			    &vd->vdev_dtl_object);
754c5904d1eschrock			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
755c5904d1eschrock			    &vd->vdev_unspare);
756c5904d1eschrock		}
75721ecdf6Lin Ling
75821ecdf6Lin Ling		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
75921ecdf6Lin Ling			uint64_t spare = 0;
76021ecdf6Lin Ling