2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5441d80alling * Common Development and Distribution License (the "License").
6441d80alling * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
2398d1cbfGeorge Wilson * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
245cabbc6Prashanth Sreenivasa * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
255f368aeYuri Pankov * Copyright 2017 Nexenta Systems, Inc.
26c3d26abMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
27c8811bdToomas Soome * Copyright 2016 Toomas Soome <tsoome@me.com>
28c1064fdJerry Jelinek * Copyright 2019 Joyent, Inc.
29663207aDon Brady * Copyright (c) 2017, Intel Corporation.
30fa9e406ahrens */
32fa9e406ahrens#include <sys/zfs_context.h>
33ea8dc4beschrock#include <sys/fm/fs/zfs.h>
34fa9e406ahrens#include <sys/spa.h>
35fa9e406ahrens#include <sys/spa_impl.h>
365cabbc6Prashanth Sreenivasa#include <sys/bpobj.h>
37fa9e406ahrens#include <sys/dmu.h>
38fa9e406ahrens#include <sys/dmu_tx.h>
395cabbc6Prashanth Sreenivasa#include <sys/dsl_dir.h>
40fa9e406ahrens#include <sys/vdev_impl.h>
41fa9e406ahrens#include <sys/uberblock_impl.h>
42fa9e406ahrens#include <sys/metaslab.h>
43fa9e406ahrens#include <sys/metaslab_impl.h>
44fa9e406ahrens#include <sys/space_map.h>
450713e23George Wilson#include <sys/space_reftree.h>
46fa9e406ahrens#include <sys/zio.h>
47fa9e406ahrens#include <sys/zap.h>
48fa9e406ahrens#include <sys/fs/zfs.h>
49c5904d1eschrock#include <sys/arc.h>
50e6ca193George Wilson#include <sys/zil.h>
513f9d6adLin Ling#include <sys/dsl_scan.h>
52770499eDan Kimmel#include <sys/abd.h>
53094e47eGeorge Wilson#include <sys/vdev_initialize.h>
54084fd14Brian Behlendorf#include <sys/vdev_trim.h>
/*
 * Virtual device management.
 */
60fa9e406ahrensstatic vdev_ops_t *vdev_ops_table[] = {
61fa9e406ahrens	&vdev_root_ops,
62fa9e406ahrens	&vdev_raidz_ops,
63fa9e406ahrens	&vdev_mirror_ops,
64fa9e406ahrens	&vdev_replacing_ops,
6599653d4eschrock	&vdev_spare_ops,
66fa9e406ahrens	&vdev_disk_ops,
67fa9e406ahrens	&vdev_file_ops,
68fa9e406ahrens	&vdev_missing_ops,
6988ecc94George Wilson	&vdev_hole_ops,
705cabbc6Prashanth Sreenivasa	&vdev_indirect_ops,
71fa9e406ahrens	NULL
74088f389ahrens/* maximum scrub/resilver I/O queue per leaf vdev */
75088f389ahrensint zfs_scrub_limit = 10;
77a0b03b1Serapheim Dimitropoulos/* default target for number of metaslabs per top-level vdev */
78a0b03b1Serapheim Dimitropoulosint zfs_vdev_default_ms_count = 200;
798671400Serapheim Dimitropoulos
80b4bf0cfDon Brady/* minimum number of metaslabs per top-level vdev */
81a0b03b1Serapheim Dimitropoulosint zfs_vdev_min_ms_count = 16;
828671400Serapheim Dimitropoulos
83b4bf0cfDon Brady/* practical upper limit of total metaslabs per top-level vdev */
84a0b03b1Serapheim Dimitropoulosint zfs_vdev_ms_count_limit = 1ULL << 17;
85b4bf0cfDon Brady
86b4bf0cfDon Brady/* lower limit for metaslab size (512M) */
87a0b03b1Serapheim Dimitropoulosint zfs_vdev_default_ms_shift = 29;
888671400Serapheim Dimitropoulos
89a0b03b1Serapheim Dimitropoulos/* upper limit for metaslab size (16G) */
90a0b03b1Serapheim Dimitropoulosint zfs_vdev_max_ms_shift = 34;
91b4bf0cfDon Brady
928671400Serapheim Dimitropoulosboolean_t vdev_validate_skip = B_FALSE;
938671400Serapheim Dimitropoulos
958671400Serapheim Dimitropoulos * Since the DTL space map of a vdev is not expected to have a lot of
968671400Serapheim Dimitropoulos * entries, we default its block size to 4K.
97bf3e216Matthew Ahrens */
98814dcd4Serapheim Dimitropoulosint zfs_vdev_dtl_sm_blksz = (1 << 12);
99bf3e216Matthew Ahrens
1008671400Serapheim Dimitropoulos/*
1018671400Serapheim Dimitropoulos * vdev-wide space maps that have lots of entries written to them at
1028671400Serapheim Dimitropoulos * the end of each transaction can benefit from a higher I/O bandwidth
1038671400Serapheim Dimitropoulos * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
1048671400Serapheim Dimitropoulos */
105814dcd4Serapheim Dimitropoulosint zfs_vdev_standard_sm_blksz = (1 << 17);
1066f79381Pavel Zakharov
10793a1902Matthew Ahrensint zfs_ashift_min;
10893a1902Matthew Ahrens
1093ee8c80Pavel Zakharov/*PRINTFLIKE2*/
1103ee8c80Pavel Zakharovvoid
1113ee8c80Pavel Zakharovvdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
1123ee8c80Pavel Zakharov{
1133ee8c80Pavel Zakharov	va_list adx;
1143ee8c80Pavel Zakharov	char buf[256];
1153ee8c80Pavel Zakharov
1163ee8c80Pavel Zakharov	va_start(adx, fmt);
1173ee8c80Pavel Zakharov	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
1183ee8c80Pavel Zakharov	va_end(adx);
1193ee8c80Pavel Zakharov
1203ee8c80Pavel Zakharov	if (vd->vdev_path != NULL) {
1213ee8c80Pavel Zakharov		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
1223ee8c80Pavel Zakharov		    vd->vdev_path, buf);
1233ee8c80Pavel Zakharov	} else {
1243ee8c80Pavel Zakharov		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
1253ee8c80Pavel Zakharov		    vd->vdev_ops->vdev_op_type,
1263ee8c80Pavel Zakharov		    (u_longlong_t)vd->vdev_id,
1273ee8c80Pavel Zakharov		    (u_longlong_t)vd->vdev_guid, buf);
1283ee8c80Pavel Zakharov	}
1293ee8c80Pavel Zakharov}
1303ee8c80Pavel Zakharov
1316f79381Pavel Zakharovvoid
1326f79381Pavel Zakharovvdev_dbgmsg_print_tree(vdev_t *vd, int indent)
1336f79381Pavel Zakharov{
1346f79381Pavel Zakharov	char state[20];
1356f79381Pavel Zakharov
1366f79381Pavel Zakharov	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
1376f79381Pavel Zakharov		zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
1386f79381Pavel Zakharov		    vd->vdev_ops->vdev_op_type);
1396f79381Pavel Zakharov		return;
1406f79381Pavel Zakharov	}
1416f79381Pavel Zakharov
1426f79381Pavel Zakharov	switch (vd->vdev_state) {
1436f79381Pavel Zakharov	case VDEV_STATE_UNKNOWN:
1446f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "unknown");
1456f79381Pavel Zakharov		break;
1466f79381Pavel Zakharov	case VDEV_STATE_CLOSED:
1476f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "closed");
1486f79381Pavel Zakharov		break;
1496f79381Pavel Zakharov	case VDEV_STATE_OFFLINE:
1506f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "offline");
1516f79381Pavel Zakharov		break;
1526f79381Pavel Zakharov	case VDEV_STATE_REMOVED:
1536f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "removed");
1546f79381Pavel Zakharov		break;
1556f79381Pavel Zakharov	case VDEV_STATE_CANT_OPEN:
1566f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "can't open");
1576f79381Pavel Zakharov		break;
1586f79381Pavel Zakharov	case VDEV_STATE_FAULTED:
1596f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "faulted");
1606f79381Pavel Zakharov		break;
1616f79381Pavel Zakharov	case VDEV_STATE_DEGRADED:
1626f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "degraded");
1636f79381Pavel Zakharov		break;
1646f79381Pavel Zakharov	case VDEV_STATE_HEALTHY:
1656f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "healthy");
1666f79381Pavel Zakharov		break;
1676f79381Pavel Zakharov	default:
1686f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "<state %u>",
1696f79381Pavel Zakharov		    (uint_t)vd->vdev_state);
1706f79381Pavel Zakharov	}
1716f79381Pavel Zakharov
1726f79381Pavel Zakharov	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
173c7a7b2fAndriy Gapon	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
1746f79381Pavel Zakharov	    vd->vdev_islog ? " (log)" : "",
1756f79381Pavel Zakharov	    (u_longlong_t)vd->vdev_guid,
1766f79381Pavel Zakharov	    vd->vdev_path ? vd->vdev_path : "N/A", state);
1776f79381Pavel Zakharov
1786f79381Pavel Zakharov	for (uint64_t i = 0; i < vd->vdev_children; i++)
1796f79381Pavel Zakharov		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
1806f79381Pavel Zakharov}
1816f79381Pavel Zakharov
182bf3e216Matthew Ahrens/*
183fa9e406ahrens * Given a vdev type, return the appropriate ops vector.
184fa9e406ahrens */
185fa9e406ahrensstatic vdev_ops_t *
186fa9e406ahrensvdev_getops(const char *type)
188fa9e406ahrens	vdev_ops_t *ops, **opspp;
190fa9e406ahrens	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
191fa9e406ahrens		if (strcmp(ops->vdev_op_type, type) == 0)
192fa9e406ahrens			break;
194fa9e406ahrens	return (ops);
197663207aDon Brady/*
198663207aDon Brady * Derive the enumerated alloction bias from string input.
199663207aDon Brady * String origin is either the per-vdev zap or zpool(1M).
200663207aDon Brady */
201663207aDon Bradystatic vdev_alloc_bias_t
202663207aDon Bradyvdev_derive_alloc_bias(const char *bias)
203663207aDon Brady{
204663207aDon Brady	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
205663207aDon Brady
206663207aDon Brady	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
207663207aDon Brady		alloc_bias = VDEV_BIAS_LOG;
208663207aDon Brady	else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
209663207aDon Brady		alloc_bias = VDEV_BIAS_SPECIAL;
210663207aDon Brady	else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
211663207aDon Brady		alloc_bias = VDEV_BIAS_DEDUP;
212663207aDon Brady
213663207aDon Brady	return (alloc_bias);
214663207aDon Brady}
215663207aDon Brady
216094e47eGeorge Wilson/* ARGSUSED */
217094e47eGeorge Wilsonvoid
2184d7988dPaul Dagnelievdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
219094e47eGeorge Wilson{
220094e47eGeorge Wilson	res->rs_start = in->rs_start;
221094e47eGeorge Wilson	res->rs_end = in->rs_end;
222094e47eGeorge Wilson}
223094e47eGeorge Wilson
225fa9e406ahrens * Default asize function: return the MAX of psize with the asize of
226fa9e406ahrens * all children.  This is what's used by anything other than RAID-Z.
227fa9e406ahrens */
229fa9e406ahrensvdev_default_asize(vdev_t *vd, uint64_t psize)
231ecc2d60bonwick	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
232fa9e406ahrens	uint64_t csize;
234573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++) {
235fa9e406ahrens		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
236fa9e406ahrens		asize = MAX(asize, csize);
237fa9e406ahrens	}
239fa9e406ahrens	return (asize);
243573ca77George Wilson * Get the minimum allocatable size. We define the allocatable size as
244573ca77George Wilson * the vdev's asize rounded to the nearest metaslab. This allows us to
245573ca77George Wilson * replace or attach devices which don't have the same physical size but
246573ca77George Wilson * can still satisfy the same number of allocations.
2472a79c5flling */
249573ca77George Wilsonvdev_get_min_asize(vdev_t *vd)
251573ca77George Wilson	vdev_t *pvd = vd->vdev_parent;
253573ca77George Wilson	/*
2544263d13George Wilson	 * If our parent is NULL (inactive spare or cache) or is the root,
255573ca77George Wilson	 * just return our own asize.
256573ca77George Wilson	 */
257573ca77George Wilson	if (pvd == NULL)
258573ca77George Wilson		return (vd->vdev_asize);
2602a79c5flling	/*
261573ca77George Wilson	 * The top-level vdev just returns the allocatable size rounded
262573ca77George Wilson	 * to the nearest metaslab.
2632a79c5flling	 */
264573ca77George Wilson	if (vd == vd->vdev_top)
265573ca77George Wilson		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
267573ca77George Wilson	/*
268573ca77George Wilson	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
269573ca77George Wilson	 * so each child must provide at least 1/Nth of its asize.
270573ca77George Wilson	 */
271573ca77George Wilson	if (pvd->vdev_ops == &vdev_raidz_ops)
272c040c10Steven Hartland		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
273c040c10Steven Hartland		    pvd->vdev_children);
275573ca77George Wilson	return (pvd->vdev_min_asize);
276573ca77George Wilson}
278573ca77George Wilsonvoid
279573ca77George Wilsonvdev_set_min_asize(vdev_t *vd)
280573ca77George Wilson{
281573ca77George Wilson	vd->vdev_min_asize = vdev_get_min_asize(vd);
282573ca77George Wilson
283573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++)
284573ca77George Wilson		vdev_set_min_asize(vd->vdev_child[c]);
287fa9e406ahrensvdev_t *
288fa9e406ahrensvdev_lookup_top(spa_t *spa, uint64_t vdev)
290fa9e406ahrens	vdev_t *rvd = spa->spa_root_vdev;
292e14bb32Jeff Bonwick	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
294088f389ahrens	if (vdev < rvd->vdev_children) {
295088f389ahrens		ASSERT(rvd->vdev_child[vdev] != NULL);
296fa9e406ahrens		return (rvd->vdev_child[vdev]);
297088f389ahrens	}
299fa9e406ahrens	return (NULL);
302fa9e406ahrensvdev_t *
303fa9e406ahrensvdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
305fa9e406ahrens	vdev_t *mvd;
3070e34b6abonwick	if (vd->vdev_guid == guid)
308fa9e406ahrens		return (vd);
310573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++)
311fa9e406ahrens		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
312fa9e406ahrens		    NULL)
313fa9e406ahrens			return (mvd);
315fa9e406ahrens	return (NULL);
31812380e1Arne Jansenstatic int
31912380e1Arne Jansenvdev_count_leaves_impl(vdev_t *vd)
32012380e1Arne Jansen{
32112380e1Arne Jansen	int n = 0;
32212380e1Arne Jansen
32312380e1Arne Jansen	if (vd->vdev_ops->vdev_op_leaf)
32412380e1Arne Jansen		return (1);
32512380e1Arne Jansen
32612380e1Arne Jansen	for (int c = 0; c < vd->vdev_children; c++)
32712380e1Arne Jansen		n += vdev_count_leaves_impl(vd->vdev_child[c]);
32812380e1Arne Jansen
32912380e1Arne Jansen	return (n);
33012380e1Arne Jansen}
33112380e1Arne Jansen
33212380e1Arne Jansenint
33312380e1Arne Jansenvdev_count_leaves(spa_t *spa)
33412380e1Arne Jansen{
33512380e1Arne Jansen	return (vdev_count_leaves_impl(spa->spa_root_vdev));
33612380e1Arne Jansen}
33712380e1Arne Jansen
339fa9e406ahrensvdev_add_child(vdev_t *pvd, vdev_t *cvd)
341fa9e406ahrens	size_t oldsize, newsize;
342fa9e406ahrens	uint64_t id = cvd->vdev_id;
343fa9e406ahrens	vdev_t **newchild;
34481cd5c5Matthew Ahrens	spa_t *spa = cvd->vdev_spa;
34681cd5c5Matthew Ahrens	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
347fa9e406ahrens	ASSERT(cvd->vdev_parent == NULL);
349fa9e406ahrens	cvd->vdev_parent = pvd;
351fa9e406ahrens	if (pvd == NULL)
352fa9e406ahrens		return;
354fa9e406ahrens	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
356fa9e406ahrens	oldsize = pvd->vdev_children * sizeof (vdev_t *);
357fa9e406ahrens	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
358fa9e406ahrens	newsize = pvd->vdev_children * sizeof (vdev_t *);
360fa9e406ahrens	newchild = kmem_zalloc(newsize, KM_SLEEP);
361fa9e406ahrens	if (pvd->vdev_child != NULL) {
362fa9e406ahrens		bcopy(pvd->vdev_child, newchild, oldsize);
363fa9e406ahrens		kmem_free(pvd->vdev_child, oldsize);
364fa9e406ahrens	}
366fa9e406ahrens	pvd->vdev_child = newchild;
367fa9e406ahrens	pvd->vdev_child[id] = cvd;
369fa9e406ahrens	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
370fa9e406ahrens	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
372fa9e406ahrens	/*
373fa9e406ahrens	 * Walk up all ancestors to update guid sum.
374fa9e406ahrens	 */
375fa9e406ahrens	for (; pvd != NULL; pvd = pvd->vdev_parent)
376fa9e406ahrens		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
377e0f1c0aOlaf Faaland
378e0f1c0aOlaf Faaland	if (cvd->vdev_ops->vdev_op_leaf) {
379e0f1c0aOlaf Faaland		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
380e0f1c0aOlaf Faaland		cvd->vdev_spa->spa_leaf_list_gen++;
381e0f1c0aOlaf Faaland	}
385fa9e406ahrensvdev_remove_child(vdev_t *pvd, vdev_t *cvd)
387fa9e406ahrens	int c;
388fa9e406ahrens	uint_t id = cvd->vdev_id;
390fa9e406ahrens	ASSERT(cvd->vdev_parent == pvd);
392fa9e406ahrens	if (pvd == NULL)
393fa9e406ahrens		return;
395fa9e406ahrens	ASSERT(id < pvd->vdev_children);
396fa9e406ahrens	ASSERT(pvd->vdev_child[id] == cvd);
398fa9e406ahrens	pvd->vdev_child[id] = NULL;
399fa9e406ahrens	cvd->vdev_parent = NULL;
401fa9e406ahrens	for (c = 0; c < pvd->vdev_children; c++)
402fa9e406ahrens		if (pvd->vdev_child[c])
403fa9e406ahrens			break;
405fa9e406ahrens	if (c == pvd->vdev_children) {
406fa9e406ahrens		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
407fa9e406ahrens		pvd->vdev_child = NULL;
408fa9e406ahrens		pvd->vdev_children = 0;
409fa9e406ahrens	}
411e0f1c0aOlaf Faaland	if (cvd->vdev_ops->vdev_op_leaf) {
412e0f1c0aOlaf Faaland		spa_t *spa = cvd->vdev_spa;
413e0f1c0aOlaf Faaland		list_remove(&spa->spa_leaf_list, cvd);
414e0f1c0aOlaf Faaland		spa->spa_leaf_list_gen++;
415e0f1c0aOlaf Faaland	}
416e0f1c0aOlaf Faaland
417fa9e406ahrens	/*
418fa9e406ahrens	 * Walk up all ancestors to update guid sum.
419fa9e406ahrens	 */
420fa9e406ahrens	for (; pvd != NULL; pvd = pvd->vdev_parent)
421fa9e406ahrens		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
425fa9e406ahrens * Remove any holes in the child array.
426fa9e406ahrens */
428fa9e406ahrensvdev_compact_children(vdev_t *pvd)
430fa9e406ahrens	vdev_t **newchild, *cvd;
431fa9e406ahrens	int oldc = pvd->vdev_children;
432573ca77George Wilson	int newc;
434e14bb32Jeff Bonwick	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
436573ca77George Wilson	for (int c = newc = 0; c < oldc; c++)
437fa9e406ahrens		if (pvd->vdev_child[c])
438fa9e406ahrens			newc++;
440fa9e406ahrens	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
442573ca77George Wilson	for (int c = newc = 0; c < oldc; c++) {
443fa9e406ahrens		if ((cvd = pvd->vdev_child[c]) != NULL) {
444fa9e406ahrens			newchild[newc] = cvd;
445fa9e406ahrens			cvd->vdev_id = newc++;
446fa9e406ahrens		}
447fa9e406ahrens	}
449fa9e406ahrens	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
450fa9e406ahrens	pvd->vdev_child = newchild;
451fa9e406ahrens	pvd->vdev_children = newc;
455fa9e406ahrens * Allocate and minimally initialize a vdev_t.
456fa9e406ahrens */
45788ecc94George Wilsonvdev_t *
458fa9e406ahrensvdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
460fa9e406ahrens	vdev_t *vd;
4615cabbc6Prashanth Sreenivasa	vdev_indirect_config_t *vic;
463fa9e406ahrens	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
4645cabbc6Prashanth Sreenivasa	vic = &vd->vdev_indirect_config;
4660e34b6abonwick	if (spa->spa_root_vdev == NULL) {
4670e34b6abonwick		ASSERT(ops == &vdev_root_ops);
4680e34b6abonwick		spa->spa_root_vdev = vd;
469e9103aaGarrett D'Amore		spa->spa_load_guid = spa_generate_guid(NULL);
4700e34b6abonwick	}
47288ecc94George Wilson	if (guid == 0 && ops != &vdev_hole_ops) {
4730e34b6abonwick		if (spa->spa_root_vdev == vd) {
4740e34b6abonwick			/*
4750e34b6abonwick			 * The root vdev's guid will also be the pool guid,
4760e34b6abonwick			 * which must be unique among all pools.
4770e34b6abonwick			 */
4781195e68Mark J Musante			guid = spa_generate_guid(NULL);
4790e34b6abonwick		} else {
4800e34b6abonwick			/*
4810e34b6abonwick			 * Any other vdev's guid must be unique within the pool.
4820e34b6abonwick			 */
4831195e68Mark J Musante			guid = spa_generate_guid(spa);
4840e34b6abonwick		}
4850e34b6abonwick		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
4860e34b6abonwick	}
488fa9e406ahrens	vd->vdev_spa = spa;
489fa9e406ahrens	vd->vdev_id = id;
490fa9e406ahrens	vd->vdev_guid = guid;
491fa9e406ahrens	vd->vdev_guid_sum = guid;
492fa9e406ahrens	vd->vdev_ops = ops;
493fa9e406ahrens	vd->vdev_state = VDEV_STATE_CLOSED;
49488ecc94George Wilson	vd->vdev_ishole = (ops == &vdev_hole_ops);
4955cabbc6Prashanth Sreenivasa	vic->vic_prev_indirect_vdev = UINT64_MAX;
4965cabbc6Prashanth Sreenivasa
4975cabbc6Prashanth Sreenivasa	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
4985cabbc6Prashanth Sreenivasa	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
4994d7988dPaul Dagnelie	vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
5004d7988dPaul Dagnelie	    0, 0);
502084fd14Brian Behlendorf	list_link_init(&vd->vdev_initialize_node);
503e0f1c0aOlaf Faaland	list_link_init(&vd->vdev_leaf_node);
504084fd14Brian Behlendorf	list_link_init(&vd->vdev_trim_node);
505fa9e406ahrens	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
5065ad8204nd	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
507e14bb32Jeff Bonwick	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
508a3874b8Toomas Soome	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
509094e47eGeorge Wilson	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
510094e47eGeorge Wilson	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
511094e47eGeorge Wilson	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
512094e47eGeorge Wilson	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
513084fd14Brian Behlendorf	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
514084fd14Brian Behlendorf	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
515084fd14Brian Behlendorf	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
516084fd14Brian Behlendorf	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
517084fd14Brian Behlendorf	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
518084fd14Brian Behlendorf	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
519094e47eGeorge Wilson
5208ad4d6dJeff Bonwick	for (int t = 0; t < DTL_TYPES; t++) {
5214d7988dPaul Dagnelie		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
5224d7988dPaul Dagnelie		    0);
5238ad4d6dJeff Bonwick	}
524b7b2590Matthew Ahrens	txg_list_create(&vd->vdev_ms_list, spa,
525fa9e406ahrens	    offsetof(struct metaslab, ms_txg_node));
526b7b2590Matthew Ahrens	txg_list_create(&vd->vdev_dtl_list, spa,
527fa9e406ahrens	    offsetof(struct vdev, vdev_dtl_node));
528fa9e406ahrens	vd->vdev_stat.vs_timestamp = gethrtime();
5293d7072feschrock	vdev_queue_init(vd);
5303d7072feschrock	vdev_cache_init(vd);
532fa9e406ahrens	return (vd);
536fa9e406ahrens * Allocate a new vdev.  The 'alloctype' is used to control whether we are
537fa9e406ahrens * creating a new vdev or loading an existing one - the behavior is slightly
538fa9e406ahrens * different for each case.
539fa9e406ahrens */
54199653d4eschrockvdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
54299653d4eschrock    int alloctype)
544fa9e406ahrens	vdev_ops_t *ops;
545fa9e406ahrens	char *type;
5468654d02perrin	uint64_t guid = 0, islog, nparity;
547fa9e406ahrens	vdev_t *vd;
5485cabbc6Prashanth Sreenivasa	vdev_indirect_config_t *vic;
549663207aDon Brady	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
550663207aDon Brady	boolean_t top_level = (parent && !parent->vdev_parent);
552e14bb32Jeff Bonwick	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
554fa9e406ahrens	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
555be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
557fa9e406ahrens	if ((ops = vdev_getops(type)) == NULL)
558be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
560fa9e406ahrens	/*
561fa9e406ahrens	 * If this is a load, get the vdev guid from the nvlist.
562fa9e406ahrens	 * Otherwise, vdev_alloc_common() will generate one for us.
563fa9e406ahrens	 */
564fa9e406ahrens	if (alloctype == VDEV_ALLOC_LOAD) {
565fa9e406ahrens		uint64_t label_id;
567fa9e406ahrens		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
568fa9e406ahrens		    label_id != id)
569be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
571fa9e406ahrens		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
572be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
57399653d4eschrock	} else if (alloctype == VDEV_ALLOC_SPARE) {
57499653d4eschrock		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
575be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
576fa94a07brendan	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
577fa94a07brendan		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
578be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
57921ecdf6Lin Ling	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
58021ecdf6Lin Ling		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
581be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
582fa9e406ahrens	}
58499653d4eschrock	/*
58599653d4eschrock	 * The first allocated vdev must be of type 'root'.
58699653d4eschrock	 */
58799653d4eschrock	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
588be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
5908654d02perrin	/*
5918654d02perrin	 * Determine whether we're a log vdev.
5928654d02perrin	 */
5938654d02perrin	islog = 0;
5948654d02perrin	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
595990b485lling	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
596be6fd75Matthew Ahrens		return (SET_ERROR(ENOTSUP));
59888ecc94George Wilson	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
599be6fd75Matthew Ahrens		return (SET_ERROR(ENOTSUP));
60088ecc94George Wilson
601fa9e406ahrens	/*
6028654d02perrin	 * Set the nparity property for RAID-Z vdevs.
60399653d4eschrock	 */
6048654d02perrin	nparity = -1ULL;
60599653d4eschrock	if (ops == &vdev_raidz_ops) {
60699653d4eschrock		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
6078654d02perrin		    &nparity) == 0) {
608b24ab67Jeff Bonwick			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
609be6fd75Matthew Ahrens				return (SET_ERROR(EINVAL));
61099653d4eschrock			/*
611f94275cAdam Leventhal			 * Previous versions could only support 1 or 2 parity
612f94275cAdam Leventhal			 * device.
61399653d4eschrock			 */
614f94275cAdam Leventhal			if (nparity > 1 &&
615f94275cAdam Leventhal			    spa_version(spa) < SPA_VERSION_RAIDZ2)
616be6fd75Matthew Ahrens				return (SET_ERROR(ENOTSUP));
617f94275cAdam Leventhal			if (nparity > 2 &&
618f94275cAdam Leventhal			    spa_version(spa) < SPA_VERSION_RAIDZ3)
619be6fd75Matthew Ahrens				return (SET_ERROR(ENOTSUP));
62099653d4eschrock		} else {
62199653d4eschrock			/*
62299653d4eschrock			 * We require the parity to be specified for SPAs that
62399653d4eschrock			 * support multiple parity levels.
62499653d4eschrock			 */
625f94275cAdam Leventhal			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
626be6fd75Matthew Ahrens				return (SET_ERROR(EINVAL));
62799653d4eschrock			/*
62899653d4eschrock			 * Otherwise, we default to 1 parity device for RAID-Z.
62999653d4eschrock			 */
6308654d02perrin			nparity = 1;
63199653d4eschrock		}
63299653d4eschrock	} else {
6338654d02perrin		nparity = 0;
63499653d4eschrock	}
6358654d02perrin	ASSERT(nparity != -1ULL);
637663207aDon Brady	/*
638663207aDon Brady	 * If creating a top-level vdev, check for allocation classes input
639663207aDon Brady	 */
640663207aDon Brady	if (top_level && alloctype == VDEV_ALLOC_ADD) {
641663207aDon Brady		char *bias;
642663207aDon Brady
643663207aDon Brady		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
644663207aDon Brady		    &bias) == 0) {
645663207aDon Brady			alloc_bias = vdev_derive_alloc_bias(bias);
646663207aDon Brady
647663207aDon Brady			/* spa_vdev_add() expects feature to be enabled */
648c1064fdJerry Jelinek			if (alloc_bias != VDEV_BIAS_LOG &&
649c1064fdJerry Jelinek			    spa->spa_load_state != SPA_LOAD_CREATE &&
650663207aDon Brady			    !spa_feature_is_enabled(spa,
651663207aDon Brady			    SPA_FEATURE_ALLOCATION_CLASSES)) {
652663207aDon Brady				return (SET_ERROR(ENOTSUP));
653663207aDon Brady			}
654663207aDon Brady		}
655663207aDon Brady	}
656663207aDon Brady
6578654d02perrin	vd = vdev_alloc_common(spa, id, guid, ops);
6585cabbc6Prashanth Sreenivasa	vic = &vd->vdev_indirect_config;
6608654d02perrin	vd->vdev_islog = islog;
6618654d02perrin	vd->vdev_nparity = nparity;
662663207aDon Brady	if (top_level && alloc_bias != VDEV_BIAS_NONE)
663663207aDon Brady		vd->vdev_alloc_bias = alloc_bias;
6658654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
6668654d02perrin		vd->vdev_path = spa_strdup(vd->vdev_path);
6678654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
6688654d02perrin		vd->vdev_devid = spa_strdup(vd->vdev_devid);
6698654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
6708654d02perrin	    &vd->vdev_physpath) == 0)
6718654d02perrin		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
6726809eb4Eric Schrock	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
6736809eb4Eric Schrock		vd->vdev_fru = spa_strdup(vd->vdev_fru);
67599653d4eschrock	/*
676afefbcdeschrock	 * Set the whole_disk property.  If it's not specified, leave the value
677afefbcdeschrock	 * as -1.
678afefbcdeschrock	 */
679afefbcdeschrock	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
680afefbcdeschrock	    &vd->vdev_wholedisk) != 0)
681afefbcdeschrock		vd->vdev_wholedisk = -1ULL;
6835cabbc6Prashanth Sreenivasa	ASSERT0(vic->vic_mapping_object);
6845cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
6855cabbc6Prashanth Sreenivasa	    &vic->vic_mapping_object);
6865cabbc6Prashanth Sreenivasa	ASSERT0(vic->vic_births_object);
6875cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
6885cabbc6Prashanth Sreenivasa	    &vic->vic_births_object);
6895cabbc6Prashanth Sreenivasa	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
6905cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
6915cabbc6Prashanth Sreenivasa	    &vic->vic_prev_indirect_vdev);
6925cabbc6Prashanth Sreenivasa
693afefbcdeschrock	/*
694ea8dc4beschrock	 * Look for the 'not present' flag.  This will only be set if the device
695ea8dc4beschrock	 * was not present at the time of import.
696ea8dc4beschrock	 */
6976809eb4Eric Schrock	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
6986809eb4Eric Schrock	    &vd->vdev_not_present);
700ea8dc4beschrock	/*
701ecc2d60bonwick	 * Get the alignment requirement.
702ecc2d60bonwick	 */
703ecc2d60bonwick	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
705ecc2d60bonwick	/*
70688ecc94George Wilson	 * Retrieve the vdev creation time.
70788ecc94George Wilson	 */
70888ecc94George Wilson	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
70988ecc94George Wilson	    &vd->vdev_crtxg);
71088ecc94George Wilson
71188ecc94George Wilson	/*
712fa9e406ahrens	 * If we're a top-level vdev, try to load the allocation parameters.
713fa9e406ahrens	 */
714663207aDon Brady	if (top_level &&
7151195e68Mark J Musante	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
716fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
717fa9e406ahrens		    &vd->vdev_ms_array);
718fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
719fa9e406ahrens		    &vd->vdev_ms_shift);
720fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
721fa9e406ahrens		    &vd->vdev_asize);
7223f9d6adLin Ling		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
7233f9d6adLin Ling		    &vd->vdev_removing);
724215198aJoe Stein		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
725215198aJoe Stein		    &vd->vdev_top_zap);
726215198aJoe Stein	} else {
727215198aJoe Stein		ASSERT0(vd->vdev_top_zap);
728fa9e406ahrens	}
730663207aDon Brady	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
731a152156Jeff Bonwick		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
7329f4ab4dGeorge Wilson		    alloctype == VDEV_ALLOC_ADD ||
7331195e68Mark J Musante		    alloctype == VDEV_ALLOC_SPLIT ||
7349f4ab4dGeorge Wilson		    alloctype == VDEV_ALLOC_ROOTPOOL);
735663207aDon Brady		/* Note: metaslab_group_create() is now deferred */
736a152156Jeff Bonwick	}
737a152156Jeff Bonwick
738215198aJoe Stein	if (vd->vdev_ops->vdev_op_leaf &&
739215198aJoe Stein	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
740215198aJoe Stein		(void) nvlist_lookup_uint64(nv,
741215198aJoe Stein		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
742215198aJoe Stein	} else {
743215198aJoe Stein		ASSERT0(vd->vdev_leaf_zap);
744215198aJoe Stein	}
745215198aJoe Stein
746fa9e406ahrens	/*
7473d7072feschrock	 * If we're a leaf vdev, try to load the DTL object and other state.
748fa9e406ahrens	 */
749215198aJoe Stein
750c5904d1eschrock	if (vd->vdev_ops->vdev_op_leaf &&
75121ecdf6Lin Ling	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
75221ecdf6Lin Ling	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
753c5904d1eschrock		if (alloctype == VDEV_ALLOC_LOAD) {
754c5904d1eschrock			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
7550713e23George Wilson			    &vd->vdev_dtl_object);
756c5904d1eschrock			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
757c5904d1eschrock			    &vd->vdev_unspare);
758c5904d1eschrock		}