xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev.c (revision bbf21555)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5441d80aaSlling  * Common Development and Distribution License (the "License").
6441d80aaSlling  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
22fa9e4066Sahrens /*
2398d1cbfeSGeorge Wilson  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
245cabbc6bSPrashanth Sreenivasa  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
255f368aefSYuri Pankov  * Copyright 2017 Nexenta Systems, Inc.
26c3d26abcSMatthew Ahrens  * Copyright (c) 2014 Integros [integros.com]
27c8811bd3SToomas Soome  * Copyright 2016 Toomas Soome <tsoome@me.com>
28c1064fd7SJerry Jelinek  * Copyright 2019 Joyent, Inc.
29663207adSDon Brady  * Copyright (c) 2017, Intel Corporation.
300c06d385Sjwpoduska  * Copyright (c) 2019, Datto Inc. All rights reserved.
31fa9e4066Sahrens  */
33fa9e4066Sahrens #include <sys/zfs_context.h>
34ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h>
35fa9e4066Sahrens #include <sys/spa.h>
36fa9e4066Sahrens #include <sys/spa_impl.h>
375cabbc6bSPrashanth Sreenivasa #include <sys/bpobj.h>
38fa9e4066Sahrens #include <sys/dmu.h>
39fa9e4066Sahrens #include <sys/dmu_tx.h>
405cabbc6bSPrashanth Sreenivasa #include <sys/dsl_dir.h>
41fa9e4066Sahrens #include <sys/vdev_impl.h>
42fa9e4066Sahrens #include <sys/uberblock_impl.h>
43fa9e4066Sahrens #include <sys/metaslab.h>
44fa9e4066Sahrens #include <sys/metaslab_impl.h>
45fa9e4066Sahrens #include <sys/space_map.h>
460713e232SGeorge Wilson #include <sys/space_reftree.h>
47fa9e4066Sahrens #include <sys/zio.h>
48fa9e4066Sahrens #include <sys/zap.h>
49fa9e4066Sahrens #include <sys/fs/zfs.h>
50c5904d13Seschrock #include <sys/arc.h>
51e6ca193dSGeorge Wilson #include <sys/zil.h>
523f9d6ad7SLin Ling #include <sys/dsl_scan.h>
53770499e1SDan Kimmel #include <sys/abd.h>
54094e47e9SGeorge Wilson #include <sys/vdev_initialize.h>
55084fd14fSBrian Behlendorf #include <sys/vdev_trim.h>
57fa9e4066Sahrens /*
58fa9e4066Sahrens  * Virtual device management.
59fa9e4066Sahrens  */
61fa9e4066Sahrens static vdev_ops_t *vdev_ops_table[] = {
62fa9e4066Sahrens 	&vdev_root_ops,
63fa9e4066Sahrens 	&vdev_raidz_ops,
64fa9e4066Sahrens 	&vdev_mirror_ops,
65fa9e4066Sahrens 	&vdev_replacing_ops,
6699653d4eSeschrock 	&vdev_spare_ops,
67fa9e4066Sahrens 	&vdev_disk_ops,
68fa9e4066Sahrens 	&vdev_file_ops,
69fa9e4066Sahrens 	&vdev_missing_ops,
7088ecc943SGeorge Wilson 	&vdev_hole_ops,
715cabbc6bSPrashanth Sreenivasa 	&vdev_indirect_ops,
72fa9e4066Sahrens 	NULL
73fa9e4066Sahrens };
/*
 * Maximum scrub/resilver I/O queue per leaf vdev.
 */
int zfs_scrub_limit = 10;

/*
 * Default target for the number of metaslabs per top-level vdev.
 */
int zfs_vdev_default_ms_count = 200;

/*
 * Minimum number of metaslabs per top-level vdev.
 */
int zfs_vdev_min_ms_count = 16;

/*
 * Practical upper limit of total metaslabs per top-level vdev (128K).
 */
int zfs_vdev_ms_count_limit = 1 << 17;

/*
 * Lower limit for metaslab size, as a shift (2^29 = 512M).
 */
int zfs_vdev_default_ms_shift = 29;

/*
 * Upper limit for metaslab size, as a shift (2^34 = 16G).
 */
int zfs_vdev_max_ms_shift = 34;
92b4bf0cf0SDon Brady 
9386714001SSerapheim Dimitropoulos boolean_t vdev_validate_skip = B_FALSE;
9486714001SSerapheim Dimitropoulos 
95bf3e216cSMatthew Ahrens /*
9686714001SSerapheim Dimitropoulos  * Since the DTL space map of a vdev is not expected to have a lot of
9786714001SSerapheim Dimitropoulos  * entries, we default its block size to 4K.
98bf3e216cSMatthew Ahrens  */
99814dcd43SSerapheim Dimitropoulos int zfs_vdev_dtl_sm_blksz = (1 << 12);
100bf3e216cSMatthew Ahrens 
101165c5c6fSJohn Poduska /*
102165c5c6fSJohn Poduska  * Ignore errors during scrub/resilver.  Allows to work around resilver
103165c5c6fSJohn Poduska  * upon import when there are pool errors.
104165c5c6fSJohn Poduska  */
105165c5c6fSJohn Poduska int zfs_scan_ignore_errors = 0;
106165c5c6fSJohn Poduska 
10786714001SSerapheim Dimitropoulos /*
10886714001SSerapheim Dimitropoulos  * vdev-wide space maps that have lots of entries written to them at
10986714001SSerapheim Dimitropoulos  * the end of each transaction can benefit from a higher I/O bandwidth
11086714001SSerapheim Dimitropoulos  * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
11186714001SSerapheim Dimitropoulos  */
112814dcd43SSerapheim Dimitropoulos int zfs_vdev_standard_sm_blksz = (1 << 17);
1136f793812SPavel Zakharov 
11493a1902eSMatthew Ahrens int zfs_ashift_min;
11593a1902eSMatthew Ahrens 
1163ee8c80cSPavel Zakharov /*PRINTFLIKE2*/
1173ee8c80cSPavel Zakharov void
vdev_dbgmsg(vdev_t * vd,const char * fmt,...)1183ee8c80cSPavel Zakharov vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
1193ee8c80cSPavel Zakharov {
1203ee8c80cSPavel Zakharov 	va_list adx;
1213ee8c80cSPavel Zakharov 	char buf[256];
1223ee8c80cSPavel Zakharov 
1233ee8c80cSPavel Zakharov 	va_start(adx, fmt);
1243ee8c80cSPavel Zakharov 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
1253ee8c80cSPavel Zakharov 	va_end(adx);
1263ee8c80cSPavel Zakharov 
1273ee8c80cSPavel Zakharov 	if (vd->vdev_path != NULL) {
1283ee8c80cSPavel Zakharov 		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
1293ee8c80cSPavel Zakharov 		    vd->vdev_path, buf);
1303ee8c80cSPavel Zakharov 	} else {
1313ee8c80cSPavel Zakharov 		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
1323ee8c80cSPavel Zakharov 		    vd->vdev_ops->vdev_op_type,
1333ee8c80cSPavel Zakharov 		    (u_longlong_t)vd->vdev_id,
1343ee8c80cSPavel Zakharov 		    (u_longlong_t)vd->vdev_guid, buf);
1353ee8c80cSPavel Zakharov 	}
1363ee8c80cSPavel Zakharov }
1373ee8c80cSPavel Zakharov 
1386f793812SPavel Zakharov void
vdev_dbgmsg_print_tree(vdev_t * vd,int indent)1396f793812SPavel Zakharov vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
1406f793812SPavel Zakharov {
1416f793812SPavel Zakharov 	char state[20];
1426f793812SPavel Zakharov 
1436f793812SPavel Zakharov 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
1446f793812SPavel Zakharov 		zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
1456f793812SPavel Zakharov 		    vd->vdev_ops->vdev_op_type);
1466f793812SPavel Zakharov 		return;
1476f793812SPavel Zakharov 	}
1486f793812SPavel Zakharov 
1496f793812SPavel Zakharov 	switch (vd->vdev_state) {
1506f793812SPavel Zakharov 	case VDEV_STATE_UNKNOWN:
1516f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "unknown");
1526f793812SPavel Zakharov 		break;
1536f793812SPavel Zakharov 	case VDEV_STATE_CLOSED:
1546f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "closed");
1556f793812SPavel Zakharov 		break;
1566f793812SPavel Zakharov 	case VDEV_STATE_OFFLINE:
1576f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "offline");
1586f793812SPavel Zakharov 		break;
1596f793812SPavel Zakharov 	case VDEV_STATE_REMOVED:
1606f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "removed");
1616f793812SPavel Zakharov 		break;
1626f793812SPavel Zakharov 	case VDEV_STATE_CANT_OPEN:
1636f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "can't open");
1646f793812SPavel Zakharov 		break;
1656f793812SPavel Zakharov 	case VDEV_STATE_FAULTED:
1666f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "faulted");
1676f793812SPavel Zakharov 		break;
1686f793812SPavel Zakharov 	case VDEV_STATE_DEGRADED:
1696f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "degraded");
1706f793812SPavel Zakharov 		break;
1716f793812SPavel Zakharov 	case VDEV_STATE_HEALTHY:
1726f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "healthy");
1736f793812SPavel Zakharov 		break;
1746f793812SPavel Zakharov 	default:
1756f793812SPavel Zakharov 		(void) snprintf(state, sizeof (state), "<state %u>",
1766f793812SPavel Zakharov 		    (uint_t)vd->vdev_state);
1776f793812SPavel Zakharov 	}
1786f793812SPavel Zakharov 
1796f793812SPavel Zakharov 	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
180c7a7b2faSAndriy Gapon 	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
1816f793812SPavel Zakharov 	    vd->vdev_islog ? " (log)" : "",
1826f793812SPavel Zakharov 	    (u_longlong_t)vd->vdev_guid,
1836f793812SPavel Zakharov 	    vd->vdev_path ? vd->vdev_path : "N/A", state);
1846f793812SPavel Zakharov 
1856f793812SPavel Zakharov 	for (uint64_t i = 0; i < vd->vdev_children; i++)
1866f793812SPavel Zakharov 		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
1876f793812SPavel Zakharov }
1886f793812SPavel Zakharov 
189fa9e4066Sahrens /*
190fa9e4066Sahrens  * Given a vdev type, return the appropriate ops vector.
191fa9e4066Sahrens  */
192fa9e4066Sahrens static vdev_ops_t *
vdev_getops(const char * type)193fa9e4066Sahrens vdev_getops(const char *type)
194fa9e4066Sahrens {
195fa9e4066Sahrens 	vdev_ops_t *ops, **opspp;
197fa9e4066Sahrens 	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
198fa9e4066Sahrens 		if (strcmp(ops->vdev_op_type, type) == 0)
199fa9e4066Sahrens 			break;
201fa9e4066Sahrens 	return (ops);
202fa9e4066Sahrens }
204663207adSDon Brady /*
205663207adSDon Brady  * Derive the enumerated alloction bias from string input.
206*bbf21555SRichard Lowe  * String origin is either the per-vdev zap or zpool(8).
207663207adSDon Brady  */
208663207adSDon Brady static vdev_alloc_bias_t
vdev_derive_alloc_bias(const char * bias)209663207adSDon Brady vdev_derive_alloc_bias(const char *bias)
210663207adSDon Brady {
211663207adSDon Brady 	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
212663207adSDon Brady 
213663207adSDon Brady 	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
214663207adSDon Brady 		alloc_bias = VDEV_BIAS_LOG;
215663207adSDon Brady 	else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
216663207adSDon Brady 		alloc_bias = VDEV_BIAS_SPECIAL;
217663207adSDon Brady 	else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
218663207adSDon Brady 		alloc_bias = VDEV_BIAS_DEDUP;
219663207adSDon Brady 
220663207adSDon Brady 	return (alloc_bias);
221663207adSDon Brady }
222663207adSDon Brady 
223094e47e9SGeorge Wilson /* ARGSUSED */
224094e47e9SGeorge Wilson void
vdev_default_xlate(vdev_t * vd,const range_seg64_t * in,range_seg64_t * res)2254d7988d6SPaul Dagnelie vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
226094e47e9SGeorge Wilson {
227094e47e9SGeorge Wilson 	res->rs_start = in->rs_start;
228094e47e9SGeorge Wilson 	res->rs_end = in->rs_end;
229094e47e9SGeorge Wilson }
230094e47e9SGeorge Wilson 
231fa9e4066Sahrens /*
232fa9e4066Sahrens  * Default asize function: return the MAX of psize with the asize of
233fa9e4066Sahrens  * all children.  This is what's used by anything other than RAID-Z.
234fa9e4066Sahrens  */
235fa9e4066Sahrens uint64_t
vdev_default_asize(vdev_t * vd,uint64_t psize)236fa9e4066Sahrens vdev_default_asize(vdev_t *vd, uint64_t psize)
237fa9e4066Sahrens {
238ecc2d604Sbonwick 	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
239fa9e4066Sahrens 	uint64_t csize;
241573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++) {
242fa9e4066Sahrens 		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
243fa9e4066Sahrens 		asize = MAX(asize, csize);
244fa9e4066Sahrens 	}
246fa9e4066Sahrens 	return (asize);
247fa9e4066Sahrens }
2492a79c5feSlling /*
250573ca77eSGeorge Wilson  * Get the minimum allocatable size. We define the allocatable size as
251573ca77eSGeorge Wilson  * the vdev's asize rounded to the nearest metaslab. This allows us to
252573ca77eSGeorge Wilson  * replace or attach devices which don't have the same physical size but
253573ca77eSGeorge Wilson  * can still satisfy the same number of allocations.
2542a79c5feSlling  */
2552a79c5feSlling uint64_t
vdev_get_min_asize(vdev_t * vd)256573ca77eSGeorge Wilson vdev_get_min_asize(vdev_t *vd)
2572a79c5feSlling {
258573ca77eSGeorge Wilson 	vdev_t *pvd = vd->vdev_parent;
260573ca77eSGeorge Wilson 	/*
2614263d13fSGeorge Wilson 	 * If our parent is NULL (inactive spare or cache) or is the root,
262573ca77eSGeorge Wilson 	 * just return our own asize.
263573ca77eSGeorge Wilson 	 */
264573ca77eSGeorge Wilson 	if (pvd == NULL)
265573ca77eSGeorge Wilson 		return (vd->vdev_asize);
2672a79c5feSlling 	/*
268573ca77eSGeorge Wilson 	 * The top-level vdev just returns the allocatable size rounded
269573ca77eSGeorge Wilson 	 * to the nearest metaslab.
2702a79c5feSlling 	 */
271573ca77eSGeorge Wilson 	if (vd == vd->vdev_top)
272573ca77eSGeorge Wilson 		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
274573ca77eSGeorge Wilson 	/*
275573ca77eSGeorge Wilson 	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
276573ca77eSGeorge Wilson 	 * so each child must provide at least 1/Nth of its asize.
277573ca77eSGeorge Wilson 	 */
278573ca77eSGeorge Wilson 	if (pvd->vdev_ops == &vdev_raidz_ops)
279c040c10cSSteven Hartland 		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
280c040c10cSSteven Hartland 		    pvd->vdev_children);
282573ca77eSGeorge Wilson 	return (pvd->vdev_min_asize);
283573ca77eSGeorge Wilson }
285573ca77eSGeorge Wilson void
vdev_set_min_asize(vdev_t * vd)286573ca77eSGeorge Wilson vdev_set_min_asize(vdev_t *vd)
287573ca77eSGeorge Wilson {
288573ca77eSGeorge Wilson 	vd->vdev_min_asize = vdev_get_min_asize(vd);
289573ca77eSGeorge Wilson 
290573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++)
291573ca77eSGeorge Wilson 		vdev_set_min_asize(vd->vdev_child[c]);
2922a79c5feSlling }
294fa9e4066Sahrens vdev_t *
vdev_lookup_top(spa_t * spa,uint64_t vdev)295fa9e4066Sahrens vdev_lookup_top(spa_t *spa, uint64_t vdev)
296fa9e4066Sahrens {
297fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
299e14bb325SJeff Bonwick 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
301088f3894Sahrens 	if (vdev < rvd->vdev_children) {
302088f3894Sahrens 		ASSERT(rvd->vdev_child[vdev] != NULL);
303fa9e4066Sahrens 		return (rvd->vdev_child[vdev]);
304088f3894Sahrens 	}
306fa9e4066Sahrens 	return (NULL);
307fa9e4066Sahrens }
309fa9e4066Sahrens vdev_t *
vdev_lookup_by_guid(vdev_t * vd,uint64_t guid)310fa9e4066Sahrens vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
311fa9e4066Sahrens {
312fa9e4066Sahrens 	vdev_t *mvd;
3140e34b6a7Sbonwick 	if (vd->vdev_guid == guid)
315fa9e4066Sahrens 		return (vd);
317573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++)
318fa9e4066Sahrens 		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
319fa9e4066Sahrens 		    NULL)
320fa9e4066Sahrens 			return (mvd);
322fa9e4066Sahrens 	return (NULL);
323fa9e4066Sahrens }
32512380e1eSArne Jansen static int
vdev_count_leaves_impl(vdev_t * vd)32612380e1eSArne Jansen vdev_count_leaves_impl(vdev_t *vd)
32712380e1eSArne Jansen {
32812380e1eSArne Jansen 	int n = 0;
32912380e1eSArne Jansen 
33012380e1eSArne Jansen 	if (vd->vdev_ops->vdev_op_leaf)
33112380e1eSArne Jansen 		return (1);
33212380e1eSArne Jansen 
33312380e1eSArne Jansen 	for (int c = 0; c < vd->vdev_children; c++)
33412380e1eSArne Jansen 		n += vdev_count_leaves_impl(vd->vdev_child[c]);
33512380e1eSArne Jansen 
33612380e1eSArne Jansen 	return (n);
33712380e1eSArne Jansen }
33812380e1eSArne Jansen 
33912380e1eSArne Jansen int
vdev_count_leaves(spa_t * spa)34012380e1eSArne Jansen vdev_count_leaves(spa_t *spa)
34112380e1eSArne Jansen {
34212380e1eSArne Jansen 	return (vdev_count_leaves_impl(spa->spa_root_vdev));
34312380e1eSArne Jansen }
34412380e1eSArne Jansen 
345fa9e4066Sahrens void
vdev_add_child(vdev_t * pvd,vdev_t * cvd)346fa9e4066Sahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd)
347fa9e4066Sahrens {
348fa9e4066Sahrens 	size_t oldsize, newsize;
349fa9e4066Sahrens 	uint64_t id = cvd->vdev_id;
350fa9e4066Sahrens 	vdev_t **newchild;
35181cd5c55SMatthew Ahrens 	spa_t *spa = cvd->vdev_spa;
35381cd5c55SMatthew Ahrens 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
354fa9e4066Sahrens 	ASSERT(cvd->vdev_parent == NULL);
356fa9e4066Sahrens 	cvd->vdev_parent = pvd;
358fa9e4066Sahrens 	if (pvd == NULL)
359fa9e4066Sahrens 		return;
361fa9e4066Sahrens 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
363fa9e4066Sahrens 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
364fa9e4066Sahrens 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
365fa9e4066Sahrens 	newsize = pvd->vdev_children * sizeof (vdev_t *);
367fa9e4066Sahrens 	newchild = kmem_zalloc(newsize, KM_SLEEP);
368fa9e4066Sahrens 	if (pvd->vdev_child != NULL) {
369fa9e4066Sahrens 		bcopy(pvd->vdev_child, newchild, oldsize);
370fa9e4066Sahrens 		kmem_free(pvd->vdev_child, oldsize);
371fa9e4066Sahrens 	}
373fa9e4066Sahrens 	pvd->vdev_child = newchild;
374fa9e4066Sahrens 	pvd->vdev_child[id] = cvd;
376fa9e4066Sahrens 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
377fa9e4066Sahrens 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
379fa9e4066Sahrens 	/*
380fa9e4066Sahrens 	 * Walk up all ancestors to update guid sum.
381fa9e4066Sahrens 	 */
382fa9e4066Sahrens 	for (; pvd != NULL; pvd = pvd->vdev_parent)
383fa9e4066Sahrens 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
384e0f1c0afSOlaf Faaland 
385e0f1c0afSOlaf Faaland 	if (cvd->vdev_ops->vdev_op_leaf) {
386e0f1c0afSOlaf Faaland 		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
387e0f1c0afSOlaf Faaland 		cvd->vdev_spa->spa_leaf_list_gen++;
388e0f1c0afSOlaf Faaland 	}
389fa9e4066Sahrens }
391fa9e4066Sahrens void
vdev_remove_child(vdev_t * pvd,vdev_t * cvd)392fa9e4066Sahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
393fa9e4066Sahrens {
394fa9e4066Sahrens 	int c;
395fa9e4066Sahrens 	uint_t id = cvd->vdev_id;
397fa9e4066Sahrens 	ASSERT(cvd->vdev_parent == pvd);
399fa9e4066Sahrens 	if (pvd == NULL)
400fa9e4066Sahrens 		return;
402fa9e4066Sahrens 	ASSERT(id < pvd->vdev_children);
403fa9e4066Sahrens 	ASSERT(pvd->vdev_child[id] == cvd);
405fa9e4066Sahrens 	pvd->vdev_child[id] = NULL;
406fa9e4066Sahrens 	cvd->vdev_parent = NULL;
408fa9e4066Sahrens 	for (c = 0; c < pvd->vdev_children; c++)
409fa9e4066Sahrens 		if (pvd->vdev_child[c])
410fa9e4066Sahrens 			break;
412fa9e4066Sahrens 	if (c == pvd->vdev_children) {
413fa9e4066Sahrens 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
414fa9e4066Sahrens 		pvd->vdev_child = NULL;
415fa9e4066Sahrens 		pvd->vdev_children = 0;
416fa9e4066Sahrens 	}
418e0f1c0afSOlaf Faaland 	if (cvd->vdev_ops->vdev_op_leaf) {
419e0f1c0afSOlaf Faaland 		spa_t *spa = cvd->vdev_spa;
420e0f1c0afSOlaf Faaland 		list_remove(&spa->spa_leaf_list, cvd);
421e0f1c0afSOlaf Faaland 		spa->spa_leaf_list_gen++;
422e0f1c0afSOlaf Faaland 	}
423e0f1c0afSOlaf Faaland 
424fa9e4066Sahrens 	/*
425fa9e4066Sahrens 	 * Walk up all ancestors to update guid sum.
426fa9e4066Sahrens 	 */
427fa9e4066Sahrens 	for (; pvd != NULL; pvd = pvd->vdev_parent)
428fa9e4066Sahrens 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
429fa9e4066Sahrens }
431fa9e4066Sahrens /*
432fa9e4066Sahrens  * Remove any holes in the child array.
433fa9e4066Sahrens  */
434fa9e4066Sahrens void
vdev_compact_children(vdev_t * pvd)435fa9e4066Sahrens vdev_compact_children(vdev_t *pvd)
436fa9e4066Sahrens {
437fa9e4066Sahrens 	vdev_t **newchild, *cvd;
438fa9e4066Sahrens 	int oldc = pvd->vdev_children;
439573ca77eSGeorge Wilson 	int newc;
441e14bb325SJeff Bonwick 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
443573ca77eSGeorge Wilson 	for (int c = newc = 0; c < oldc; c++)
444fa9e4066Sahrens 		if (pvd->vdev_child[c])
445fa9e4066Sahrens 			newc++;
447fa9e4066Sahrens 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
449573ca77eSGeorge Wilson 	for (int c = newc = 0; c < oldc; c++) {
450fa9e4066Sahrens 		if ((cvd = pvd->vdev_child[c]) != NULL) {
451fa9e4066Sahrens 			newchild[newc] = cvd;
452fa9e4066Sahrens 			cvd->vdev_id = newc++;
453fa9e4066Sahrens 		}
454fa9e4066Sahrens 	}
456fa9e4066Sahrens 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
457fa9e4066Sahrens 	pvd->vdev_child = newchild;
458fa9e4066Sahrens 	pvd->vdev_children = newc;
459fa9e4066Sahrens }
461fa9e4066Sahrens /*
462fa9e4066Sahrens  * Allocate and minimally initialize a vdev_t.
463fa9e4066Sahrens  */
46488ecc943SGeorge Wilson vdev_t *
vdev_alloc_common(spa_t * spa,uint_t id,uint64_t guid,vdev_ops_t * ops)465fa9e4066Sahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
466fa9e4066Sahrens {
467fa9e4066Sahrens 	vdev_t *vd;
4685cabbc6bSPrashanth Sreenivasa 	vdev_indirect_config_t *vic;
470fa9e4066Sahrens 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
4715cabbc6bSPrashanth Sreenivasa 	vic = &vd->vdev_indirect_config;
4730e34b6a7Sbonwick 	if (spa->spa_root_vdev == NULL) {
4740e34b6a7Sbonwick 		ASSERT(ops == &vdev_root_ops);
4750e34b6a7Sbonwick 		spa->spa_root_vdev = vd;
476e9103aaeSGarrett D'Amore 		spa->spa_load_guid = spa_generate_guid(NULL);
4770e34b6a7Sbonwick 	}
47988ecc943SGeorge Wilson 	if (guid == 0 && ops != &vdev_hole_ops) {
4800e34b6a7Sbonwick 		if (spa->spa_root_vdev == vd) {
4810e34b6a7Sbonwick 			/*
4820e34b6a7Sbonwick 			 * The root vdev's guid will also be the pool guid,
4830e34b6a7Sbonwick 			 * which must be unique among all pools.
4840e34b6a7Sbonwick 			 */
4851195e687SMark J Musante 			guid = spa_generate_guid(NULL);
4860e34b6a7Sbonwick 		} else {
4870e34b6a7Sbonwick 			/*
4880e34b6a7Sbonwick 			 * Any other vdev's guid must be unique within the pool.
4890e34b6a7Sbonwick 			 */
4901195e687SMark J Musante 			guid = spa_generate_guid(spa);
4910e34b6a7Sbonwick 		}
4920e34b6a7Sbonwick 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
4930e34b6a7Sbonwick 	}
495fa9e4066Sahrens 	vd->vdev_spa = spa;
496fa9e4066Sahrens 	vd->vdev_id = id;
497fa9e4066Sahrens 	vd->vdev_guid = guid;
498fa9e4066Sahrens 	vd->vdev_guid_sum = guid;
499fa9e4066Sahrens 	vd->vdev_ops = ops;
500fa9e4066Sahrens 	vd->vdev_state = VDEV_STATE_CLOSED;
50188ecc943SGeorge Wilson 	vd->vdev_ishole = (ops == &vdev_hole_ops);
5025cabbc6bSPrashanth Sreenivasa 	vic->vic_prev_indirect_vdev = UINT64_MAX;
5035cabbc6bSPrashanth Sreenivasa 
5045cabbc6bSPrashanth Sreenivasa 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
5055cabbc6bSPrashanth Sreenivasa 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
5064d7988d6SPaul Dagnelie 	vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
5074d7988d6SPaul Dagnelie 	    0, 0);
509084fd14fSBrian Behlendorf 	list_link_init(&vd->vdev_initialize_node);
510e0f1c0afSOlaf Faaland 	list_link_init(&vd->vdev_leaf_node);
511084fd14fSBrian Behlendorf 	list_link_init(&vd->vdev_trim_node);
512fa9e4066Sahrens 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
5135ad82045Snd 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
514e14bb325SJeff Bonwick 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
515a3874b8bSToomas Soome 	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
516094e47e9SGeorge Wilson 	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
517094e47e9SGeorge Wilson 	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
518094e47e9SGeorge Wilson 	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
519094e47e9SGeorge Wilson 	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
520084fd14fSBrian Behlendorf 	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
521084fd14fSBrian Behlendorf 	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
522084fd14fSBrian Behlendorf 	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
523084fd14fSBrian Behlendorf 	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
524084fd14fSBrian Behlendorf 	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
525084fd14fSBrian Behlendorf 	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
526094e47e9SGeorge Wilson 
5278ad4d6ddSJeff Bonwick 	for (int t = 0; t < DTL_TYPES; t++) {
5284d7988d6SPaul Dagnelie 		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
5294d7988d6SPaul Dagnelie 		    0);
5308ad4d6ddSJeff Bonwick 	}
531b7b2590dSMatthew Ahrens 	txg_list_create(&vd->vdev_ms_list, spa,
532fa9e4066Sahrens 	    offsetof(struct metaslab, ms_txg_node));
533b7b2590dSMatthew Ahrens 	txg_list_create(&vd->vdev_dtl_list, spa,
534fa9e4066Sahrens 	    offsetof(struct vdev, vdev_dtl_node));
535fa9e4066Sahrens 	vd->vdev_stat.vs_timestamp = gethrtime();
5363d7072f8Seschrock 	vdev_queue_init(vd);
5373d7072f8Seschrock 	vdev_cache_init(vd);
539fa9e4066Sahrens 	return (vd);
540fa9e4066Sahrens }
542fa9e4066Sahrens /*
543fa9e4066Sahrens  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
544fa9e4066Sahrens  * creating a new vdev or loading an existing one - the behavior is slightly
545fa9e4066Sahrens  * different for each case.
546fa9e4066Sahrens  */
54799653d4eSeschrock int
vdev_alloc(spa_t * spa,vdev_t ** vdp,nvlist_t * nv,vdev_t * parent,uint_t id,int alloctype)54899653d4eSeschrock vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
54999653d4eSeschrock     int alloctype)
550fa9e4066Sahrens {
551fa9e4066Sahrens 	vdev_ops_t *ops;
552fa9e4066Sahrens 	char *type;
5538654d025Sperrin 	uint64_t guid = 0, islog, nparity;
554fa9e4066Sahrens 	vdev_t *vd;
5555cabbc6bSPrashanth Sreenivasa 	vdev_indirect_config_t *vic;
556663207adSDon Brady 	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
557663207adSDon Brady 	boolean_t top_level = (parent && !parent->vdev_parent);
559e14bb325SJeff Bonwick 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
561fa9e4066Sahrens 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
562be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
564fa9e4066Sahrens 	if ((ops = vdev_getops(type)) == NULL)
565be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
567fa9e4066Sahrens 	/*
568fa9e4066Sahrens 	 * If this is a load, get the vdev guid from the nvlist.
569fa9e4066Sahrens 	 * Otherwise, vdev_alloc_common() will generate one for us.
570fa9e4066Sahrens 	 */
571fa9e4066Sahrens 	if (alloctype == VDEV_ALLOC_LOAD) {
572fa9e4066Sahrens 		uint64_t label_id;
574fa9e4066Sahrens 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
575fa9e4066Sahrens 		    label_id != id)
576be6fd75aSMatthew Ahrens 			return (SET_ERROR(EINVAL));
578fa9e4066Sahrens 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
579be6fd75aSMatthew Ahrens 			return (SET_ERROR(EINVAL));
58099653d4eSeschrock 	} else if (alloctype == VDEV_ALLOC_SPARE) {
58199653d4eSeschrock 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
582be6fd75aSMatthew Ahrens 			return (SET_ERROR(EINVAL));
583fa94a07fSbrendan 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
584fa94a07fSbrendan 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
585be6fd75aSMatthew Ahrens 			return (SET_ERROR(EINVAL));
58621ecdf64SLin Ling 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
58721ecdf64SLin Ling 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
588be6fd75aSMatthew Ahrens 			return (SET_ERROR(EINVAL));
589fa9e4066Sahrens 	}
59199653d4eSeschrock 	/*
59299653d4eSeschrock 	 * The first allocated vdev must be of type 'root'.
59399653d4eSeschrock 	 */
59499653d4eSeschrock 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
595be6fd75aSMatthew Ahrens 		return (SET_ERROR(EINVAL));
5978654d025Sperrin 	/*
5988654d025Sperrin 	 * Determine whether we're a log vdev.
5998654d025Sperrin 	 */
6008654d025Sperrin 	islog = 0;
6018654d025Sperrin 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
602990b4856Slling 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
603be6fd75aSMatthew Ahrens 		return (SET_ERROR(ENOTSUP));
60588ecc943SGeorge Wilson 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
606be6fd75aSMatthew Ahrens 		return (SET_ERROR(ENOTSUP));
60788ecc943SGeorge Wilson 
60899653d4eSeschrock 	/*
6098654d025Sperrin 	 * Set the nparity property for RAID-Z vdevs.
61099653d4eSeschrock 	 */
6118654d025Sperrin 	nparity = -1ULL;
61299653d4eSeschrock 	if (ops == &vdev_raidz_ops) {
61399653d4eSeschrock 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
6148654d025Sperrin 		    &nparity) == 0) {
615b24ab676SJeff Bonwick 			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
616be6fd75aSMatthew Ahrens 				return (SET_ERROR(EINVAL));
61799653d4eSeschrock 			/*
618f94275ceSAdam Leventhal 			 * Previous versions could only support 1 or 2 parity
619f94275ceSAdam Leventhal 			 * device.
62099653d4eSeschrock 			 */
621f94275ceSAdam Leventhal 			if (nparity > 1 &&
622f94275ceSAdam Leventhal 			    spa_version(spa) < SPA_VERSION_RAIDZ2)
623be6fd75aSMatthew Ahrens 				return (SET_ERROR(ENOTSUP));
624f94275ceSAdam Leventhal 			if (nparity > 2 &&
625f94275ceSAdam Leventhal 			    spa_version(spa) < SPA_VERSION_RAIDZ3)
626be6fd75aSMatthew Ahrens 				return (SET_ERROR(ENOTSUP));
62799653d4eSeschrock 		} else {
62899653d4eSeschrock 			/*
62999653d4eSeschrock 			 * We require the parity to be specified for SPAs that
63099653d4eSeschrock 			 * support multiple parity levels.
63199653d4eSeschrock 			 */
632f94275ceSAdam Leventhal 			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
633be6fd75aSMatthew Ahrens 				return (SET_ERROR(EINVAL));
63499653d4eSeschrock 			/*
63599653d4eSeschrock 			 * Otherwise, we default to 1 parity device for RAID-Z.
63699653d4eSeschrock 			 */
6378654d025Sperrin 			nparity = 1;
63899653d4eSeschrock 		}
63999653d4eSeschrock 	} else {
6408654d025Sperrin 		nparity = 0;
64199653d4eSeschrock 	}
6428654d025Sperrin 	ASSERT(nparity != -1ULL);
644663207adSDon Brady 	/*
645663207adSDon Brady 	 * If creating a top-level vdev, check for allocation classes input
646663207adSDon Brady 	 */
647663207adSDon Brady 	if (top_level && alloctype == VDEV_ALLOC_ADD) {
648663207adSDon Brady 		char *bias;
649663207adSDon Brady 
650663207adSDon Brady 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
651663207adSDon Brady 		    &bias) == 0) {
652663207adSDon Brady 			alloc_bias = vdev_derive_alloc_bias(bias);
653663207adSDon Brady