2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5441d80alling * Common Development and Distribution License (the "License").
6441d80alling * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
2398d1cbfGeorge Wilson * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
245cabbc6Prashanth Sreenivasa * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
255f368aeYuri Pankov * Copyright 2017 Nexenta Systems, Inc.
26c3d26abMatthew Ahrens * Copyright (c) 2014 Integros [integros.com]
27c8811bdToomas Soome * Copyright 2016 Toomas Soome <tsoome@me.com>
28c1064fdJerry Jelinek * Copyright 2019 Joyent, Inc.
29663207aDon Brady * Copyright (c) 2017, Intel Corporation.
300c06d38jwpoduska * Copyright (c) 2019, Datto Inc. All rights reserved.
31fa9e406ahrens */
33fa9e406ahrens#include <sys/zfs_context.h>
34ea8dc4beschrock#include <sys/fm/fs/zfs.h>
35fa9e406ahrens#include <sys/spa.h>
36fa9e406ahrens#include <sys/spa_impl.h>
375cabbc6Prashanth Sreenivasa#include <sys/bpobj.h>
38fa9e406ahrens#include <sys/dmu.h>
39fa9e406ahrens#include <sys/dmu_tx.h>
405cabbc6Prashanth Sreenivasa#include <sys/dsl_dir.h>
41fa9e406ahrens#include <sys/vdev_impl.h>
42fa9e406ahrens#include <sys/uberblock_impl.h>
43fa9e406ahrens#include <sys/metaslab.h>
44fa9e406ahrens#include <sys/metaslab_impl.h>
45fa9e406ahrens#include <sys/space_map.h>
460713e23George Wilson#include <sys/space_reftree.h>
47fa9e406ahrens#include <sys/zio.h>
48fa9e406ahrens#include <sys/zap.h>
49fa9e406ahrens#include <sys/fs/zfs.h>
50c5904d1eschrock#include <sys/arc.h>
51e6ca193George Wilson#include <sys/zil.h>
523f9d6adLin Ling#include <sys/dsl_scan.h>
53770499eDan Kimmel#include <sys/abd.h>
54094e47eGeorge Wilson#include <sys/vdev_initialize.h>
55084fd14Brian Behlendorf#include <sys/vdev_trim.h>
/*
 * Virtual device management.
 */
61fa9e406ahrensstatic vdev_ops_t *vdev_ops_table[] = {
62fa9e406ahrens	&vdev_root_ops,
63fa9e406ahrens	&vdev_raidz_ops,
64fa9e406ahrens	&vdev_mirror_ops,
65fa9e406ahrens	&vdev_replacing_ops,
6699653d4eschrock	&vdev_spare_ops,
67fa9e406ahrens	&vdev_disk_ops,
68fa9e406ahrens	&vdev_file_ops,
69fa9e406ahrens	&vdev_missing_ops,
7088ecc94George Wilson	&vdev_hole_ops,
715cabbc6Prashanth Sreenivasa	&vdev_indirect_ops,
72fa9e406ahrens	NULL
75088f389ahrens/* maximum scrub/resilver I/O queue per leaf vdev */
76088f389ahrensint zfs_scrub_limit = 10;
78a0b03b1Serapheim Dimitropoulos/* default target for number of metaslabs per top-level vdev */
79a0b03b1Serapheim Dimitropoulosint zfs_vdev_default_ms_count = 200;
808671400Serapheim Dimitropoulos
81b4bf0cfDon Brady/* minimum number of metaslabs per top-level vdev */
82a0b03b1Serapheim Dimitropoulosint zfs_vdev_min_ms_count = 16;
838671400Serapheim Dimitropoulos
84b4bf0cfDon Brady/* practical upper limit of total metaslabs per top-level vdev */
85a0b03b1Serapheim Dimitropoulosint zfs_vdev_ms_count_limit = 1ULL << 17;
86b4bf0cfDon Brady
87b4bf0cfDon Brady/* lower limit for metaslab size (512M) */
88a0b03b1Serapheim Dimitropoulosint zfs_vdev_default_ms_shift = 29;
898671400Serapheim Dimitropoulos
90a0b03b1Serapheim Dimitropoulos/* upper limit for metaslab size (16G) */
91a0b03b1Serapheim Dimitropoulosint zfs_vdev_max_ms_shift = 34;
92b4bf0cfDon Brady
938671400Serapheim Dimitropoulosboolean_t vdev_validate_skip = B_FALSE;
948671400Serapheim Dimitropoulos
968671400Serapheim Dimitropoulos * Since the DTL space map of a vdev is not expected to have a lot of
978671400Serapheim Dimitropoulos * entries, we default its block size to 4K.
98bf3e216Matthew Ahrens */
99814dcd4Serapheim Dimitropoulosint zfs_vdev_dtl_sm_blksz = (1 << 12);
100bf3e216Matthew Ahrens
1018671400Serapheim Dimitropoulos/*
102165c5c6John Poduska * Ignore errors during scrub/resilver.  Allows to work around resilver
103165c5c6John Poduska * upon import when there are pool errors.
104165c5c6John Poduska */
105165c5c6John Poduskaint zfs_scan_ignore_errors = 0;
106165c5c6John Poduska
107165c5c6John Poduska/*
1088671400Serapheim Dimitropoulos * vdev-wide space maps that have lots of entries written to them at
1098671400Serapheim Dimitropoulos * the end of each transaction can benefit from a higher I/O bandwidth
1108671400Serapheim Dimitropoulos * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
1118671400Serapheim Dimitropoulos */
112814dcd4Serapheim Dimitropoulosint zfs_vdev_standard_sm_blksz = (1 << 17);
1136f79381Pavel Zakharov
11493a1902Matthew Ahrensint zfs_ashift_min;
11593a1902Matthew Ahrens
1163ee8c80Pavel Zakharov/*PRINTFLIKE2*/
1173ee8c80Pavel Zakharovvoid
1183ee8c80Pavel Zakharovvdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
1193ee8c80Pavel Zakharov{
1203ee8c80Pavel Zakharov	va_list adx;
1213ee8c80Pavel Zakharov	char buf[256];
1223ee8c80Pavel Zakharov
1233ee8c80Pavel Zakharov	va_start(adx, fmt);
1243ee8c80Pavel Zakharov	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
1253ee8c80Pavel Zakharov	va_end(adx);
1263ee8c80Pavel Zakharov
1273ee8c80Pavel Zakharov	if (vd->vdev_path != NULL) {
1283ee8c80Pavel Zakharov		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
1293ee8c80Pavel Zakharov		    vd->vdev_path, buf);
1303ee8c80Pavel Zakharov	} else {
1313ee8c80Pavel Zakharov		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
1323ee8c80Pavel Zakharov		    vd->vdev_ops->vdev_op_type,
1333ee8c80Pavel Zakharov		    (u_longlong_t)vd->vdev_id,
1343ee8c80Pavel Zakharov		    (u_longlong_t)vd->vdev_guid, buf);
1353ee8c80Pavel Zakharov	}
1363ee8c80Pavel Zakharov}
1373ee8c80Pavel Zakharov
1386f79381Pavel Zakharovvoid
1396f79381Pavel Zakharovvdev_dbgmsg_print_tree(vdev_t *vd, int indent)
1406f79381Pavel Zakharov{
1416f79381Pavel Zakharov	char state[20];
1426f79381Pavel Zakharov
1436f79381Pavel Zakharov	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
1446f79381Pavel Zakharov		zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
1456f79381Pavel Zakharov		    vd->vdev_ops->vdev_op_type);
1466f79381Pavel Zakharov		return;
1476f79381Pavel Zakharov	}
1486f79381Pavel Zakharov
1496f79381Pavel Zakharov	switch (vd->vdev_state) {
1506f79381Pavel Zakharov	case VDEV_STATE_UNKNOWN:
1516f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "unknown");
1526f79381Pavel Zakharov		break;
1536f79381Pavel Zakharov	case VDEV_STATE_CLOSED:
1546f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "closed");
1556f79381Pavel Zakharov		break;
1566f79381Pavel Zakharov	case VDEV_STATE_OFFLINE:
1576f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "offline");
1586f79381Pavel Zakharov		break;
1596f79381Pavel Zakharov	case VDEV_STATE_REMOVED:
1606f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "removed");
1616f79381Pavel Zakharov		break;
1626f79381Pavel Zakharov	case VDEV_STATE_CANT_OPEN:
1636f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "can't open");
1646f79381Pavel Zakharov		break;
1656f79381Pavel Zakharov	case VDEV_STATE_FAULTED:
1666f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "faulted");
1676f79381Pavel Zakharov		break;
1686f79381Pavel Zakharov	case VDEV_STATE_DEGRADED:
1696f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "degraded");
1706f79381Pavel Zakharov		break;
1716f79381Pavel Zakharov	case VDEV_STATE_HEALTHY:
1726f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "healthy");
1736f79381Pavel Zakharov		break;
1746f79381Pavel Zakharov	default:
1756f79381Pavel Zakharov		(void) snprintf(state, sizeof (state), "<state %u>",
1766f79381Pavel Zakharov		    (uint_t)vd->vdev_state);
1776f79381Pavel Zakharov	}
1786f79381Pavel Zakharov
1796f79381Pavel Zakharov	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
180c7a7b2fAndriy Gapon	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
1816f79381Pavel Zakharov	    vd->vdev_islog ? " (log)" : "",
1826f79381Pavel Zakharov	    (u_longlong_t)vd->vdev_guid,
1836f79381Pavel Zakharov	    vd->vdev_path ? vd->vdev_path : "N/A", state);
1846f79381Pavel Zakharov
1856f79381Pavel Zakharov	for (uint64_t i = 0; i < vd->vdev_children; i++)
1866f79381Pavel Zakharov		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
1876f79381Pavel Zakharov}
1886f79381Pavel Zakharov
189bf3e216Matthew Ahrens/*
190fa9e406ahrens * Given a vdev type, return the appropriate ops vector.
191fa9e406ahrens */
192fa9e406ahrensstatic vdev_ops_t *
193fa9e406ahrensvdev_getops(const char *type)
195fa9e406ahrens	vdev_ops_t *ops, **opspp;
197fa9e406ahrens	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
198fa9e406ahrens		if (strcmp(ops->vdev_op_type, type) == 0)
199fa9e406ahrens			break;
201fa9e406ahrens	return (ops);
204663207aDon Brady/*
205663207aDon Brady * Derive the enumerated alloction bias from string input.
206663207aDon Brady * String origin is either the per-vdev zap or zpool(1M).
207663207aDon Brady */
208663207aDon Bradystatic vdev_alloc_bias_t
209663207aDon Bradyvdev_derive_alloc_bias(const char *bias)
210663207aDon Brady{
211663207aDon Brady	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
212663207aDon Brady
213663207aDon Brady	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
214663207aDon Brady		alloc_bias = VDEV_BIAS_LOG;
215663207aDon Brady	else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
216663207aDon Brady		alloc_bias = VDEV_BIAS_SPECIAL;
217663207aDon Brady	else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
218663207aDon Brady		alloc_bias = VDEV_BIAS_DEDUP;
219663207aDon Brady
220663207aDon Brady	return (alloc_bias);
221663207aDon Brady}
222663207aDon Brady
223094e47eGeorge Wilson/* ARGSUSED */
224094e47eGeorge Wilsonvoid
2254d7988dPaul Dagnelievdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
226094e47eGeorge Wilson{
227094e47eGeorge Wilson	res->rs_start = in->rs_start;
228094e47eGeorge Wilson	res->rs_end = in->rs_end;
229094e47eGeorge Wilson}
230094e47eGeorge Wilson
232fa9e406ahrens * Default asize function: return the MAX of psize with the asize of
233fa9e406ahrens * all children.  This is what's used by anything other than RAID-Z.
234fa9e406ahrens */
236fa9e406ahrensvdev_default_asize(vdev_t *vd, uint64_t psize)
238ecc2d60bonwick	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
239fa9e406ahrens	uint64_t csize;
241573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++) {
242fa9e406ahrens		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
243fa9e406ahrens		asize = MAX(asize, csize);
244fa9e406ahrens	}
246fa9e406ahrens	return (asize);
250573ca77George Wilson * Get the minimum allocatable size. We define the allocatable size as
251573ca77George Wilson * the vdev's asize rounded to the nearest metaslab. This allows us to
252573ca77George Wilson * replace or attach devices which don't have the same physical size but
253573ca77George Wilson * can still satisfy the same number of allocations.
2542a79c5flling */
256573ca77George Wilsonvdev_get_min_asize(vdev_t *vd)
258573ca77George Wilson	vdev_t *pvd = vd->vdev_parent;
260573ca77George Wilson	/*
2614263d13George Wilson	 * If our parent is NULL (inactive spare or cache) or is the root,
262573ca77George Wilson	 * just return our own asize.
263573ca77George Wilson	 */
264573ca77George Wilson	if (pvd == NULL)
265573ca77George Wilson		return (vd->vdev_asize);
2672a79c5flling	/*
268573ca77George Wilson	 * The top-level vdev just returns the allocatable size rounded
269573ca77George Wilson	 * to the nearest metaslab.
2702a79c5flling	 */
271573ca77George Wilson	if (vd == vd->vdev_top)
272573ca77George Wilson		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
274573ca77George Wilson	/*
275573ca77George Wilson	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
276573ca77George Wilson	 * so each child must provide at least 1/Nth of its asize.
277573ca77George Wilson	 */
278573ca77George Wilson	if (pvd->vdev_ops == &vdev_raidz_ops)
279c040c10Steven Hartland		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
280c040c10Steven Hartland		    pvd->vdev_children);
282573ca77George Wilson	return (pvd->vdev_min_asize);
283573ca77George Wilson}
285573ca77George Wilsonvoid
286573ca77George Wilsonvdev_set_min_asize(vdev_t *vd)
287573ca77George Wilson{
288573ca77George Wilson	vd->vdev_min_asize = vdev_get_min_asize(vd);
289573ca77George Wilson
290573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++)
291573ca77George Wilson		vdev_set_min_asize(vd->vdev_child[c]);
294fa9e406ahrensvdev_t *
295fa9e406ahrensvdev_lookup_top(spa_t *spa, uint64_t vdev)
297fa9e406ahrens	vdev_t *rvd = spa->spa_root_vdev;
299e14bb32Jeff Bonwick	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
301088f389ahrens	if (vdev < rvd->vdev_children) {
302088f389ahrens		ASSERT(rvd->vdev_child[vdev] != NULL);
303fa9e406ahrens		return (rvd->vdev_child[vdev]);
304088f389ahrens	}
306fa9e406ahrens	return (NULL);
309fa9e406ahrensvdev_t *
310fa9e406ahrensvdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
312fa9e406ahrens	vdev_t *mvd;
3140e34b6abonwick	if (vd->vdev_guid == guid)
315fa9e406ahrens		return (vd);
317573ca77George Wilson	for (int c = 0; c < vd->vdev_children; c++)
318fa9e406ahrens		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
319fa9e406ahrens		    NULL)
320fa9e406ahrens			return (mvd);
322fa9e406ahrens	return (NULL);
32512380e1Arne Jansenstatic int
32612380e1Arne Jansenvdev_count_leaves_impl(vdev_t *vd)
32712380e1Arne Jansen{
32812380e1Arne Jansen	int n = 0;
32912380e1Arne Jansen
33012380e1Arne Jansen	if (vd->vdev_ops->vdev_op_leaf)
33112380e1Arne Jansen		return (1);
33212380e1Arne Jansen
33312380e1Arne Jansen	for (int c = 0; c < vd->vdev_children; c++)
33412380e1Arne Jansen		n += vdev_count_leaves_impl(vd->vdev_child[c]);
33512380e1Arne Jansen
33612380e1Arne Jansen	return (n);
33712380e1Arne Jansen}
33812380e1Arne Jansen
33912380e1Arne Jansenint
34012380e1Arne Jansenvdev_count_leaves(spa_t *spa)
34112380e1Arne Jansen{
34212380e1Arne Jansen	return (vdev_count_leaves_impl(spa->spa_root_vdev));
34312380e1Arne Jansen}
34412380e1Arne Jansen
346fa9e406ahrensvdev_add_child(vdev_t *pvd, vdev_t *cvd)
348fa9e406ahrens	size_t oldsize, newsize;
349fa9e406ahrens	uint64_t id = cvd->vdev_id;
350fa9e406ahrens	vdev_t **newchild;
35181cd5c5Matthew Ahrens	spa_t *spa = cvd->vdev_spa;
35381cd5c5Matthew Ahrens	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
354fa9e406ahrens	ASSERT(cvd->vdev_parent == NULL);
356fa9e406ahrens	cvd->vdev_parent = pvd;
358fa9e406ahrens	if (pvd == NULL)
359fa9e406ahrens		return;
361fa9e406ahrens	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
363fa9e406ahrens	oldsize = pvd->vdev_children * sizeof (vdev_t *);
364fa9e406ahrens	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
365fa9e406ahrens	newsize = pvd->vdev_children * sizeof (vdev_t *);
367fa9e406ahrens	newchild = kmem_zalloc(newsize, KM_SLEEP);
368fa9e406ahrens	if (pvd->vdev_child != NULL) {
369fa9e406ahrens		bcopy(pvd->vdev_child, newchild, oldsize);
370fa9e406ahrens		kmem_free(pvd->vdev_child, oldsize);
371fa9e406ahrens	}
373fa9e406ahrens	pvd->vdev_child = newchild;
374fa9e406ahrens	pvd->vdev_child[id] = cvd;
376fa9e406ahrens	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
377fa9e406ahrens	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
379fa9e406ahrens	/*
380fa9e406ahrens	 * Walk up all ancestors to update guid sum.
381fa9e406ahrens	 */
382fa9e406ahrens	for (; pvd != NULL; pvd = pvd->vdev_parent)
383fa9e406ahrens		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
384e0f1c0aOlaf Faaland
385e0f1c0aOlaf Faaland	if (cvd->vdev_ops->vdev_op_leaf) {
386e0f1c0aOlaf Faaland		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
387e0f1c0aOlaf Faaland		cvd->vdev_spa->spa_leaf_list_gen++;
388e0f1c0aOlaf Faaland	}
392fa9e406ahrensvdev_remove_child(vdev_t *pvd, vdev_t *cvd)
394fa9e406ahrens	int c;
395fa9e406ahrens	uint_t id = cvd->vdev_id;
397fa9e406ahrens	ASSERT(cvd->vdev_parent == pvd);
399fa9e406ahrens	if (pvd == NULL)
400fa9e406ahrens		return;
402fa9e406ahrens	ASSERT(id < pvd->vdev_children);
403fa9e406ahrens	ASSERT(pvd->vdev_child[id] == cvd);
405fa9e406ahrens	pvd->vdev_child[id] = NULL;
406fa9e406ahrens	cvd->vdev_parent = NULL;
408fa9e406ahrens	for (c = 0; c < pvd->vdev_children; c++)
409fa9e406ahrens		if (pvd->vdev_child[c])
410fa9e406ahrens			break;
412fa9e406ahrens	if (c == pvd->vdev_children) {
413fa9e406ahrens		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
414fa9e406ahrens		pvd->vdev_child = NULL;
415fa9e406ahrens		pvd->vdev_children = 0;
416fa9e406ahrens	}
418e0f1c0aOlaf Faaland	if (cvd->vdev_ops->vdev_op_leaf) {
419e0f1c0aOlaf Faaland		spa_t *spa = cvd->vdev_spa;
420e0f1c0aOlaf Faaland		list_remove(&spa->spa_leaf_list, cvd);
421e0f1c0aOlaf Faaland		spa->spa_leaf_list_gen++;
422e0f1c0aOlaf Faaland	}
423e0f1c0aOlaf Faaland
424fa9e406ahrens	/*
425fa9e406ahrens	 * Walk up all ancestors to update guid sum.
426fa9e406ahrens	 */
427fa9e406ahrens	for (; pvd != NULL; pvd = pvd->vdev_parent)
428fa9e406ahrens		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
432fa9e406ahrens * Remove any holes in the child array.
433fa9e406ahrens */
435fa9e406ahrensvdev_compact_children(vdev_t *pvd)
437fa9e406ahrens	vdev_t **newchild, *cvd;
438fa9e406ahrens	int oldc = pvd->vdev_children;
439573ca77George Wilson	int newc;
441e14bb32Jeff Bonwick	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
443573ca77George Wilson	for (int c = newc = 0; c < oldc; c++)
444fa9e406ahrens		if (pvd->vdev_child[c])
445fa9e406ahrens			newc++;
447fa9e406ahrens	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
449573ca77George Wilson	for (int c = newc = 0; c < oldc; c++) {
450fa9e406ahrens		if ((cvd = pvd->vdev_child[c]) != NULL) {
451fa9e406ahrens			newchild[newc] = cvd;
452fa9e406ahrens			cvd->vdev_id = newc++;
453fa9e406ahrens		}
454fa9e406ahrens	}
456fa9e406ahrens	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
457fa9e406ahrens	pvd->vdev_child = newchild;
458fa9e406ahrens	pvd->vdev_children = newc;
462fa9e406ahrens * Allocate and minimally initialize a vdev_t.
463fa9e406ahrens */
46488ecc94George Wilsonvdev_t *
465fa9e406ahrensvdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
467fa9e406ahrens	vdev_t *vd;
4685cabbc6Prashanth Sreenivasa	vdev_indirect_config_t *vic;
470fa9e406ahrens	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
4715cabbc6Prashanth Sreenivasa	vic = &vd->vdev_indirect_config;
4730e34b6abonwick	if (spa->spa_root_vdev == NULL) {
4740e34b6abonwick		ASSERT(ops == &vdev_root_ops);
4750e34b6abonwick		spa->spa_root_vdev = vd;
476e9103aaGarrett D'Amore		spa->spa_load_guid = spa_generate_guid(NULL);
4770e34b6abonwick	}
47988ecc94George Wilson	if (guid == 0 && ops != &vdev_hole_ops) {
4800e34b6abonwick		if (spa->spa_root_vdev == vd) {
4810e34b6abonwick			/*
4820e34b6abonwick			 * The root vdev's guid will also be the pool guid,
4830e34b6abonwick			 * which must be unique among all pools.
4840e34b6abonwick			 */
4851195e68Mark J Musante			guid = spa_generate_guid(NULL);
4860e34b6abonwick		} else {
4870e34b6abonwick			/*
4880e34b6abonwick			 * Any other vdev's guid must be unique within the pool.
4890e34b6abonwick			 */
4901195e68Mark J Musante			guid = spa_generate_guid(spa);
4910e34b6abonwick		}
4920e34b6abonwick		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
4930e34b6abonwick	}
495fa9e406ahrens	vd->vdev_spa = spa;
496fa9e406ahrens	vd->vdev_id = id;
497fa9e406ahrens	vd->vdev_guid = guid;
498fa9e406ahrens	vd->vdev_guid_sum = guid;
499fa9e406ahrens	vd->vdev_ops = ops;
500fa9e406ahrens	vd->vdev_state = VDEV_STATE_CLOSED;
50188ecc94George Wilson	vd->vdev_ishole = (ops == &vdev_hole_ops);
5025cabbc6Prashanth Sreenivasa	vic->vic_prev_indirect_vdev = UINT64_MAX;
5035cabbc6Prashanth Sreenivasa
5045cabbc6Prashanth Sreenivasa	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
5055cabbc6Prashanth Sreenivasa	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
5064d7988dPaul Dagnelie	vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
5074d7988dPaul Dagnelie	    0, 0);
509084fd14Brian Behlendorf	list_link_init(&vd->vdev_initialize_node);
510e0f1c0aOlaf Faaland	list_link_init(&vd->vdev_leaf_node);
511084fd14Brian Behlendorf	list_link_init(&vd->vdev_trim_node);
512fa9e406ahrens	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
5135ad8204nd	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
514e14bb32Jeff Bonwick	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
515a3874b8Toomas Soome	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
516094e47eGeorge Wilson	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
517094e47eGeorge Wilson	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
518094e47eGeorge Wilson	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
519094e47eGeorge Wilson	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
520084fd14Brian Behlendorf	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
521084fd14Brian Behlendorf	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
522084fd14Brian Behlendorf	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
523084fd14Brian Behlendorf	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
524084fd14Brian Behlendorf	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
525084fd14Brian Behlendorf	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
526094e47eGeorge Wilson
5278ad4d6dJeff Bonwick	for (int t = 0; t < DTL_TYPES; t++) {
5284d7988dPaul Dagnelie		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
5294d7988dPaul Dagnelie		    0);
5308ad4d6dJeff Bonwick	}
531b7b2590Matthew Ahrens	txg_list_create(&vd->vdev_ms_list, spa,
532fa9e406ahrens	    offsetof(struct metaslab, ms_txg_node));
533b7b2590Matthew Ahrens	txg_list_create(&vd->vdev_dtl_list, spa,
534fa9e406ahrens	    offsetof(struct vdev, vdev_dtl_node));
535fa9e406ahrens	vd->vdev_stat.vs_timestamp = gethrtime();
5363d7072feschrock	vdev_queue_init(vd);
5373d7072feschrock	vdev_cache_init(vd);
539fa9e406ahrens	return (vd);
543fa9e406ahrens * Allocate a new vdev.  The 'alloctype' is used to control whether we are
544fa9e406ahrens * creating a new vdev or loading an existing one - the behavior is slightly
545fa9e406ahrens * different for each case.
546fa9e406ahrens */
54899653d4eschrockvdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
54999653d4eschrock    int alloctype)
551fa9e406ahrens	vdev_ops_t *ops;
552fa9e406ahrens	char *type;
5538654d02perrin	uint64_t guid = 0, islog, nparity;
554fa9e406ahrens	vdev_t *vd;
5555cabbc6Prashanth Sreenivasa	vdev_indirect_config_t *vic;
556663207aDon Brady	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
557663207aDon Brady	boolean_t top_level = (parent && !parent->vdev_parent);
559e14bb32Jeff Bonwick	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
561fa9e406ahrens	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
562be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
564fa9e406ahrens	if ((ops = vdev_getops(type)) == NULL)
565be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
567fa9e406ahrens	/*
568fa9e406ahrens	 * If this is a load, get the vdev guid from the nvlist.
569fa9e406ahrens	 * Otherwise, vdev_alloc_common() will generate one for us.
570fa9e406ahrens	 */
571fa9e406ahrens	if (alloctype == VDEV_ALLOC_LOAD) {
572fa9e406ahrens		uint64_t label_id;
574fa9e406ahrens		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
575fa9e406ahrens		    label_id != id)
576be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
578fa9e406ahrens		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
579be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
58099653d4eschrock	} else if (alloctype == VDEV_ALLOC_SPARE) {
58199653d4eschrock		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
582be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
583fa94a07brendan	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
584fa94a07brendan		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
585be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
58621ecdf6Lin Ling	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
58721ecdf6Lin Ling		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
588be6fd75Matthew Ahrens			return (SET_ERROR(EINVAL));
589fa9e406ahrens	}
59199653d4eschrock	/*
59299653d4eschrock	 * The first allocated vdev must be of type 'root'.
59399653d4eschrock	 */
59499653d4eschrock	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
595be6fd75Matthew Ahrens		return (SET_ERROR(EINVAL));
5978654d02perrin	/*
5988654d02perrin	 * Determine whether we're a log vdev.
5998654d02perrin	 */
6008654d02perrin	islog = 0;
6018654d02perrin	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
602990b485lling	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
603be6fd75Matthew Ahrens		return (SET_ERROR(ENOTSUP));
60588ecc94George Wilson	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
606be6fd75Matthew Ahrens		return (SET_ERROR(ENOTSUP));
60788ecc94George Wilson
608fa9e406ahrens	/*
6098654d02perrin	 * Set the nparity property for RAID-Z vdevs.
61099653d4eschrock	 */
6118654d02perrin	nparity = -1ULL;
61299653d4eschrock	if (ops == &vdev_raidz_ops) {
61399653d4eschrock		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
6148654d02perrin		    &nparity) == 0) {
615b24ab67Jeff Bonwick			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
616be6fd75Matthew Ahrens				return (SET_ERROR(EINVAL));
61799653d4eschrock			/*
618f94275cAdam Leventhal			 * Previous versions could only support 1 or 2 parity
619f94275cAdam Leventhal			 * device.
62099653d4eschrock			 */
621f94275cAdam Leventhal			if (nparity > 1 &&
622f94275cAdam Leventhal			    spa_version(spa) < SPA_VERSION_RAIDZ2)
623be6fd75Matthew Ahrens				return (SET_ERROR(ENOTSUP));
624f94275cAdam Leventhal			if (nparity > 2 &&
625f94275cAdam Leventhal			    spa_version(spa) < SPA_VERSION_RAIDZ3)
626be6fd75Matthew Ahrens				return (SET_ERROR(ENOTSUP));
62799653d4eschrock		} else {
62899653d4eschrock			/*
62999653d4eschrock			 * We require the parity to be specified for SPAs that
63099653d4eschrock			 * support multiple parity levels.
63199653d4eschrock			 */
632f94275cAdam Leventhal			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
633be6fd75Matthew Ahrens				return (SET_ERROR(EINVAL));
63499653d4eschrock			/*
63599653d4eschrock			 * Otherwise, we default to 1 parity device for RAID-Z.
63699653d4eschrock			 */
6378654d02perrin			nparity = 1;
63899653d4eschrock		}
63999653d4eschrock	} else {
6408654d02perrin		nparity = 0;
64199653d4eschrock	}
6428654d02perrin	ASSERT(nparity != -1ULL);
644663207aDon Brady	/*
645663207aDon Brady	 * If creating a top-level vdev, check for allocation classes input
646663207aDon Brady	 */
647663207aDon Brady	if (top_level && alloctype == VDEV_ALLOC_ADD) {
648663207aDon Brady		char *bias;
649663207aDon Brady
650663207aDon Brady		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
651663207aDon Brady		    &bias) == 0) {
652663207aDon Brady			alloc_bias = vdev_derive_alloc_bias(bias);
653663207aDon Brady
654663207aDon Brady			/* spa_vdev_add() expects feature to be enabled */
655c1064fdJerry Jelinek			if (alloc_bias != VDEV_BIAS_LOG &&
656c1064fdJerry Jelinek			    spa->spa_load_state != SPA_LOAD_CREATE &&
657663207aDon Brady			    !spa_feature_is_enabled(spa,
658663207aDon Brady			    SPA_FEATURE_ALLOCATION_CLASSES)) {
659663207aDon Brady				return (SET_ERROR(ENOTSUP));
660663207aDon Brady			}
661663207aDon Brady		}
662663207aDon Brady	}
663663207aDon Brady
6648654d02perrin	vd = vdev_alloc_common(spa, id, guid, ops);
6655cabbc6Prashanth Sreenivasa	vic = &vd->vdev_indirect_config;
6678654d02perrin	vd->vdev_islog = islog;
6688654d02perrin	vd->vdev_nparity = nparity;
669663207aDon Brady	if (top_level && alloc_bias != VDEV_BIAS_NONE)
670663207aDon Brady		vd->vdev_alloc_bias = alloc_bias;
6728654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
6738654d02perrin		vd->vdev_path = spa_strdup(vd->vdev_path);
6748654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
6758654d02perrin		vd->vdev_devid = spa_strdup(vd->vdev_devid);
6768654d02perrin	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
6778654d02perrin	    &vd->vdev_physpath) == 0)
6788654d02perrin		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
6796809eb4Eric Schrock	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
6806809eb4Eric Schrock		vd->vdev_fru = spa_strdup(vd->vdev_fru);
68299653d4eschrock	/*
683afefbcdeschrock	 * Set the whole_disk property.  If it's not specified, leave the value
684afefbcdeschrock	 * as -1.
685afefbcdeschrock	 */
686afefbcdeschrock	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
687afefbcdeschrock	    &vd->vdev_wholedisk) != 0)
688afefbcdeschrock		vd->vdev_wholedisk = -1ULL;
6905cabbc6Prashanth Sreenivasa	ASSERT0(vic->vic_mapping_object);
6915cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
6925cabbc6Prashanth Sreenivasa	    &vic->vic_mapping_object);
6935cabbc6Prashanth Sreenivasa	ASSERT0(vic->vic_births_object);
6945cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
6955cabbc6Prashanth Sreenivasa	    &vic->vic_births_object);
6965cabbc6Prashanth Sreenivasa	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
6975cabbc6Prashanth Sreenivasa	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
6985cabbc6Prashanth Sreenivasa	    &vic->vic_prev_indirect_vdev);
6995cabbc6Prashanth Sreenivasa
700afefbcdeschrock	/*
701ea8dc4beschrock	 * Look for the 'not present' flag.  This will only be set if the device
702ea8dc4beschrock	 * was not present at the time of import.
703ea8dc4beschrock	 */
7046809eb4Eric Schrock	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
7056809eb4Eric Schrock	    &vd->vdev_not_present);
707ea8dc4beschrock	/*
708ecc2d60bonwick	 * Get the alignment requirement.
709ecc2d60bonwick	 */
710ecc2d60bonwick	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
712ecc2d60bonwick	/*
71388ecc94George Wilson	 * Retrieve the vdev creation time.
71488ecc94George Wilson	 */
71588ecc94George Wilson	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
71688ecc94George Wilson	    &vd->vdev_crtxg);
71788ecc94George Wilson
71888ecc94George Wilson	/*
719fa9e406ahrens	 * If we're a top-level vdev, try to load the allocation parameters.
720fa9e406ahrens	 */
721663207aDon Brady	if (top_level &&
7221195e68Mark J Musante	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
723fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
724fa9e406ahrens		    &vd->vdev_ms_array);
725fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
726fa9e406ahrens		    &vd->vdev_ms_shift);
727fa9e406ahrens		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
728fa9e406ahrens		    &vd->vdev_asize);
7293f9d6adLin Ling		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
7303f9d6adLin Ling		    &vd->vdev_removing);
731215198aJoe Stein		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
732215198aJoe Stein		    &vd->vdev_top_zap);
733215198aJoe Stein	} else {
734215198aJoe Stein		ASSERT0(vd->vdev_top_zap);
735fa9e406ahrens	}
737663207aDon Brady	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
738a152156Jeff Bonwick		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
7399f4ab4dGeorge Wilson		    alloctype == VDEV_ALLOC_ADD ||
7401195e68Mark J Musante		    alloctype == VDEV_ALLOC_SPLIT ||
7419f4ab4dGeorge Wilson		    alloctype == VDEV_ALLOC_ROOTPOOL);
742663207aDon Brady		/* Note: metaslab_group_create() is now deferred */
743a152156Jeff Bonwick	}
744a152156Jeff Bonwick
745215198aJoe Stein	if (vd->vdev_ops->vdev_op_leaf &&
746215198aJoe Stein	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
747215198aJoe Stein		(void) nvlist_lookup_uint64(nv,
748215198aJoe Stein		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
749215198aJoe Stein	} else {
750215198aJoe Stein		ASSERT0(vd->vdev_leaf_zap);
751215198aJoe Stein	}
752215198aJoe Stein
753fa9e406ahrens	/*
7543d7072feschrock	 * If we're a leaf vdev, try to load the DTL object and other state.
755fa9e406ahrens	 */
756215198aJoe Stein
757c5904d1eschrock	if (vd->vdev_ops->vdev_op_leaf &&
75821ecdf6Lin Ling	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
75921ecdf6Lin Ling	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
760c5904d1eschrock		if (alloctype == VDEV_ALLOC_LOAD) {
761c5904d1eschrock			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,