xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev.c (revision 441d80aa)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5*441d80aaSlling  * Common Development and Distribution License (the "License").
6*441d80aaSlling  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
2234f18512Seschrock  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27fa9e4066Sahrens 
28fa9e4066Sahrens #include <sys/zfs_context.h>
29fa9e4066Sahrens #include <sys/spa.h>
30fa9e4066Sahrens #include <sys/spa_impl.h>
31fa9e4066Sahrens #include <sys/dmu.h>
32fa9e4066Sahrens #include <sys/dmu_tx.h>
33fa9e4066Sahrens #include <sys/vdev_impl.h>
34fa9e4066Sahrens #include <sys/uberblock_impl.h>
35fa9e4066Sahrens #include <sys/metaslab.h>
36fa9e4066Sahrens #include <sys/metaslab_impl.h>
37fa9e4066Sahrens #include <sys/space_map.h>
38fa9e4066Sahrens #include <sys/zio.h>
39fa9e4066Sahrens #include <sys/zap.h>
40fa9e4066Sahrens #include <sys/fs/zfs.h>
41fa9e4066Sahrens 
42fa9e4066Sahrens /*
43fa9e4066Sahrens  * Virtual device management.
44fa9e4066Sahrens  */
45fa9e4066Sahrens 
46fa9e4066Sahrens static vdev_ops_t *vdev_ops_table[] = {
47fa9e4066Sahrens 	&vdev_root_ops,
48fa9e4066Sahrens 	&vdev_raidz_ops,
49fa9e4066Sahrens 	&vdev_mirror_ops,
50fa9e4066Sahrens 	&vdev_replacing_ops,
51fa9e4066Sahrens 	&vdev_disk_ops,
52fa9e4066Sahrens 	&vdev_file_ops,
53fa9e4066Sahrens 	&vdev_missing_ops,
54fa9e4066Sahrens 	NULL
55fa9e4066Sahrens };
56fa9e4066Sahrens 
57fa9e4066Sahrens /*
58fa9e4066Sahrens  * Given a vdev type, return the appropriate ops vector.
59fa9e4066Sahrens  */
60fa9e4066Sahrens static vdev_ops_t *
61fa9e4066Sahrens vdev_getops(const char *type)
62fa9e4066Sahrens {
63fa9e4066Sahrens 	vdev_ops_t *ops, **opspp;
64fa9e4066Sahrens 
65fa9e4066Sahrens 	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
66fa9e4066Sahrens 		if (strcmp(ops->vdev_op_type, type) == 0)
67fa9e4066Sahrens 			break;
68fa9e4066Sahrens 
69fa9e4066Sahrens 	return (ops);
70fa9e4066Sahrens }
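
/*
 * A minimal usage sketch: vdev_alloc() below uses this to map a config's
 * ZPOOL_CONFIG_TYPE string to an ops vector, e.g.:
 *
 *	vdev_ops_t *ops = vdev_getops("mirror");
 *	if (ops == NULL)
 *		return (NULL);
 */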
71fa9e4066Sahrens 
72fa9e4066Sahrens /*
73fa9e4066Sahrens  * Default asize function: return the MAX of psize (rounded up to ashift
74fa9e4066Sahrens  * alignment) and the asize of all children.  Used by everything but RAID-Z.
75fa9e4066Sahrens  */
76fa9e4066Sahrens uint64_t
77fa9e4066Sahrens vdev_default_asize(vdev_t *vd, uint64_t psize)
78fa9e4066Sahrens {
79fa9e4066Sahrens 	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
80fa9e4066Sahrens 	uint64_t csize;
81fa9e4066Sahrens 	uint64_t c;
82fa9e4066Sahrens 
83fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++) {
84fa9e4066Sahrens 		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
85fa9e4066Sahrens 		asize = MAX(asize, csize);
86fa9e4066Sahrens 	}
87fa9e4066Sahrens 
88fa9e4066Sahrens 	return (asize);
89fa9e4066Sahrens }
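
/*
 * A worked example, assuming ashift = 9 (512-byte sectors): psize = 10000
 * rounds up to P2ROUNDUP(10000, 512) = 10240, and the result is the larger
 * of that and each child's vdev_psize_to_asize() for the same psize.
 */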
90fa9e4066Sahrens 
912a79c5feSlling /*
922a79c5feSlling  * Get the replaceable or attachable device size.
932a79c5feSlling  * If the parent is a mirror or raidz, the replaceable size is the minimum
942a79c5feSlling  * psize of all its children. For the rest, just return our own psize.
952a79c5feSlling  *
962a79c5feSlling  * e.g.
972a79c5feSlling  *			psize	rsize
982a79c5feSlling  * root			-	-
992a79c5feSlling  *	mirror/raidz	-	-
1002a79c5feSlling  *	    disk1	20g	20g
1012a79c5feSlling  *	    disk2 	40g	20g
1022a79c5feSlling  *	disk3 		80g	80g
1032a79c5feSlling  */
1042a79c5feSlling uint64_t
1052a79c5feSlling vdev_get_rsize(vdev_t *vd)
1062a79c5feSlling {
1072a79c5feSlling 	vdev_t *pvd, *cvd;
1082a79c5feSlling 	uint64_t c, rsize;
1092a79c5feSlling 
1102a79c5feSlling 	pvd = vd->vdev_parent;
1112a79c5feSlling 
1122a79c5feSlling 	/*
1132a79c5feSlling 	 * If our parent is NULL or the root, just return our own psize.
1142a79c5feSlling 	 */
1152a79c5feSlling 	if (pvd == NULL || pvd->vdev_parent == NULL)
1162a79c5feSlling 		return (vd->vdev_psize);
1172a79c5feSlling 
1182a79c5feSlling 	rsize = 0;
1192a79c5feSlling 
1202a79c5feSlling 	for (c = 0; c < pvd->vdev_children; c++) {
1212a79c5feSlling 		cvd = pvd->vdev_child[c];
1222a79c5feSlling 		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
1232a79c5feSlling 	}
1242a79c5feSlling 
1252a79c5feSlling 	return (rsize);
1262a79c5feSlling }
1272a79c5feSlling 
128fa9e4066Sahrens vdev_t *
129fa9e4066Sahrens vdev_lookup_top(spa_t *spa, uint64_t vdev)
130fa9e4066Sahrens {
131fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
132fa9e4066Sahrens 
133fa9e4066Sahrens 	if (vdev < rvd->vdev_children)
134fa9e4066Sahrens 		return (rvd->vdev_child[vdev]);
135fa9e4066Sahrens 
136fa9e4066Sahrens 	return (NULL);
137fa9e4066Sahrens }
138fa9e4066Sahrens 
139fa9e4066Sahrens vdev_t *
140fa9e4066Sahrens vdev_lookup_by_path(vdev_t *vd, const char *path)
141fa9e4066Sahrens {
142fa9e4066Sahrens 	int c;
143fa9e4066Sahrens 	vdev_t *mvd;
144fa9e4066Sahrens 
14534f18512Seschrock 	if (vd->vdev_path != NULL) {
14634f18512Seschrock 		if (vd->vdev_wholedisk == 1) {
14734f18512Seschrock 			/*
14834f18512Seschrock 			 * For whole disks, the internal path has 's0', but the
14934f18512Seschrock 			 * path passed in by the user doesn't.
15034f18512Seschrock 			 */
15134f18512Seschrock 			if (strlen(path) == strlen(vd->vdev_path) - 2 &&
15234f18512Seschrock 			    strncmp(path, vd->vdev_path, strlen(path)) == 0)
15334f18512Seschrock 				return (vd);
15434f18512Seschrock 		} else if (strcmp(path, vd->vdev_path) == 0) {
15534f18512Seschrock 			return (vd);
15634f18512Seschrock 		}
15734f18512Seschrock 	}
158fa9e4066Sahrens 
159fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
160fa9e4066Sahrens 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
161fa9e4066Sahrens 		    NULL)
162fa9e4066Sahrens 			return (mvd);
163fa9e4066Sahrens 
164fa9e4066Sahrens 	return (NULL);
165fa9e4066Sahrens }
166fa9e4066Sahrens 
167fa9e4066Sahrens vdev_t *
168fa9e4066Sahrens vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
169fa9e4066Sahrens {
170fa9e4066Sahrens 	int c;
171fa9e4066Sahrens 	vdev_t *mvd;
172fa9e4066Sahrens 
173fa9e4066Sahrens 	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
174fa9e4066Sahrens 		return (vd);
175fa9e4066Sahrens 
176fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
177fa9e4066Sahrens 		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
178fa9e4066Sahrens 		    NULL)
179fa9e4066Sahrens 			return (mvd);
180fa9e4066Sahrens 
181fa9e4066Sahrens 	return (NULL);
182fa9e4066Sahrens }
183fa9e4066Sahrens 
184fa9e4066Sahrens void
185fa9e4066Sahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd)
186fa9e4066Sahrens {
187fa9e4066Sahrens 	size_t oldsize, newsize;
188fa9e4066Sahrens 	uint64_t id = cvd->vdev_id;
189fa9e4066Sahrens 	vdev_t **newchild;
190fa9e4066Sahrens 
191fa9e4066Sahrens 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
192fa9e4066Sahrens 	ASSERT(cvd->vdev_parent == NULL);
193fa9e4066Sahrens 
194fa9e4066Sahrens 	cvd->vdev_parent = pvd;
195fa9e4066Sahrens 
196fa9e4066Sahrens 	if (pvd == NULL)
197fa9e4066Sahrens 		return;
198fa9e4066Sahrens 
199fa9e4066Sahrens 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
200fa9e4066Sahrens 
201fa9e4066Sahrens 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
202fa9e4066Sahrens 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
203fa9e4066Sahrens 	newsize = pvd->vdev_children * sizeof (vdev_t *);
204fa9e4066Sahrens 
205fa9e4066Sahrens 	newchild = kmem_zalloc(newsize, KM_SLEEP);
206fa9e4066Sahrens 	if (pvd->vdev_child != NULL) {
207fa9e4066Sahrens 		bcopy(pvd->vdev_child, newchild, oldsize);
208fa9e4066Sahrens 		kmem_free(pvd->vdev_child, oldsize);
209fa9e4066Sahrens 	}
210fa9e4066Sahrens 
211fa9e4066Sahrens 	pvd->vdev_child = newchild;
212fa9e4066Sahrens 	pvd->vdev_child[id] = cvd;
213fa9e4066Sahrens 
214fa9e4066Sahrens 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
215fa9e4066Sahrens 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
216fa9e4066Sahrens 
217fa9e4066Sahrens 	/*
218fa9e4066Sahrens 	 * Walk up all ancestors to update guid sum.
219fa9e4066Sahrens 	 */
220fa9e4066Sahrens 	for (; pvd != NULL; pvd = pvd->vdev_parent)
221fa9e4066Sahrens 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
222fa9e4066Sahrens }
223fa9e4066Sahrens 
224fa9e4066Sahrens void
225fa9e4066Sahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
226fa9e4066Sahrens {
227fa9e4066Sahrens 	int c;
228fa9e4066Sahrens 	uint_t id = cvd->vdev_id;
229fa9e4066Sahrens 
230fa9e4066Sahrens 	ASSERT(cvd->vdev_parent == pvd);
231fa9e4066Sahrens 
232fa9e4066Sahrens 	if (pvd == NULL)
233fa9e4066Sahrens 		return;
234fa9e4066Sahrens 
235fa9e4066Sahrens 	ASSERT(id < pvd->vdev_children);
236fa9e4066Sahrens 	ASSERT(pvd->vdev_child[id] == cvd);
237fa9e4066Sahrens 
238fa9e4066Sahrens 	pvd->vdev_child[id] = NULL;
239fa9e4066Sahrens 	cvd->vdev_parent = NULL;
240fa9e4066Sahrens 
241fa9e4066Sahrens 	for (c = 0; c < pvd->vdev_children; c++)
242fa9e4066Sahrens 		if (pvd->vdev_child[c])
243fa9e4066Sahrens 			break;
244fa9e4066Sahrens 
245fa9e4066Sahrens 	if (c == pvd->vdev_children) {
246fa9e4066Sahrens 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
247fa9e4066Sahrens 		pvd->vdev_child = NULL;
248fa9e4066Sahrens 		pvd->vdev_children = 0;
249fa9e4066Sahrens 	}
250fa9e4066Sahrens 
251fa9e4066Sahrens 	/*
252fa9e4066Sahrens 	 * Walk up all ancestors to update guid sum.
253fa9e4066Sahrens 	 */
254fa9e4066Sahrens 	for (; pvd != NULL; pvd = pvd->vdev_parent)
255fa9e4066Sahrens 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
256fa9e4066Sahrens }
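
/*
 * A note on the guid-sum walks above: vdev_guid_sum for any vdev is its own
 * guid plus the guid sums of all its descendants.  vdev_add_child() and
 * vdev_remove_child() preserve that invariant, which is what lets vdev_free()
 * assert vdev_guid_sum == vdev_guid once all children have been removed.
 */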
257fa9e4066Sahrens 
258fa9e4066Sahrens /*
259fa9e4066Sahrens  * Remove any holes in the child array.
260fa9e4066Sahrens  */
261fa9e4066Sahrens void
262fa9e4066Sahrens vdev_compact_children(vdev_t *pvd)
263fa9e4066Sahrens {
264fa9e4066Sahrens 	vdev_t **newchild, *cvd;
265fa9e4066Sahrens 	int oldc = pvd->vdev_children;
266fa9e4066Sahrens 	int newc, c;
267fa9e4066Sahrens 
268fa9e4066Sahrens 	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
269fa9e4066Sahrens 
270fa9e4066Sahrens 	for (c = newc = 0; c < oldc; c++)
271fa9e4066Sahrens 		if (pvd->vdev_child[c])
272fa9e4066Sahrens 			newc++;
273fa9e4066Sahrens 
274fa9e4066Sahrens 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
275fa9e4066Sahrens 
276fa9e4066Sahrens 	for (c = newc = 0; c < oldc; c++) {
277fa9e4066Sahrens 		if ((cvd = pvd->vdev_child[c]) != NULL) {
278fa9e4066Sahrens 			newchild[newc] = cvd;
279fa9e4066Sahrens 			cvd->vdev_id = newc++;
280fa9e4066Sahrens 		}
281fa9e4066Sahrens 	}
282fa9e4066Sahrens 
283fa9e4066Sahrens 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
284fa9e4066Sahrens 	pvd->vdev_child = newchild;
285fa9e4066Sahrens 	pvd->vdev_children = newc;
286fa9e4066Sahrens }
287fa9e4066Sahrens 
288fa9e4066Sahrens /*
289fa9e4066Sahrens  * Allocate and minimally initialize a vdev_t.
290fa9e4066Sahrens  */
291fa9e4066Sahrens static vdev_t *
292fa9e4066Sahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
293fa9e4066Sahrens {
294fa9e4066Sahrens 	vdev_t *vd;
295fa9e4066Sahrens 
296fa9e4066Sahrens 	while (guid == 0)
297fa9e4066Sahrens 		guid = spa_get_random(-1ULL);
298fa9e4066Sahrens 
299fa9e4066Sahrens 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
300fa9e4066Sahrens 
301fa9e4066Sahrens 	vd->vdev_spa = spa;
302fa9e4066Sahrens 	vd->vdev_id = id;
303fa9e4066Sahrens 	vd->vdev_guid = guid;
304fa9e4066Sahrens 	vd->vdev_guid_sum = guid;
305fa9e4066Sahrens 	vd->vdev_ops = ops;
306fa9e4066Sahrens 	vd->vdev_state = VDEV_STATE_CLOSED;
307fa9e4066Sahrens 
308fa9e4066Sahrens 	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
309fa9e4066Sahrens 	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
310fa9e4066Sahrens 	list_create(&vd->vdev_io_pending, sizeof (zio_t),
311fa9e4066Sahrens 	    offsetof(zio_t, io_pending));
312fa9e4066Sahrens 	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
313fa9e4066Sahrens 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
314fa9e4066Sahrens 	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
315fa9e4066Sahrens 	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
316fa9e4066Sahrens 	txg_list_create(&vd->vdev_ms_list,
317fa9e4066Sahrens 	    offsetof(struct metaslab, ms_txg_node));
318fa9e4066Sahrens 	txg_list_create(&vd->vdev_dtl_list,
319fa9e4066Sahrens 	    offsetof(struct vdev, vdev_dtl_node));
320fa9e4066Sahrens 	vd->vdev_stat.vs_timestamp = gethrtime();
321fa9e4066Sahrens 
322fa9e4066Sahrens 	return (vd);
323fa9e4066Sahrens }
324fa9e4066Sahrens 
325fa9e4066Sahrens /*
326fa9e4066Sahrens  * Free a vdev_t that has been removed from service.
327fa9e4066Sahrens  */
328fa9e4066Sahrens static void
329fa9e4066Sahrens vdev_free_common(vdev_t *vd)
330fa9e4066Sahrens {
331fa9e4066Sahrens 	if (vd->vdev_path)
332fa9e4066Sahrens 		spa_strfree(vd->vdev_path);
333fa9e4066Sahrens 	if (vd->vdev_devid)
334fa9e4066Sahrens 		spa_strfree(vd->vdev_devid);
335fa9e4066Sahrens 
336fa9e4066Sahrens 	txg_list_destroy(&vd->vdev_ms_list);
337fa9e4066Sahrens 	txg_list_destroy(&vd->vdev_dtl_list);
338fa9e4066Sahrens 	mutex_enter(&vd->vdev_dtl_lock);
339fa9e4066Sahrens 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
340fa9e4066Sahrens 	space_map_destroy(&vd->vdev_dtl_map);
341fa9e4066Sahrens 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
342fa9e4066Sahrens 	space_map_destroy(&vd->vdev_dtl_scrub);
343fa9e4066Sahrens 	mutex_exit(&vd->vdev_dtl_lock);
344fa9e4066Sahrens 	mutex_destroy(&vd->vdev_dtl_lock);
345fa9e4066Sahrens 	mutex_destroy(&vd->vdev_dirty_lock);
346fa9e4066Sahrens 	list_destroy(&vd->vdev_io_pending);
347fa9e4066Sahrens 	mutex_destroy(&vd->vdev_io_lock);
348fa9e4066Sahrens 	cv_destroy(&vd->vdev_io_cv);
349fa9e4066Sahrens 
350fa9e4066Sahrens 	kmem_free(vd, sizeof (vdev_t));
351fa9e4066Sahrens }
352fa9e4066Sahrens 
353fa9e4066Sahrens /*
354fa9e4066Sahrens  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
355fa9e4066Sahrens  * creating a new vdev or loading an existing one - the behavior is slightly
356fa9e4066Sahrens  * different for each case.
357fa9e4066Sahrens  */
358fa9e4066Sahrens vdev_t *
359fa9e4066Sahrens vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
360fa9e4066Sahrens {
361fa9e4066Sahrens 	vdev_ops_t *ops;
362fa9e4066Sahrens 	char *type;
363*441d80aaSlling 	uint64_t guid = 0, offline = 0;
364fa9e4066Sahrens 	vdev_t *vd;
365fa9e4066Sahrens 
366fa9e4066Sahrens 	ASSERT(spa_config_held(spa, RW_WRITER));
367fa9e4066Sahrens 
368fa9e4066Sahrens 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
369fa9e4066Sahrens 		return (NULL);
370fa9e4066Sahrens 
371fa9e4066Sahrens 	if ((ops = vdev_getops(type)) == NULL)
372fa9e4066Sahrens 		return (NULL);
373fa9e4066Sahrens 
374fa9e4066Sahrens 	/*
375fa9e4066Sahrens 	 * If this is a load, get the vdev guid from the nvlist.
376fa9e4066Sahrens 	 * Otherwise, vdev_alloc_common() will generate one for us.
377fa9e4066Sahrens 	 */
378fa9e4066Sahrens 	if (alloctype == VDEV_ALLOC_LOAD) {
379fa9e4066Sahrens 		uint64_t label_id;
380fa9e4066Sahrens 
381fa9e4066Sahrens 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
382fa9e4066Sahrens 		    label_id != id)
383fa9e4066Sahrens 			return (NULL);
384fa9e4066Sahrens 
385fa9e4066Sahrens 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
386fa9e4066Sahrens 			return (NULL);
387fa9e4066Sahrens 	}
388fa9e4066Sahrens 
389fa9e4066Sahrens 	vd = vdev_alloc_common(spa, id, guid, ops);
390fa9e4066Sahrens 
391fa9e4066Sahrens 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
392fa9e4066Sahrens 		vd->vdev_path = spa_strdup(vd->vdev_path);
393fa9e4066Sahrens 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
394fa9e4066Sahrens 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
395fa9e4066Sahrens 
396afefbcddSeschrock 	/*
397afefbcddSeschrock 	 * Set the whole_disk property.  If it's not specified, leave the value
398afefbcddSeschrock 	 * as -1.
399afefbcddSeschrock 	 */
400afefbcddSeschrock 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
401afefbcddSeschrock 	    &vd->vdev_wholedisk) != 0)
402afefbcddSeschrock 		vd->vdev_wholedisk = -1ULL;
403afefbcddSeschrock 
404fa9e4066Sahrens 	/*
405fa9e4066Sahrens 	 * If we're a top-level vdev, try to load the allocation parameters.
406fa9e4066Sahrens 	 */
407fa9e4066Sahrens 	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
408fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
409fa9e4066Sahrens 		    &vd->vdev_ms_array);
410fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
411fa9e4066Sahrens 		    &vd->vdev_ms_shift);
412fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
413fa9e4066Sahrens 		    &vd->vdev_ashift);
414fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
415fa9e4066Sahrens 		    &vd->vdev_asize);
416fa9e4066Sahrens 	}
417fa9e4066Sahrens 
418fa9e4066Sahrens 	/*
419*441d80aaSlling 	 * If we're a leaf vdev, try to load the DTL object
420*441d80aaSlling 	 * and the offline state.
421fa9e4066Sahrens 	 */
422*441d80aaSlling 	vd->vdev_offline = B_FALSE;
423fa9e4066Sahrens 	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
424fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
425fa9e4066Sahrens 		    &vd->vdev_dtl.smo_object);
426*441d80aaSlling 
427*441d80aaSlling 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &offline)
428*441d80aaSlling 		    == 0)
429*441d80aaSlling 			vd->vdev_offline = offline;
430fa9e4066Sahrens 	}
431fa9e4066Sahrens 
432fa9e4066Sahrens 	/*
433fa9e4066Sahrens 	 * Add ourselves to the parent's list of children.
434fa9e4066Sahrens 	 */
435fa9e4066Sahrens 	vdev_add_child(parent, vd);
436fa9e4066Sahrens 
437fa9e4066Sahrens 	return (vd);
438fa9e4066Sahrens }
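
/*
 * A rough sketch of the kind of nvlist vdev_alloc() consumes for a leaf
 * disk (illustrative only; real configs come from libzfs and the label /
 * spa_config code):
 *
 *	nvlist_t *nv;
 *	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, "disk") == 0);
 *	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
 *	    "/dev/dsk/c1t0d0s0") == 0);
 *	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1ULL) == 0);
 */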
439fa9e4066Sahrens 
440fa9e4066Sahrens void
441fa9e4066Sahrens vdev_free(vdev_t *vd)
442fa9e4066Sahrens {
443fa9e4066Sahrens 	int c;
444fa9e4066Sahrens 
445fa9e4066Sahrens 	/*
446fa9e4066Sahrens 	 * vdev_free() implies closing the vdev first.  This is simpler than
447fa9e4066Sahrens 	 * trying to ensure complicated semantics for all callers.
448fa9e4066Sahrens 	 */
449fa9e4066Sahrens 	vdev_close(vd);
450fa9e4066Sahrens 
451fa9e4066Sahrens 	/*
452fa9e4066Sahrens 	 * It's possible to free a vdev that's been added to the dirty
453fa9e4066Sahrens 	 * list when in the middle of spa_vdev_add().  Handle that case
454fa9e4066Sahrens 	 * correctly here.
455fa9e4066Sahrens 	 */
456fa9e4066Sahrens 	if (vd->vdev_is_dirty)
457fa9e4066Sahrens 		vdev_config_clean(vd);
458fa9e4066Sahrens 
459fa9e4066Sahrens 	/*
460fa9e4066Sahrens 	 * Free all children.
461fa9e4066Sahrens 	 */
462fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
463fa9e4066Sahrens 		vdev_free(vd->vdev_child[c]);
464fa9e4066Sahrens 
465fa9e4066Sahrens 	ASSERT(vd->vdev_child == NULL);
466fa9e4066Sahrens 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
467fa9e4066Sahrens 
468fa9e4066Sahrens 	/*
469fa9e4066Sahrens 	 * Discard allocation state.
470fa9e4066Sahrens 	 */
471fa9e4066Sahrens 	if (vd == vd->vdev_top)
472fa9e4066Sahrens 		vdev_metaslab_fini(vd);
473fa9e4066Sahrens 
474fa9e4066Sahrens 	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
475fa9e4066Sahrens 	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
476fa9e4066Sahrens 
477fa9e4066Sahrens 	/*
478fa9e4066Sahrens 	 * Remove this vdev from its parent's child list.
479fa9e4066Sahrens 	 */
480fa9e4066Sahrens 	vdev_remove_child(vd->vdev_parent, vd);
481fa9e4066Sahrens 
482fa9e4066Sahrens 	ASSERT(vd->vdev_parent == NULL);
483fa9e4066Sahrens 
484fa9e4066Sahrens 	vdev_free_common(vd);
485fa9e4066Sahrens }
486fa9e4066Sahrens 
487fa9e4066Sahrens /*
488fa9e4066Sahrens  * Transfer top-level vdev state from svd to tvd.
489fa9e4066Sahrens  */
490fa9e4066Sahrens static void
491fa9e4066Sahrens vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
492fa9e4066Sahrens {
493fa9e4066Sahrens 	spa_t *spa = svd->vdev_spa;
494fa9e4066Sahrens 	metaslab_t *msp;
495fa9e4066Sahrens 	vdev_t *vd;
496fa9e4066Sahrens 	int t;
497fa9e4066Sahrens 
498fa9e4066Sahrens 	ASSERT(tvd == tvd->vdev_top);
499fa9e4066Sahrens 
500fa9e4066Sahrens 	tvd->vdev_ms_array = svd->vdev_ms_array;
501fa9e4066Sahrens 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
502fa9e4066Sahrens 	tvd->vdev_ms_count = svd->vdev_ms_count;
503fa9e4066Sahrens 
504fa9e4066Sahrens 	svd->vdev_ms_array = 0;
505fa9e4066Sahrens 	svd->vdev_ms_shift = 0;
506fa9e4066Sahrens 	svd->vdev_ms_count = 0;
507fa9e4066Sahrens 
508fa9e4066Sahrens 	tvd->vdev_mg = svd->vdev_mg;
509fa9e4066Sahrens 	tvd->vdev_mg->mg_vd = tvd;
510fa9e4066Sahrens 	tvd->vdev_ms = svd->vdev_ms;
511fa9e4066Sahrens 	tvd->vdev_smo = svd->vdev_smo;
512fa9e4066Sahrens 
513fa9e4066Sahrens 	svd->vdev_mg = NULL;
514fa9e4066Sahrens 	svd->vdev_ms = NULL;
515fa9e4066Sahrens 	svd->vdev_smo = NULL;
516fa9e4066Sahrens 
517fa9e4066Sahrens 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
518fa9e4066Sahrens 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
519fa9e4066Sahrens 
520fa9e4066Sahrens 	svd->vdev_stat.vs_alloc = 0;
521fa9e4066Sahrens 	svd->vdev_stat.vs_space = 0;
522fa9e4066Sahrens 
523fa9e4066Sahrens 	for (t = 0; t < TXG_SIZE; t++) {
524fa9e4066Sahrens 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
525fa9e4066Sahrens 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
526fa9e4066Sahrens 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
527fa9e4066Sahrens 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
528fa9e4066Sahrens 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
529fa9e4066Sahrens 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
530fa9e4066Sahrens 		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
531fa9e4066Sahrens 		svd->vdev_dirty[t] = 0;
532fa9e4066Sahrens 	}
533fa9e4066Sahrens 
534fa9e4066Sahrens 	if (svd->vdev_is_dirty) {
535fa9e4066Sahrens 		vdev_config_clean(svd);
536fa9e4066Sahrens 		vdev_config_dirty(tvd);
537fa9e4066Sahrens 	}
538fa9e4066Sahrens 
539fa9e4066Sahrens 	ASSERT(svd->vdev_io_retry == NULL);
540fa9e4066Sahrens 	ASSERT(list_is_empty(&svd->vdev_io_pending));
541fa9e4066Sahrens }
542fa9e4066Sahrens 
543fa9e4066Sahrens static void
544fa9e4066Sahrens vdev_top_update(vdev_t *tvd, vdev_t *vd)
545fa9e4066Sahrens {
546fa9e4066Sahrens 	int c;
547fa9e4066Sahrens 
548fa9e4066Sahrens 	if (vd == NULL)
549fa9e4066Sahrens 		return;
550fa9e4066Sahrens 
551fa9e4066Sahrens 	vd->vdev_top = tvd;
552fa9e4066Sahrens 
553fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
554fa9e4066Sahrens 		vdev_top_update(tvd, vd->vdev_child[c]);
555fa9e4066Sahrens }
556fa9e4066Sahrens 
557fa9e4066Sahrens /*
558fa9e4066Sahrens  * Add a mirror/replacing vdev above an existing vdev.
559fa9e4066Sahrens  */
560fa9e4066Sahrens vdev_t *
561fa9e4066Sahrens vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
562fa9e4066Sahrens {
563fa9e4066Sahrens 	spa_t *spa = cvd->vdev_spa;
564fa9e4066Sahrens 	vdev_t *pvd = cvd->vdev_parent;
565fa9e4066Sahrens 	vdev_t *mvd;
566fa9e4066Sahrens 
567fa9e4066Sahrens 	ASSERT(spa_config_held(spa, RW_WRITER));
568fa9e4066Sahrens 
569fa9e4066Sahrens 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
570fa9e4066Sahrens 	vdev_remove_child(pvd, cvd);
571fa9e4066Sahrens 	vdev_add_child(pvd, mvd);
572fa9e4066Sahrens 	cvd->vdev_id = mvd->vdev_children;
573fa9e4066Sahrens 	vdev_add_child(mvd, cvd);
574fa9e4066Sahrens 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
575fa9e4066Sahrens 
576fa9e4066Sahrens 	mvd->vdev_asize = cvd->vdev_asize;
577fa9e4066Sahrens 	mvd->vdev_ashift = cvd->vdev_ashift;
578fa9e4066Sahrens 	mvd->vdev_state = cvd->vdev_state;
579fa9e4066Sahrens 
580fa9e4066Sahrens 	if (mvd == mvd->vdev_top)
581fa9e4066Sahrens 		vdev_top_transfer(cvd, mvd);
582fa9e4066Sahrens 
583fa9e4066Sahrens 	return (mvd);
584fa9e4066Sahrens }
585fa9e4066Sahrens 
586fa9e4066Sahrens /*
587fa9e4066Sahrens  * Remove a 1-way mirror/replacing vdev from the tree.
588fa9e4066Sahrens  */
589fa9e4066Sahrens void
590fa9e4066Sahrens vdev_remove_parent(vdev_t *cvd)
591fa9e4066Sahrens {
592fa9e4066Sahrens 	vdev_t *mvd = cvd->vdev_parent;
593fa9e4066Sahrens 	vdev_t *pvd = mvd->vdev_parent;
594fa9e4066Sahrens 
595fa9e4066Sahrens 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
596fa9e4066Sahrens 
597fa9e4066Sahrens 	ASSERT(mvd->vdev_children == 1);
598fa9e4066Sahrens 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
599fa9e4066Sahrens 	    mvd->vdev_ops == &vdev_replacing_ops);
600fa9e4066Sahrens 
601fa9e4066Sahrens 	vdev_remove_child(mvd, cvd);
602fa9e4066Sahrens 	vdev_remove_child(pvd, mvd);
603fa9e4066Sahrens 	cvd->vdev_id = mvd->vdev_id;
604fa9e4066Sahrens 	vdev_add_child(pvd, cvd);
605fa9e4066Sahrens 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
606fa9e4066Sahrens 
607fa9e4066Sahrens 	if (cvd == cvd->vdev_top)
608fa9e4066Sahrens 		vdev_top_transfer(mvd, cvd);
609fa9e4066Sahrens 
610fa9e4066Sahrens 	ASSERT(mvd->vdev_children == 0);
611fa9e4066Sahrens 	vdev_free(mvd);
612fa9e4066Sahrens }
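
/*
 * Taken together, vdev_add_parent() and vdev_remove_parent() let a mirror or
 * 'replacing' vdev be spliced in above an existing child, e.g. while a
 * replacement disk is being attached, and then removed again once only one
 * child is left; the child ends up with its original vdev_id after the
 * round trip.
 */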
613fa9e4066Sahrens 
614fa9e4066Sahrens void
615fa9e4066Sahrens vdev_metaslab_init(vdev_t *vd, uint64_t txg)
616fa9e4066Sahrens {
617fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
618fa9e4066Sahrens 	metaslab_class_t *mc = spa_metaslab_class_select(spa);
619fa9e4066Sahrens 	uint64_t c;
620fa9e4066Sahrens 	uint64_t oldc = vd->vdev_ms_count;
621fa9e4066Sahrens 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
622fa9e4066Sahrens 	space_map_obj_t *smo = vd->vdev_smo;
623fa9e4066Sahrens 	metaslab_t **mspp = vd->vdev_ms;
624fa9e4066Sahrens 
625fa9e4066Sahrens 	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
626fa9e4066Sahrens 
627fa9e4066Sahrens 	ASSERT(oldc <= newc);
628fa9e4066Sahrens 
629fa9e4066Sahrens 	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
630fa9e4066Sahrens 	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
631fa9e4066Sahrens 	vd->vdev_ms_count = newc;
632fa9e4066Sahrens 
633fa9e4066Sahrens 	if (vd->vdev_mg == NULL) {
634fa9e4066Sahrens 		if (txg == 0) {
635fa9e4066Sahrens 			dmu_buf_t *db;
636fa9e4066Sahrens 			uint64_t *ms_array;
637fa9e4066Sahrens 
638fa9e4066Sahrens 			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
639fa9e4066Sahrens 			    KM_SLEEP);
640fa9e4066Sahrens 
641fa9e4066Sahrens 			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
642fa9e4066Sahrens 			    0, newc * sizeof (uint64_t), ms_array);
643fa9e4066Sahrens 
644fa9e4066Sahrens 			for (c = 0; c < newc; c++) {
645fa9e4066Sahrens 				if (ms_array[c] == 0)
646fa9e4066Sahrens 					continue;
647fa9e4066Sahrens 				db = dmu_bonus_hold(spa->spa_meta_objset,
648fa9e4066Sahrens 				    ms_array[c]);
649fa9e4066Sahrens 				dmu_buf_read(db);
650fa9e4066Sahrens 				ASSERT3U(db->db_size, ==, sizeof (*smo));
651fa9e4066Sahrens 				bcopy(db->db_data, &vd->vdev_smo[c],
652fa9e4066Sahrens 				    db->db_size);
653fa9e4066Sahrens 				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
654fa9e4066Sahrens 				    ms_array[c]);
655fa9e4066Sahrens 				dmu_buf_rele(db);
656fa9e4066Sahrens 			}
657fa9e4066Sahrens 			kmem_free(ms_array, newc * sizeof (uint64_t));
658fa9e4066Sahrens 		}
659fa9e4066Sahrens 		vd->vdev_mg = metaslab_group_create(mc, vd);
660fa9e4066Sahrens 	}
661fa9e4066Sahrens 
662fa9e4066Sahrens 	for (c = 0; c < oldc; c++) {
663fa9e4066Sahrens 		vd->vdev_smo[c] = smo[c];
664fa9e4066Sahrens 		vd->vdev_ms[c] = mspp[c];
665fa9e4066Sahrens 		mspp[c]->ms_smo = &vd->vdev_smo[c];
666fa9e4066Sahrens 	}
667fa9e4066Sahrens 
668fa9e4066Sahrens 	for (c = oldc; c < newc; c++)
669fa9e4066Sahrens 		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
670fa9e4066Sahrens 		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
671fa9e4066Sahrens 
672fa9e4066Sahrens 	if (oldc != 0) {
673fa9e4066Sahrens 		kmem_free(smo, oldc * sizeof (*smo));
674fa9e4066Sahrens 		kmem_free(mspp, oldc * sizeof (*mspp));
675fa9e4066Sahrens 	}
676fa9e4066Sahrens 
677fa9e4066Sahrens }
678fa9e4066Sahrens 
679fa9e4066Sahrens void
680fa9e4066Sahrens vdev_metaslab_fini(vdev_t *vd)
681fa9e4066Sahrens {
682fa9e4066Sahrens 	uint64_t m;
683fa9e4066Sahrens 	uint64_t count = vd->vdev_ms_count;
684fa9e4066Sahrens 
685fa9e4066Sahrens 	if (vd->vdev_ms != NULL) {
686fa9e4066Sahrens 		for (m = 0; m < count; m++)
687fa9e4066Sahrens 			metaslab_fini(vd->vdev_ms[m]);
688fa9e4066Sahrens 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
689fa9e4066Sahrens 		vd->vdev_ms = NULL;
690fa9e4066Sahrens 	}
691fa9e4066Sahrens 
692fa9e4066Sahrens 	if (vd->vdev_smo != NULL) {
693fa9e4066Sahrens 		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
694fa9e4066Sahrens 		vd->vdev_smo = NULL;
695fa9e4066Sahrens 	}
696fa9e4066Sahrens }
697fa9e4066Sahrens 
698fa9e4066Sahrens /*
699fa9e4066Sahrens  * Prepare a virtual device for access.
700fa9e4066Sahrens  */
701fa9e4066Sahrens int
702fa9e4066Sahrens vdev_open(vdev_t *vd)
703fa9e4066Sahrens {
704fa9e4066Sahrens 	int error;
705fa9e4066Sahrens 	vdev_knob_t *vk;
706fa9e4066Sahrens 	int c;
707fa9e4066Sahrens 	uint64_t osize = 0;
708fa9e4066Sahrens 	uint64_t asize, psize;
709fa9e4066Sahrens 	uint64_t ashift = -1ULL;
710fa9e4066Sahrens 
711fa9e4066Sahrens 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
712fa9e4066Sahrens 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
713fa9e4066Sahrens 	    vd->vdev_state == VDEV_STATE_OFFLINE);
714fa9e4066Sahrens 
715fa9e4066Sahrens 	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
716fa9e4066Sahrens 		vd->vdev_fault_arg >>= 1;
717fa9e4066Sahrens 	else
718fa9e4066Sahrens 		vd->vdev_fault_mode = VDEV_FAULT_NONE;
719fa9e4066Sahrens 
720fa9e4066Sahrens 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
721fa9e4066Sahrens 
722fa9e4066Sahrens 	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
723fa9e4066Sahrens 		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
724fa9e4066Sahrens 
725fa9e4066Sahrens 		*valp = vk->vk_default;
726fa9e4066Sahrens 		*valp = MAX(*valp, vk->vk_min);
727fa9e4066Sahrens 		*valp = MIN(*valp, vk->vk_max);
728fa9e4066Sahrens 	}
729fa9e4066Sahrens 
730fa9e4066Sahrens 	if (vd->vdev_ops->vdev_op_leaf) {
731fa9e4066Sahrens 		vdev_cache_init(vd);
732fa9e4066Sahrens 		vdev_queue_init(vd);
733fa9e4066Sahrens 		vd->vdev_cache_active = B_TRUE;
734fa9e4066Sahrens 	}
735fa9e4066Sahrens 
736fa9e4066Sahrens 	if (vd->vdev_offline) {
737fa9e4066Sahrens 		ASSERT(vd->vdev_children == 0);
738fa9e4066Sahrens 		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
739fa9e4066Sahrens 		vd->vdev_state = VDEV_STATE_OFFLINE;
740fa9e4066Sahrens 		return (ENXIO);
741fa9e4066Sahrens 	}
742fa9e4066Sahrens 
743fa9e4066Sahrens 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
744fa9e4066Sahrens 
745fa9e4066Sahrens 	dprintf("%s = %d, osize %llu, state = %d\n",
746fa9e4066Sahrens 	    vdev_description(vd), error, osize, vd->vdev_state);
747fa9e4066Sahrens 
748fa9e4066Sahrens 	if (error) {
749fa9e4066Sahrens 		dprintf("%s in %s failed to open, error %d, aux %d\n",
750fa9e4066Sahrens 		    vdev_description(vd),
751fa9e4066Sahrens 		    vdev_description(vd->vdev_parent),
752fa9e4066Sahrens 		    error,
753fa9e4066Sahrens 		    vd->vdev_stat.vs_aux);
754fa9e4066Sahrens 
755fa9e4066Sahrens 		vd->vdev_state = VDEV_STATE_CANT_OPEN;
756fa9e4066Sahrens 		return (error);
757fa9e4066Sahrens 	}
758fa9e4066Sahrens 
759fa9e4066Sahrens 	vd->vdev_state = VDEV_STATE_HEALTHY;
760fa9e4066Sahrens 
761fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
762fa9e4066Sahrens 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
763fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_DEGRADED;
764fa9e4066Sahrens 
765fa9e4066Sahrens 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
766fa9e4066Sahrens 
767fa9e4066Sahrens 	if (vd->vdev_children == 0) {
768fa9e4066Sahrens 		if (osize < SPA_MINDEVSIZE) {
769fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
770fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
771fa9e4066Sahrens 			return (EOVERFLOW);
772fa9e4066Sahrens 		}
773fa9e4066Sahrens 		psize = osize;
774fa9e4066Sahrens 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
775fa9e4066Sahrens 	} else {
776fa9e4066Sahrens 		if (osize < SPA_MINDEVSIZE -
777fa9e4066Sahrens 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
778fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
779fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
780fa9e4066Sahrens 			return (EOVERFLOW);
781fa9e4066Sahrens 		}
782fa9e4066Sahrens 		psize = 0;
783fa9e4066Sahrens 		asize = osize;
784fa9e4066Sahrens 	}
785fa9e4066Sahrens 
786fa9e4066Sahrens 	vd->vdev_psize = psize;
787fa9e4066Sahrens 
788fa9e4066Sahrens 	if (vd->vdev_asize == 0) {
789fa9e4066Sahrens 		/*
790fa9e4066Sahrens 		 * This is the first-ever open, so use the computed values.
791fa9e4066Sahrens 		 */
792fa9e4066Sahrens 		vd->vdev_asize = asize;
793fa9e4066Sahrens 		vd->vdev_ashift = ashift;
794fa9e4066Sahrens 	} else {
795fa9e4066Sahrens 		/*
796fa9e4066Sahrens 		 * Make sure the alignment requirement hasn't increased.
797fa9e4066Sahrens 		 */
798fa9e4066Sahrens 		if (ashift > vd->vdev_ashift) {
799fa9e4066Sahrens 			dprintf("%s: ashift grew\n", vdev_description(vd));
800fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
801fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
802fa9e4066Sahrens 			return (EINVAL);
803fa9e4066Sahrens 		}
804fa9e4066Sahrens 
805fa9e4066Sahrens 		/*
806fa9e4066Sahrens 		 * Make sure the device hasn't shrunk.
807fa9e4066Sahrens 		 */
808fa9e4066Sahrens 		if (asize < vd->vdev_asize) {
809fa9e4066Sahrens 			dprintf("%s: device shrank\n", vdev_description(vd));
810fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
811fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
812fa9e4066Sahrens 			return (EINVAL);
813fa9e4066Sahrens 		}
814fa9e4066Sahrens 
815fa9e4066Sahrens 		/*
816fa9e4066Sahrens 		 * If all children are healthy and the asize has increased,
817fa9e4066Sahrens 		 * then we've experienced dynamic LUN growth.
818fa9e4066Sahrens 		 */
819fa9e4066Sahrens 		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
820fa9e4066Sahrens 		    asize > vd->vdev_asize) {
821fa9e4066Sahrens 			dprintf("%s: device grew\n", vdev_description(vd));
822fa9e4066Sahrens 			vd->vdev_asize = asize;
823fa9e4066Sahrens 		}
824fa9e4066Sahrens 	}
825fa9e4066Sahrens 
826fa9e4066Sahrens 	return (0);
827fa9e4066Sahrens }
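
/*
 * A note on the size math above: for a leaf, the usable asize is the device
 * size minus room for the vdev labels kept at both the front
 * (VDEV_LABEL_START_SIZE) and the back (VDEV_LABEL_END_SIZE) of the device.
 * Interior vdevs keep no labels of their own, so their asize is simply the
 * aligned size their children report.
 */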
828fa9e4066Sahrens 
829fa9e4066Sahrens /*
830fa9e4066Sahrens  * Close a virtual device.
831fa9e4066Sahrens  */
832fa9e4066Sahrens void
833fa9e4066Sahrens vdev_close(vdev_t *vd)
834fa9e4066Sahrens {
835fa9e4066Sahrens 	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
836fa9e4066Sahrens 
837fa9e4066Sahrens 	vd->vdev_ops->vdev_op_close(vd);
838fa9e4066Sahrens 
839fa9e4066Sahrens 	if (vd->vdev_cache_active) {
840fa9e4066Sahrens 		vdev_cache_fini(vd);
841fa9e4066Sahrens 		vdev_queue_fini(vd);
842fa9e4066Sahrens 		vd->vdev_cache_active = B_FALSE;
843fa9e4066Sahrens 	}
844fa9e4066Sahrens 
845fa9e4066Sahrens 	if (vd->vdev_offline)
846fa9e4066Sahrens 		vd->vdev_state = VDEV_STATE_OFFLINE;
847fa9e4066Sahrens 	else
848fa9e4066Sahrens 		vd->vdev_state = VDEV_STATE_CLOSED;
849fa9e4066Sahrens }
850fa9e4066Sahrens 
851fa9e4066Sahrens void
852fa9e4066Sahrens vdev_reopen(vdev_t *vd, zio_t **rq)
853fa9e4066Sahrens {
854fa9e4066Sahrens 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
855fa9e4066Sahrens 	int c;
856fa9e4066Sahrens 
857fa9e4066Sahrens 	if (vd == rvd) {
858fa9e4066Sahrens 		ASSERT(rq == NULL);
859fa9e4066Sahrens 		for (c = 0; c < rvd->vdev_children; c++)
860fa9e4066Sahrens 			vdev_reopen(rvd->vdev_child[c], NULL);
861fa9e4066Sahrens 		return;
862fa9e4066Sahrens 	}
863fa9e4066Sahrens 
864fa9e4066Sahrens 	/* only valid for top-level vdevs */
865fa9e4066Sahrens 	ASSERT3P(vd, ==, vd->vdev_top);
866fa9e4066Sahrens 
867fa9e4066Sahrens 	/*
868fa9e4066Sahrens 	 * vdev_state can change when spa_config_lock is held as writer,
869fa9e4066Sahrens 	 * or when it's held as reader and we're doing a vdev_reopen().
870fa9e4066Sahrens 	 * To handle the latter case, we grab rvd's io_lock to serialize
871fa9e4066Sahrens 	 * reopens.  This ensures that there's never more than one vdev
872fa9e4066Sahrens 	 * state changer active at a time.
873fa9e4066Sahrens 	 */
874fa9e4066Sahrens 	mutex_enter(&rvd->vdev_io_lock);
875fa9e4066Sahrens 
876fa9e4066Sahrens 	mutex_enter(&vd->vdev_io_lock);
877fa9e4066Sahrens 	while (list_head(&vd->vdev_io_pending) != NULL)
878fa9e4066Sahrens 		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
879fa9e4066Sahrens 	vdev_close(vd);
880fa9e4066Sahrens 	(void) vdev_open(vd);
881fa9e4066Sahrens 	if (rq != NULL) {
882fa9e4066Sahrens 		*rq = vd->vdev_io_retry;
883fa9e4066Sahrens 		vd->vdev_io_retry = NULL;
884fa9e4066Sahrens 	}
885fa9e4066Sahrens 	mutex_exit(&vd->vdev_io_lock);
886fa9e4066Sahrens 
887fa9e4066Sahrens 	/*
888fa9e4066Sahrens 	 * Reassess root vdev's health.
889fa9e4066Sahrens 	 */
890fa9e4066Sahrens 	rvd->vdev_state = VDEV_STATE_HEALTHY;
891fa9e4066Sahrens 	for (c = 0; c < rvd->vdev_children; c++) {
892fa9e4066Sahrens 		uint64_t state = rvd->vdev_child[c]->vdev_state;
893fa9e4066Sahrens 		rvd->vdev_state = MIN(rvd->vdev_state, state);
894fa9e4066Sahrens 	}
895fa9e4066Sahrens 
896fa9e4066Sahrens 	mutex_exit(&rvd->vdev_io_lock);
897fa9e4066Sahrens }
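
/*
 * The 'rq' out-parameter above returns whatever I/Os were parked on the
 * vdev's retry list at the moment of the reopen, presumably so the caller
 * can reissue them against the freshly opened device; passing NULL simply
 * leaves the retry list untouched.
 */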
898fa9e4066Sahrens 
899fa9e4066Sahrens int
900fa9e4066Sahrens vdev_create(vdev_t *vd, uint64_t txg)
901fa9e4066Sahrens {
902fa9e4066Sahrens 	int error;
903fa9e4066Sahrens 
904fa9e4066Sahrens 	/*
905fa9e4066Sahrens 	 * Normally, partial opens (e.g. of a mirror) are allowed.
906fa9e4066Sahrens 	 * For a create, however, we want to fail the request if
907fa9e4066Sahrens 	 * there are any components we can't open.
908fa9e4066Sahrens 	 */
909fa9e4066Sahrens 	error = vdev_open(vd);
910fa9e4066Sahrens 
911fa9e4066Sahrens 	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
912fa9e4066Sahrens 		vdev_close(vd);
913fa9e4066Sahrens 		return (error ? error : ENXIO);
914fa9e4066Sahrens 	}
915fa9e4066Sahrens 
916fa9e4066Sahrens 	/*
917fa9e4066Sahrens 	 * Recursively initialize all labels.
918fa9e4066Sahrens 	 */
919fa9e4066Sahrens 	if ((error = vdev_label_init(vd, txg)) != 0) {
920fa9e4066Sahrens 		vdev_close(vd);
921fa9e4066Sahrens 		return (error);
922fa9e4066Sahrens 	}
923fa9e4066Sahrens 
924fa9e4066Sahrens 	return (0);
925fa9e4066Sahrens }
926fa9e4066Sahrens 
927fa9e4066Sahrens /*
928fa9e4066Sahrens  * This is the latter half of vdev_create().  It is distinct because it
929fa9e4066Sahrens  * involves initiating transactions in order to do metaslab creation.
930fa9e4066Sahrens  * For creation, we want to try to create all vdevs at once and then undo it
931fa9e4066Sahrens  * if anything fails; this is much harder if we have pending transactions.
932fa9e4066Sahrens  */
933fa9e4066Sahrens void
934fa9e4066Sahrens vdev_init(vdev_t *vd, uint64_t txg)
935fa9e4066Sahrens {
936fa9e4066Sahrens 	/*
937fa9e4066Sahrens 	 * Aim for roughly 200 metaslabs per vdev.
938fa9e4066Sahrens 	 */
939fa9e4066Sahrens 	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
940fa9e4066Sahrens 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
941fa9e4066Sahrens 
942fa9e4066Sahrens 	/*
943fa9e4066Sahrens 	 * Initialize the vdev's metaslabs.
944fa9e4066Sahrens 	 */
945fa9e4066Sahrens 	vdev_metaslab_init(vd, txg);
946fa9e4066Sahrens }
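
/*
 * Rough arithmetic for the shift chosen above, assuming highbit() returns
 * the 1-based index of the highest set bit: a 100GB top-level vdev gives
 * asize / 200 =~ 512MB, so vdev_ms_shift becomes 30 and each metaslab spans
 * 1GB, i.e. on the order of 100-200 metaslabs per vdev.  The MAX() with
 * SPA_MAXBLOCKSHIFT keeps a metaslab from ever being smaller than the
 * largest block size.
 */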
947fa9e4066Sahrens 
948fa9e4066Sahrens void
949fa9e4066Sahrens vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
950fa9e4066Sahrens {
951fa9e4066Sahrens 	vdev_t *tvd = vd->vdev_top;
952fa9e4066Sahrens 
953fa9e4066Sahrens 	mutex_enter(&tvd->vdev_dirty_lock);
954fa9e4066Sahrens 	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
955fa9e4066Sahrens 		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
956fa9e4066Sahrens 		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
957fa9e4066Sahrens 		    tvd, txg);
958fa9e4066Sahrens 	}
959fa9e4066Sahrens 	mutex_exit(&tvd->vdev_dirty_lock);
960fa9e4066Sahrens }
961fa9e4066Sahrens 
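/*
 * DTL (dirty time log) space maps track time, not space: each segment covers
 * a range of transaction groups during which this vdev may have missed
 * writes, so the 'txg' and 'size' arguments below are a starting txg and a
 * count of txgs rather than a device offset and length.
 */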
962fa9e4066Sahrens void
963fa9e4066Sahrens vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
964fa9e4066Sahrens {
965fa9e4066Sahrens 	mutex_enter(sm->sm_lock);
966fa9e4066Sahrens 	if (!space_map_contains(sm, txg, size))
967fa9e4066Sahrens 		space_map_add(sm, txg, size);
968fa9e4066Sahrens 	mutex_exit(sm->sm_lock);
969fa9e4066Sahrens }
970fa9e4066Sahrens 
971fa9e4066Sahrens int
972fa9e4066Sahrens vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
973fa9e4066Sahrens {
974fa9e4066Sahrens 	int dirty;
975fa9e4066Sahrens 
976fa9e4066Sahrens 	/*
977fa9e4066Sahrens 	 * Quick test without the lock -- covers the common case that
978fa9e4066Sahrens 	 * there are no dirty time segments.
979fa9e4066Sahrens 	 */
980fa9e4066Sahrens 	if (sm->sm_space == 0)
981fa9e4066Sahrens 		return (0);
982fa9e4066Sahrens 
983fa9e4066Sahrens 	mutex_enter(sm->sm_lock);
984fa9e4066Sahrens 	dirty = space_map_contains(sm, txg, size);
985fa9e4066Sahrens 	mutex_exit(sm->sm_lock);
986fa9e4066Sahrens 
987fa9e4066Sahrens 	return (dirty);
988fa9e4066Sahrens }
989fa9e4066Sahrens 
990fa9e4066Sahrens /*
991fa9e4066Sahrens  * Reassess DTLs after a config change or scrub completion.
992fa9e4066Sahrens  */
993fa9e4066Sahrens void
994fa9e4066Sahrens vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
995fa9e4066Sahrens {
996fa9e4066Sahrens 	int c;
997fa9e4066Sahrens 
998fa9e4066Sahrens 	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
999fa9e4066Sahrens 
1000fa9e4066Sahrens 	if (vd->vdev_children == 0) {
1001fa9e4066Sahrens 		mutex_enter(&vd->vdev_dtl_lock);
1002fa9e4066Sahrens 		/*
1003fa9e4066Sahrens 		 * We've successfully scrubbed everything up to scrub_txg.
1004fa9e4066Sahrens 		 * Therefore, excise all old DTLs up to that point, then
1005fa9e4066Sahrens 		 * fold in the DTLs for everything we couldn't scrub.
1006fa9e4066Sahrens 		 */
1007fa9e4066Sahrens 		if (scrub_txg != 0) {
1008fa9e4066Sahrens 			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
1009fa9e4066Sahrens 			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
1010fa9e4066Sahrens 		}
1011fa9e4066Sahrens 		if (scrub_done)
1012fa9e4066Sahrens 			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
1013fa9e4066Sahrens 		mutex_exit(&vd->vdev_dtl_lock);
1014fa9e4066Sahrens 		if (txg != 0) {
1015fa9e4066Sahrens 			vdev_t *tvd = vd->vdev_top;
1016fa9e4066Sahrens 			vdev_dirty(tvd, VDD_DTL, txg);
1017fa9e4066Sahrens 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1018fa9e4066Sahrens 		}
1019fa9e4066Sahrens 		return;
1020fa9e4066Sahrens 	}
1021fa9e4066Sahrens 
1022fa9e4066Sahrens 	mutex_enter(&vd->vdev_dtl_lock);
1023fa9e4066Sahrens 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
1024fa9e4066Sahrens 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
1025fa9e4066Sahrens 	mutex_exit(&vd->vdev_dtl_lock);
1026fa9e4066Sahrens 
1027fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++) {
1028fa9e4066Sahrens 		vdev_t *cvd = vd->vdev_child[c];
1029fa9e4066Sahrens 		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
1030fa9e4066Sahrens 		mutex_enter(&vd->vdev_dtl_lock);
1031fa9e4066Sahrens 		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
1032fa9e4066Sahrens 		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
1033fa9e4066Sahrens 		mutex_exit(&vd->vdev_dtl_lock);
1034fa9e4066Sahrens 	}
1035fa9e4066Sahrens }
1036fa9e4066Sahrens 
1037fa9e4066Sahrens static int
1038fa9e4066Sahrens vdev_dtl_load(vdev_t *vd)
1039fa9e4066Sahrens {
1040fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1041fa9e4066Sahrens 	space_map_obj_t *smo = &vd->vdev_dtl;
1042fa9e4066Sahrens 	dmu_buf_t *db;
1043fa9e4066Sahrens 	int error;
1044fa9e4066Sahrens 
1045fa9e4066Sahrens 	ASSERT(vd->vdev_children == 0);
1046fa9e4066Sahrens 
1047fa9e4066Sahrens 	if (smo->smo_object == 0)
1048fa9e4066Sahrens 		return (0);
1049fa9e4066Sahrens 
1050fa9e4066Sahrens 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1051fa9e4066Sahrens 	dmu_buf_read(db);
1052fa9e4066Sahrens 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1053fa9e4066Sahrens 	bcopy(db->db_data, smo, db->db_size);
1054fa9e4066Sahrens 	dmu_buf_rele(db);
1055fa9e4066Sahrens 
1056fa9e4066Sahrens 	mutex_enter(&vd->vdev_dtl_lock);
1057fa9e4066Sahrens 	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
1058fa9e4066Sahrens 	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
1059fa9e4066Sahrens 	mutex_exit(&vd->vdev_dtl_lock);
1060fa9e4066Sahrens 
1061fa9e4066Sahrens 	return (error);
1062fa9e4066Sahrens }
1063fa9e4066Sahrens 
1064fa9e4066Sahrens void
1065fa9e4066Sahrens vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1066fa9e4066Sahrens {
1067fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1068fa9e4066Sahrens 	space_map_obj_t *smo = &vd->vdev_dtl;
1069fa9e4066Sahrens 	space_map_t *sm = &vd->vdev_dtl_map;
1070fa9e4066Sahrens 	space_map_t smsync;
1071fa9e4066Sahrens 	kmutex_t smlock;
1072fa9e4066Sahrens 	avl_tree_t *t = &sm->sm_root;
1073fa9e4066Sahrens 	space_seg_t *ss;
1074fa9e4066Sahrens 	dmu_buf_t *db;
1075fa9e4066Sahrens 	dmu_tx_t *tx;
1076fa9e4066Sahrens 
1077fa9e4066Sahrens 	dprintf("%s in txg %llu pass %d\n",
1078fa9e4066Sahrens 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1079fa9e4066Sahrens 
1080fa9e4066Sahrens 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1081fa9e4066Sahrens 
1082fa9e4066Sahrens 	if (vd->vdev_detached) {
1083fa9e4066Sahrens 		if (smo->smo_object != 0) {
1084fa9e4066Sahrens 			int err = dmu_object_free(spa->spa_meta_objset,
1085fa9e4066Sahrens 			    smo->smo_object, tx);
1086fa9e4066Sahrens 			ASSERT3U(err, ==, 0);
1087fa9e4066Sahrens 			smo->smo_object = 0;
1088fa9e4066Sahrens 		}
1089fa9e4066Sahrens 		dmu_tx_commit(tx);
1090fa9e4066Sahrens 		return;
1091fa9e4066Sahrens 	}
1092fa9e4066Sahrens 
1093fa9e4066Sahrens 	if (smo->smo_object == 0) {
1094fa9e4066Sahrens 		ASSERT(smo->smo_objsize == 0);
1095fa9e4066Sahrens 		ASSERT(smo->smo_alloc == 0);
1096fa9e4066Sahrens 		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
1097fa9e4066Sahrens 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1098fa9e4066Sahrens 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1099fa9e4066Sahrens 		ASSERT(smo->smo_object != 0);
1100fa9e4066Sahrens 		vdev_config_dirty(vd->vdev_top);
1101fa9e4066Sahrens 	}
1102fa9e4066Sahrens 
1103fa9e4066Sahrens 	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
1104fa9e4066Sahrens 	    0, smo->smo_objsize, tx);
1105fa9e4066Sahrens 
1106fa9e4066Sahrens 	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1107fa9e4066Sahrens 
1108fa9e4066Sahrens 	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1109fa9e4066Sahrens 	    &smlock);
1110fa9e4066Sahrens 
1111fa9e4066Sahrens 	mutex_enter(&smlock);
1112fa9e4066Sahrens 
1113fa9e4066Sahrens 	mutex_enter(&vd->vdev_dtl_lock);
1114fa9e4066Sahrens 	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
1115fa9e4066Sahrens 		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
1116fa9e4066Sahrens 	mutex_exit(&vd->vdev_dtl_lock);
1117fa9e4066Sahrens 
1118fa9e4066Sahrens 	smo->smo_objsize = 0;
1119fa9e4066Sahrens 	smo->smo_alloc = smsync.sm_space;
1120fa9e4066Sahrens 
1121fa9e4066Sahrens 	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
1122fa9e4066Sahrens 	space_map_destroy(&smsync);
1123fa9e4066Sahrens 
1124fa9e4066Sahrens 	mutex_exit(&smlock);
1125fa9e4066Sahrens 	mutex_destroy(&smlock);
1126fa9e4066Sahrens 
1127fa9e4066Sahrens 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1128fa9e4066Sahrens 	dmu_buf_will_dirty(db, tx);
1129fa9e4066Sahrens 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1130fa9e4066Sahrens 	bcopy(smo, db->db_data, db->db_size);
1131fa9e4066Sahrens 	dmu_buf_rele(db);
1132fa9e4066Sahrens 
1133fa9e4066Sahrens 	dmu_tx_commit(tx);
1134fa9e4066Sahrens }
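
/*
 * The sequence above is the usual space-map sync dance: free the object's
 * old contents, copy the in-core DTL into a private map under its own lock
 * (presumably so vdev_dtl_lock need not be held across the DMU calls), write
 * the copy out with space_map_sync(), and finally refresh the bonus buffer
 * that caches the space_map_obj_t.
 */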
1135fa9e4066Sahrens 
1136fa9e4066Sahrens int
1137fa9e4066Sahrens vdev_load(vdev_t *vd, int import)
1138fa9e4066Sahrens {
1139fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1140fa9e4066Sahrens 	int c, error;
1141fa9e4066Sahrens 	nvlist_t *label;
1142fa9e4066Sahrens 	uint64_t guid, state;
1143fa9e4066Sahrens 
1144fa9e4066Sahrens 	dprintf("loading %s\n", vdev_description(vd));
1145fa9e4066Sahrens 
1146fa9e4066Sahrens 	/*
1147fa9e4066Sahrens 	 * Recursively load all children.
1148fa9e4066Sahrens 	 */
1149fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
1150fa9e4066Sahrens 		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
1151fa9e4066Sahrens 			return (error);
1152fa9e4066Sahrens 
1153fa9e4066Sahrens 	/*
1154fa9e4066Sahrens 	 * If this is a leaf vdev, make sure it agrees with its disk labels.
1155fa9e4066Sahrens 	 */
1156fa9e4066Sahrens 	if (vd->vdev_ops->vdev_op_leaf) {
1157fa9e4066Sahrens 
1158fa9e4066Sahrens 		if (vdev_is_dead(vd))
1159fa9e4066Sahrens 			return (0);
1160fa9e4066Sahrens 
1161fa9e4066Sahrens 		/*
1162fa9e4066Sahrens 		 * XXX state transitions don't propagate to parent here.
1163fa9e4066Sahrens 		 * Also, merely setting the state isn't sufficient because
1164fa9e4066Sahrens 		 * it's not persistent; a vdev_reopen() would make us
1165fa9e4066Sahrens 		 * forget all about it.
1166fa9e4066Sahrens 		 */
1167fa9e4066Sahrens 		if ((label = vdev_label_read_config(vd)) == NULL) {
1168fa9e4066Sahrens 			dprintf("can't load label config\n");
1169fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1170fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1171fa9e4066Sahrens 			return (0);
1172fa9e4066Sahrens 		}
1173fa9e4066Sahrens 
1174fa9e4066Sahrens 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
1175fa9e4066Sahrens 		    &guid) != 0 || guid != spa_guid(spa)) {
1176fa9e4066Sahrens 			dprintf("bad or missing pool GUID (%llu)\n", guid);
1177fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1178fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1179fa9e4066Sahrens 			nvlist_free(label);
1180fa9e4066Sahrens 			return (0);
1181fa9e4066Sahrens 		}
1182fa9e4066Sahrens 
1183fa9e4066Sahrens 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
1184fa9e4066Sahrens 		    guid != vd->vdev_guid) {
1185fa9e4066Sahrens 			dprintf("bad or missing vdev guid (%llu != %llu)\n",
1186fa9e4066Sahrens 			    guid, vd->vdev_guid);
1187fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1188fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1189fa9e4066Sahrens 			nvlist_free(label);
1190fa9e4066Sahrens 			return (0);
1191fa9e4066Sahrens 		}
1192fa9e4066Sahrens 
1193fa9e4066Sahrens 		/*
1194fa9e4066Sahrens 		 * If we find a vdev with a matching pool guid and vdev guid,
1195fa9e4066Sahrens 		 * but the pool state is not active, it indicates that the user
1196fa9e4066Sahrens 		 * exported or destroyed the pool without affecting the config
1197fa9e4066Sahrens 		 * cache (if / was mounted readonly, for example).  In this
1198fa9e4066Sahrens 		 * case, immediately return EBADF so the caller can remove it
1199fa9e4066Sahrens 		 * from the config.
1200fa9e4066Sahrens 		 */
1201fa9e4066Sahrens 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1202fa9e4066Sahrens 		    &state)) {
1203fa9e4066Sahrens 			dprintf("missing pool state\n");
1204fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1205fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1206fa9e4066Sahrens 			nvlist_free(label);
1207fa9e4066Sahrens 			return (0);
1208fa9e4066Sahrens 		}
1209fa9e4066Sahrens 
1210fa9e4066Sahrens 		if (state != POOL_STATE_ACTIVE &&
1211fa9e4066Sahrens 		    (!import || state != POOL_STATE_EXPORTED)) {
1212fa9e4066Sahrens 			dprintf("pool state not active (%llu)\n", state);
1213fa9e4066Sahrens 			nvlist_free(label);
1214fa9e4066Sahrens 			return (EBADF);
1215fa9e4066Sahrens 		}
1216fa9e4066Sahrens 
1217fa9e4066Sahrens 		nvlist_free(label);
1218fa9e4066Sahrens 	}
1219fa9e4066Sahrens 
1220fa9e4066Sahrens 	/*
1221fa9e4066Sahrens 	 * If this is a top-level vdev, make sure its allocation parameters
1222fa9e4066Sahrens 	 * exist and initialize its metaslabs.
1223fa9e4066Sahrens 	 */
1224fa9e4066Sahrens 	if (vd == vd->vdev_top) {
1225fa9e4066Sahrens 
1226fa9e4066Sahrens 		if (vd->vdev_ms_array == 0 ||
1227fa9e4066Sahrens 		    vd->vdev_ms_shift == 0 ||
1228fa9e4066Sahrens 		    vd->vdev_ashift == 0 ||
1229fa9e4066Sahrens 		    vd->vdev_asize == 0) {
1230fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1231fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1232fa9e4066Sahrens 			return (0);
1233fa9e4066Sahrens 		}
1234fa9e4066Sahrens 
1235fa9e4066Sahrens 		vdev_metaslab_init(vd, 0);
1236fa9e4066Sahrens 	}
1237fa9e4066Sahrens 
1238fa9e4066Sahrens 	/*
1239fa9e4066Sahrens 	 * If this is a leaf vdev, load its DTL.
1240fa9e4066Sahrens 	 */
1241fa9e4066Sahrens 	if (vd->vdev_ops->vdev_op_leaf) {
1242fa9e4066Sahrens 		error = vdev_dtl_load(vd);
1243fa9e4066Sahrens 		if (error) {
1244fa9e4066Sahrens 			dprintf("can't load DTL for %s, error %d\n",
1245fa9e4066Sahrens 			    vdev_description(vd), error);
1246fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1247fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1248fa9e4066Sahrens 			return (0);
1249fa9e4066Sahrens 		}
1250fa9e4066Sahrens 	}
1251fa9e4066Sahrens 
1252fa9e4066Sahrens 	return (0);
1253fa9e4066Sahrens }
1254fa9e4066Sahrens 
1255fa9e4066Sahrens void
1256fa9e4066Sahrens vdev_sync_done(vdev_t *vd, uint64_t txg)
1257fa9e4066Sahrens {
1258fa9e4066Sahrens 	metaslab_t *msp;
1259fa9e4066Sahrens 
1260fa9e4066Sahrens 	dprintf("%s txg %llu\n", vdev_description(vd), txg);
1261fa9e4066Sahrens 
1262fa9e4066Sahrens 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
1263fa9e4066Sahrens 		metaslab_sync_done(msp, txg);
1264fa9e4066Sahrens }
1265fa9e4066Sahrens 
1266fa9e4066Sahrens void
1267fa9e4066Sahrens vdev_add_sync(vdev_t *vd, uint64_t txg)
1268fa9e4066Sahrens {
1269fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1270fa9e4066Sahrens 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1271fa9e4066Sahrens 
1272fa9e4066Sahrens 	ASSERT(vd == vd->vdev_top);
1273fa9e4066Sahrens 
1274fa9e4066Sahrens 	if (vd->vdev_ms_array == 0)
1275fa9e4066Sahrens 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
1276fa9e4066Sahrens 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
1277fa9e4066Sahrens 
1278fa9e4066Sahrens 	ASSERT(vd->vdev_ms_array != 0);
1279fa9e4066Sahrens 
1280fa9e4066Sahrens 	vdev_config_dirty(vd);
1281fa9e4066Sahrens 
1282fa9e4066Sahrens 	dmu_tx_commit(tx);
1283fa9e4066Sahrens }
1284fa9e4066Sahrens 
1285fa9e4066Sahrens void
1286fa9e4066Sahrens vdev_sync(vdev_t *vd, uint64_t txg)
1287fa9e4066Sahrens {
1288fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1289fa9e4066Sahrens 	vdev_t *lvd;
1290fa9e4066Sahrens 	metaslab_t *msp;
1291fa9e4066Sahrens 	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
1292fa9e4066Sahrens 	uint8_t dirty = *dirtyp;
1293fa9e4066Sahrens 
1294fa9e4066Sahrens 	mutex_enter(&vd->vdev_dirty_lock);
1295fa9e4066Sahrens 	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
1296fa9e4066Sahrens 	mutex_exit(&vd->vdev_dirty_lock);
1297fa9e4066Sahrens 
1298fa9e4066Sahrens 	dprintf("%s txg %llu pass %d\n",
1299fa9e4066Sahrens 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1300fa9e4066Sahrens 
1301fa9e4066Sahrens 	if (dirty & VDD_ADD)
1302fa9e4066Sahrens 		vdev_add_sync(vd, txg);
1303fa9e4066Sahrens 
1304fa9e4066Sahrens 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
1305fa9e4066Sahrens 		metaslab_sync(msp, txg);
1306fa9e4066Sahrens 
1307fa9e4066Sahrens 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
1308fa9e4066Sahrens 		vdev_dtl_sync(lvd, txg);
1309fa9e4066Sahrens 
1310fa9e4066Sahrens 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
1311fa9e4066Sahrens }
1312fa9e4066Sahrens 
1313fa9e4066Sahrens uint64_t
1314fa9e4066Sahrens vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
1315fa9e4066Sahrens {
1316fa9e4066Sahrens 	return (vd->vdev_ops->vdev_op_asize(vd, psize));
1317fa9e4066Sahrens }
1318fa9e4066Sahrens 
1319fa9e4066Sahrens void
1320fa9e4066Sahrens vdev_io_start(zio_t *zio)
1321fa9e4066Sahrens {
1322fa9e4066Sahrens 	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
1323fa9e4066Sahrens }
1324fa9e4066Sahrens 
1325fa9e4066Sahrens void
1326fa9e4066Sahrens vdev_io_done(zio_t *zio)
1327fa9e4066Sahrens {
1328fa9e4066Sahrens 	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
1329fa9e4066Sahrens }
1330fa9e4066Sahrens 
1331fa9e4066Sahrens const char *
1332fa9e4066Sahrens vdev_description(vdev_t *vd)
1333fa9e4066Sahrens {
1334fa9e4066Sahrens 	if (vd == NULL || vd->vdev_ops == NULL)
1335fa9e4066Sahrens 		return ("<unknown>");
1336fa9e4066Sahrens 
1337fa9e4066Sahrens 	if (vd->vdev_path != NULL)
1338fa9e4066Sahrens 		return (vd->vdev_path);
1339fa9e4066Sahrens 
1340fa9e4066Sahrens 	if (vd->vdev_parent == NULL)
1341fa9e4066Sahrens 		return (spa_name(vd->vdev_spa));
1342fa9e4066Sahrens 
1343fa9e4066Sahrens 	return (vd->vdev_ops->vdev_op_type);
1344fa9e4066Sahrens }
1345fa9e4066Sahrens 
1346fa9e4066Sahrens int
1347fa9e4066Sahrens vdev_online(spa_t *spa, const char *path)
1348fa9e4066Sahrens {
1349*441d80aaSlling 	vdev_t *rvd, *vd;
1350*441d80aaSlling 	uint64_t txg;
1351fa9e4066Sahrens 
1352*441d80aaSlling 	txg = spa_vdev_enter(spa);
1353fa9e4066Sahrens 
1354*441d80aaSlling 	rvd = spa->spa_root_vdev;
1355*441d80aaSlling 	if ((vd = vdev_lookup_by_path(rvd, path)) == NULL)
1356*441d80aaSlling 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1357fa9e4066Sahrens 
1358fa9e4066Sahrens 	dprintf("ONLINE: %s\n", vdev_description(vd));
1359fa9e4066Sahrens 
1360fa9e4066Sahrens 	vd->vdev_offline = B_FALSE;
1361*441d80aaSlling 	vd->vdev_tmpoffline = B_FALSE;
1362fa9e4066Sahrens 
1363fa9e4066Sahrens 	/*
1364fa9e4066Sahrens 	 * Clear the error counts.  The idea is that you expect to see all
1365fa9e4066Sahrens 	 * zeroes when everything is working, so if you've just onlined a
1366fa9e4066Sahrens 	 * device, you don't want to keep hearing about errors from before.
1367fa9e4066Sahrens 	 */
1368fa9e4066Sahrens 	vd->vdev_stat.vs_read_errors = 0;
1369fa9e4066Sahrens 	vd->vdev_stat.vs_write_errors = 0;
1370fa9e4066Sahrens 	vd->vdev_stat.vs_checksum_errors = 0;
1371fa9e4066Sahrens 
1372fa9e4066Sahrens 	vdev_reopen(vd->vdev_top, NULL);
1373fa9e4066Sahrens 
1374*441d80aaSlling 	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
1375*441d80aaSlling 
1376*441d80aaSlling 	vdev_config_dirty(vd->vdev_top);
1377*441d80aaSlling 
1378*441d80aaSlling 	(void) spa_vdev_exit(spa, NULL, txg, 0);
1379fa9e4066Sahrens 
1380fa9e4066Sahrens 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1381fa9e4066Sahrens 
1382fa9e4066Sahrens 	return (0);
1383fa9e4066Sahrens }
1384fa9e4066Sahrens 
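/*
 * Illustrative usage (not part of the original source): vdev_online() is
 * driven by the device path recorded in the pool config, e.g. from an
 * ioctl handler:
 *
 *	error = vdev_online(spa, "/dev/dsk/c1t0d0s0");
 *
 * The path above is only a placeholder; any path that
 * vdev_lookup_by_path() can resolve under the root vdev works.  On success
 * a resilver is kicked off via spa_scrub(POOL_SCRUB_RESILVER) as coded above.
 */
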
1385fa9e4066Sahrens int
1386*441d80aaSlling vdev_offline(spa_t *spa, const char *path, int istmp)
1387fa9e4066Sahrens {
1388*441d80aaSlling 	vdev_t *rvd, *vd;
1389*441d80aaSlling 	uint64_t txg;
1390fa9e4066Sahrens 
1391*441d80aaSlling 	txg = spa_vdev_enter(spa);
1392fa9e4066Sahrens 
1393*441d80aaSlling 	rvd = spa->spa_root_vdev;
1394*441d80aaSlling 	if ((vd = vdev_lookup_by_path(rvd, path)) == NULL)
1395*441d80aaSlling 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1396fa9e4066Sahrens 
1397fa9e4066Sahrens 	dprintf("OFFLINE: %s\n", vdev_description(vd));
1398fa9e4066Sahrens 
1399*441d80aaSlling 	/* vdev is already offlined, do nothing */
1400*441d80aaSlling 	if (vd->vdev_offline)
1401*441d80aaSlling 		return (spa_vdev_exit(spa, NULL, txg, 0));
1402*441d80aaSlling 
1403fa9e4066Sahrens 	/*
1404fa9e4066Sahrens 	 * If this device's top-level vdev has a non-empty DTL,
1405fa9e4066Sahrens 	 * don't allow the device to be offlined.
1406fa9e4066Sahrens 	 *
1407fa9e4066Sahrens 	 * XXX -- we should make this more precise by allowing the offline
1408fa9e4066Sahrens 	 * as long as the remaining devices don't have any DTL holes.
1409fa9e4066Sahrens 	 */
1410*441d80aaSlling 	if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
1411*441d80aaSlling 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1412fa9e4066Sahrens 
1413fa9e4066Sahrens 	/*
1414fa9e4066Sahrens 	 * Set this device to offline state and reopen its top-level vdev.
1415fa9e4066Sahrens 	 * If this action results in the top-level vdev becoming unusable,
1416fa9e4066Sahrens 	 * undo it and fail the request.
1417fa9e4066Sahrens 	 */
1418fa9e4066Sahrens 	vd->vdev_offline = B_TRUE;
1419fa9e4066Sahrens 	vdev_reopen(vd->vdev_top, NULL);
1420fa9e4066Sahrens 	if (vdev_is_dead(vd->vdev_top)) {
1421fa9e4066Sahrens 		vd->vdev_offline = B_FALSE;
1422fa9e4066Sahrens 		vdev_reopen(vd->vdev_top, NULL);
1423*441d80aaSlling 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1424fa9e4066Sahrens 	}
1425fa9e4066Sahrens 
1426*441d80aaSlling 	vd->vdev_tmpoffline = istmp;
1427*441d80aaSlling 	if (istmp)
1428*441d80aaSlling 		return (spa_vdev_exit(spa, NULL, txg, 0));
1429fa9e4066Sahrens 
1430*441d80aaSlling 	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
1431*441d80aaSlling 
1432*441d80aaSlling 	vdev_config_dirty(vd->vdev_top);
1433*441d80aaSlling 
1434*441d80aaSlling 	return (spa_vdev_exit(spa, NULL, txg, 0));
1435fa9e4066Sahrens }
1436fa9e4066Sahrens 
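/*
 * Illustrative note (not part of the original source): the istmp flag makes
 * the offline "temporary" -- vdev_offline() returns before the config is
 * regenerated and the top-level vdev is dirtied, so the tmpoffline state is
 * never pushed into the on-disk configuration and is not meant to persist
 * the way a normal offline does.  A hypothetical caller taking a device
 * down briefly might use:
 *
 *	error = vdev_offline(spa, "/dev/dsk/c1t0d0s0", B_TRUE);
 */
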
1437fa9e4066Sahrens int
1438fa9e4066Sahrens vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
1439fa9e4066Sahrens {
1440fa9e4066Sahrens 	vdev_t *vd;
1441fa9e4066Sahrens 
1442fa9e4066Sahrens 	spa_config_enter(spa, RW_WRITER);
1443fa9e4066Sahrens 
1444fa9e4066Sahrens 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1445fa9e4066Sahrens 		spa_config_exit(spa);
1446fa9e4066Sahrens 		return (ENODEV);
1447fa9e4066Sahrens 	}
1448fa9e4066Sahrens 
1449fa9e4066Sahrens 	vd->vdev_fault_mode = mode;
1450fa9e4066Sahrens 	vd->vdev_fault_mask = mask;
1451fa9e4066Sahrens 	vd->vdev_fault_arg = arg;
1452fa9e4066Sahrens 
1453fa9e4066Sahrens 	spa_config_exit(spa);
1454fa9e4066Sahrens 
1455fa9e4066Sahrens 	return (0);
1456fa9e4066Sahrens }
1457fa9e4066Sahrens 
1458fa9e4066Sahrens int
1459fa9e4066Sahrens vdev_is_dead(vdev_t *vd)
1460fa9e4066Sahrens {
1461fa9e4066Sahrens 	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
1462fa9e4066Sahrens }
1463fa9e4066Sahrens 
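/*
 * Illustrative note (not part of the original source): vdev states are
 * ordered, so "dead" here covers every state at or below CANT_OPEN
 * (including closed and offline vdevs), while DEGRADED and HEALTHY vdevs
 * remain usable for I/O.
 */
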
1464fa9e4066Sahrens int
1465fa9e4066Sahrens vdev_error_inject(vdev_t *vd, zio_t *zio)
1466fa9e4066Sahrens {
1467fa9e4066Sahrens 	int error = 0;
1468fa9e4066Sahrens 
1469fa9e4066Sahrens 	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
1470fa9e4066Sahrens 		return (0);
1471fa9e4066Sahrens 
1472fa9e4066Sahrens 	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
1473fa9e4066Sahrens 		return (0);
1474fa9e4066Sahrens 
1475fa9e4066Sahrens 	switch (vd->vdev_fault_mode) {
1476fa9e4066Sahrens 	case VDEV_FAULT_RANDOM:
1477fa9e4066Sahrens 		if (spa_get_random(vd->vdev_fault_arg) == 0)
1478fa9e4066Sahrens 			error = EIO;
1479fa9e4066Sahrens 		break;
1480fa9e4066Sahrens 
1481fa9e4066Sahrens 	case VDEV_FAULT_COUNT:
1482fa9e4066Sahrens 		if ((int64_t)--vd->vdev_fault_arg <= 0)
1483fa9e4066Sahrens 			vd->vdev_fault_mode = VDEV_FAULT_NONE;
1484fa9e4066Sahrens 		error = EIO;
1485fa9e4066Sahrens 		break;
1486fa9e4066Sahrens 	}
1487fa9e4066Sahrens 
1488fa9e4066Sahrens 	if (error != 0) {
1489fa9e4066Sahrens 		dprintf("returning %d for type %d on %s state %d offset %llx\n",
1490fa9e4066Sahrens 		    error, zio->io_type, vdev_description(vd),
1491fa9e4066Sahrens 		    vd->vdev_state, zio->io_offset);
1492fa9e4066Sahrens 	}
1493fa9e4066Sahrens 
1494fa9e4066Sahrens 	return (error);
1495fa9e4066Sahrens }
1496fa9e4066Sahrens 
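/*
 * Illustrative usage (not part of the original source): fault injection is
 * armed with vdev_error_setup() and consumed here.  For example, a
 * test-only sketch that fails the next three reads on a device:
 *
 *	(void) vdev_error_setup(spa, "/dev/dsk/c1t0d0s0",
 *	    VDEV_FAULT_COUNT, 1U << ZIO_TYPE_READ, 3);
 *
 * With VDEV_FAULT_RANDOM, arg is instead a denominator: each matching I/O
 * fails with probability 1/arg.
 */
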
1497fa9e4066Sahrens /*
1498fa9e4066Sahrens  * Get statistics for the given vdev.
1499fa9e4066Sahrens  */
1500fa9e4066Sahrens void
1501fa9e4066Sahrens vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
1502fa9e4066Sahrens {
1503fa9e4066Sahrens 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
1504fa9e4066Sahrens 	int c, t;
1505fa9e4066Sahrens 
1506fa9e4066Sahrens 	mutex_enter(&vd->vdev_stat_lock);
1507fa9e4066Sahrens 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
1508fa9e4066Sahrens 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
1509fa9e4066Sahrens 	vs->vs_state = vd->vdev_state;
15102a79c5feSlling 	vs->vs_rsize = vdev_get_rsize(vd);
1511fa9e4066Sahrens 	mutex_exit(&vd->vdev_stat_lock);
1512fa9e4066Sahrens 
1513fa9e4066Sahrens 	/*
1514fa9e4066Sahrens 	 * If we're getting stats on the root vdev, aggregate the I/O counts
1515fa9e4066Sahrens 	 * over all top-level vdevs (i.e. the direct children of the root).
1516fa9e4066Sahrens 	 */
1517fa9e4066Sahrens 	if (vd == rvd) {
1518fa9e4066Sahrens 		for (c = 0; c < rvd->vdev_children; c++) {
1519fa9e4066Sahrens 			vdev_t *cvd = rvd->vdev_child[c];
1520fa9e4066Sahrens 			vdev_stat_t *cvs = &cvd->vdev_stat;
1521fa9e4066Sahrens 
1522fa9e4066Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1523fa9e4066Sahrens 			for (t = 0; t < ZIO_TYPES; t++) {
1524fa9e4066Sahrens 				vs->vs_ops[t] += cvs->vs_ops[t];
1525fa9e4066Sahrens 				vs->vs_bytes[t] += cvs->vs_bytes[t];
1526fa9e4066Sahrens 			}
1527fa9e4066Sahrens 			vs->vs_read_errors += cvs->vs_read_errors;
1528fa9e4066Sahrens 			vs->vs_write_errors += cvs->vs_write_errors;
1529fa9e4066Sahrens 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
1530fa9e4066Sahrens 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
1531fa9e4066Sahrens 			vs->vs_scrub_errors += cvs->vs_scrub_errors;
1532fa9e4066Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1533fa9e4066Sahrens 		}
1534fa9e4066Sahrens 	}
1535fa9e4066Sahrens }
1536fa9e4066Sahrens 
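/*
 * Illustrative usage (not part of the original source): callers supply their
 * own vdev_stat_t and get a snapshot taken under vdev_stat_lock; note that
 * vs_timestamp comes back as the interval since the stats were last stamped
 * rather than as an absolute hrtime:
 *
 *	vdev_stat_t vs;
 *
 *	vdev_get_stats(vd, &vs);
 */
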
1537fa9e4066Sahrens void
1538fa9e4066Sahrens vdev_stat_update(zio_t *zio)
1539fa9e4066Sahrens {
1540fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
1541fa9e4066Sahrens 	vdev_t *pvd;
1542fa9e4066Sahrens 	uint64_t txg = zio->io_txg;
1543fa9e4066Sahrens 	vdev_stat_t *vs = &vd->vdev_stat;
1544fa9e4066Sahrens 	zio_type_t type = zio->io_type;
1545fa9e4066Sahrens 	int flags = zio->io_flags;
1546fa9e4066Sahrens 
1547fa9e4066Sahrens 	if (zio->io_error == 0) {
1548fa9e4066Sahrens 		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
1549fa9e4066Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1550fa9e4066Sahrens 			vs->vs_ops[type]++;
1551fa9e4066Sahrens 			vs->vs_bytes[type] += zio->io_size;
1552fa9e4066Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1553fa9e4066Sahrens 		}
1554fa9e4066Sahrens 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
1555fa9e4066Sahrens 		    zio->io_delegate_list == NULL) {
1556fa9e4066Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1557fa9e4066Sahrens 			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
1558fa9e4066Sahrens 				vs->vs_scrub_repaired += zio->io_size;
1559fa9e4066Sahrens 			else
1560fa9e4066Sahrens 				vs->vs_self_healed += zio->io_size;
1561fa9e4066Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1562fa9e4066Sahrens 		}
1563fa9e4066Sahrens 		return;
1564fa9e4066Sahrens 	}
1565fa9e4066Sahrens 
1566fa9e4066Sahrens 	if (flags & ZIO_FLAG_SPECULATIVE)
1567fa9e4066Sahrens 		return;
1568fa9e4066Sahrens 
1569fa9e4066Sahrens 	if (!vdev_is_dead(vd)) {
1570fa9e4066Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1571fa9e4066Sahrens 		if (type == ZIO_TYPE_READ) {
1572fa9e4066Sahrens 			if (zio->io_error == ECKSUM)
1573fa9e4066Sahrens 				vs->vs_checksum_errors++;
1574fa9e4066Sahrens 			else
1575fa9e4066Sahrens 				vs->vs_read_errors++;
1576fa9e4066Sahrens 		}
1577fa9e4066Sahrens 		if (type == ZIO_TYPE_WRITE)
1578fa9e4066Sahrens 			vs->vs_write_errors++;
1579fa9e4066Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1580fa9e4066Sahrens 	}
1581fa9e4066Sahrens 
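	/*
	 * A failed leaf write leaves a hole in the on-disk data for this
	 * txg.  Repair writes (scrub/resilver) are recorded in the scrub
	 * DTLs; ordinary writes dirty the top-level vdev and mark the txg
	 * in the DTL of this vdev and every ancestor so a later resilver
	 * knows what must be rebuilt.
	 */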
1582fa9e4066Sahrens 	if (type == ZIO_TYPE_WRITE) {
1583fa9e4066Sahrens 		if (txg == 0 || vd->vdev_children != 0)
1584fa9e4066Sahrens 			return;
1585fa9e4066Sahrens 		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1586fa9e4066Sahrens 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
1587fa9e4066Sahrens 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1588fa9e4066Sahrens 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
1589fa9e4066Sahrens 		}
1590fa9e4066Sahrens 		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
1591fa9e4066Sahrens 			vdev_t *tvd = vd->vdev_top;
1592fa9e4066Sahrens 			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
1593fa9e4066Sahrens 				return;
1594fa9e4066Sahrens 			vdev_dirty(tvd, VDD_DTL, txg);
1595fa9e4066Sahrens 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1596fa9e4066Sahrens 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1597fa9e4066Sahrens 				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
1598fa9e4066Sahrens 		}
1599fa9e4066Sahrens 	}
1600fa9e4066Sahrens }
1601fa9e4066Sahrens 
1602fa9e4066Sahrens void
1603fa9e4066Sahrens vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
1604fa9e4066Sahrens {
1605fa9e4066Sahrens 	int c;
1606fa9e4066Sahrens 	vdev_stat_t *vs = &vd->vdev_stat;
1607fa9e4066Sahrens 
1608fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
1609fa9e4066Sahrens 		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
1610fa9e4066Sahrens 
1611fa9e4066Sahrens 	mutex_enter(&vd->vdev_stat_lock);
1612fa9e4066Sahrens 
1613fa9e4066Sahrens 	if (type == POOL_SCRUB_NONE) {
1614fa9e4066Sahrens 		/*
1615fa9e4066Sahrens 		 * Update completion and end time.  Leave everything else alone
1616fa9e4066Sahrens 		 * so we can report what happened during the previous scrub.
1617fa9e4066Sahrens 		 */
1618fa9e4066Sahrens 		vs->vs_scrub_complete = complete;
1619fa9e4066Sahrens 		vs->vs_scrub_end = gethrestime_sec();
1620fa9e4066Sahrens 	} else {
1621fa9e4066Sahrens 		vs->vs_scrub_type = type;
1622fa9e4066Sahrens 		vs->vs_scrub_complete = 0;
1623fa9e4066Sahrens 		vs->vs_scrub_examined = 0;
1624fa9e4066Sahrens 		vs->vs_scrub_repaired = 0;
1625fa9e4066Sahrens 		vs->vs_scrub_errors = 0;
1626fa9e4066Sahrens 		vs->vs_scrub_start = gethrestime_sec();
1627fa9e4066Sahrens 		vs->vs_scrub_end = 0;
1628fa9e4066Sahrens 	}
1629fa9e4066Sahrens 
1630fa9e4066Sahrens 	mutex_exit(&vd->vdev_stat_lock);
1631fa9e4066Sahrens }
1632fa9e4066Sahrens 
1633fa9e4066Sahrens /*
1634fa9e4066Sahrens  * Report checksum errors that a vdev didn't realize it made.
1635fa9e4066Sahrens  * This can happen, for example, when RAID-Z combinatorial reconstruction
1636fa9e4066Sahrens  * infers that one of its components returned bad data.
1637fa9e4066Sahrens  */
1638fa9e4066Sahrens void
1639fa9e4066Sahrens vdev_checksum_error(zio_t *zio, vdev_t *vd)
1640fa9e4066Sahrens {
1641fa9e4066Sahrens 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
1642fa9e4066Sahrens 	    vdev_description(vd));
1643fa9e4066Sahrens 
1644fa9e4066Sahrens 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1645fa9e4066Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1646fa9e4066Sahrens 		vd->vdev_stat.vs_checksum_errors++;
1647fa9e4066Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1648fa9e4066Sahrens 	}
1649fa9e4066Sahrens }
1650fa9e4066Sahrens 
1651fa9e4066Sahrens /*
1652fa9e4066Sahrens  * Update the in-core space usage stats for this vdev and the root vdev.
1653fa9e4066Sahrens  */
1654fa9e4066Sahrens void
1655fa9e4066Sahrens vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
1656fa9e4066Sahrens {
1657fa9e4066Sahrens 	ASSERT(vd == vd->vdev_top);
1658fa9e4066Sahrens 
1659fa9e4066Sahrens 	do {
1660fa9e4066Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1661fa9e4066Sahrens 		vd->vdev_stat.vs_space += space_delta;
1662fa9e4066Sahrens 		vd->vdev_stat.vs_alloc += alloc_delta;
1663fa9e4066Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1664fa9e4066Sahrens 	} while ((vd = vd->vdev_parent) != NULL);
1665fa9e4066Sahrens }
1666fa9e4066Sahrens 
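/*
 * Illustrative note (not part of the original source): the deltas are applied
 * to this top-level vdev and then walked up to the root, so vs_space and
 * vs_alloc on the root vdev always reflect the sum over its children.  A
 * hypothetical allocation of asize bytes on top-level vdev tvd would be
 * reported as
 *
 *	vdev_space_update(tvd, 0, asize);
 *
 * with the matching free reversing the adjustment; the real call sites live
 * in the metaslab/SPA code, not here.
 */
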
1667fa9e4066Sahrens /*
1668fa9e4066Sahrens  * Various knobs to tune a vdev.
1669fa9e4066Sahrens  */
1670fa9e4066Sahrens static vdev_knob_t vdev_knob[] = {
1671fa9e4066Sahrens 	{
1672fa9e4066Sahrens 		"cache_size",
1673fa9e4066Sahrens 		"size of the read-ahead cache",
1674fa9e4066Sahrens 		0,
1675fa9e4066Sahrens 		1ULL << 30,
1676fa9e4066Sahrens 		10ULL << 20,
1677fa9e4066Sahrens 		offsetof(struct vdev, vdev_cache.vc_size)
1678fa9e4066Sahrens 	},
1679fa9e4066Sahrens 	{
1680fa9e4066Sahrens 		"cache_bshift",
1681fa9e4066Sahrens 		"log2 of cache blocksize",
1682fa9e4066Sahrens 		SPA_MINBLOCKSHIFT,
1683fa9e4066Sahrens 		SPA_MAXBLOCKSHIFT,
1684fa9e4066Sahrens 		16,
1685fa9e4066Sahrens 		offsetof(struct vdev, vdev_cache.vc_bshift)
1686fa9e4066Sahrens 	},
1687fa9e4066Sahrens 	{
1688fa9e4066Sahrens 		"cache_max",
1689fa9e4066Sahrens 		"largest block size to cache",
1690fa9e4066Sahrens 		0,
1691fa9e4066Sahrens 		SPA_MAXBLOCKSIZE,
1692fa9e4066Sahrens 		1ULL << 14,
1693fa9e4066Sahrens 		offsetof(struct vdev, vdev_cache.vc_max)
1694fa9e4066Sahrens 	},
1695fa9e4066Sahrens 	{
1696fa9e4066Sahrens 		"min_pending",
1697fa9e4066Sahrens 		"minimum pending I/Os to the disk",
1698fa9e4066Sahrens 		1,
1699fa9e4066Sahrens 		10000,
1700fa9e4066Sahrens 		2,
1701fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_min_pending)
1702fa9e4066Sahrens 	},
1703fa9e4066Sahrens 	{
1704fa9e4066Sahrens 		"max_pending",
1705fa9e4066Sahrens 		"maximum pending I/Os to the disk",
1706fa9e4066Sahrens 		1,
1707fa9e4066Sahrens 		10000,
1708fa9e4066Sahrens 		35,
1709fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_max_pending)
1710fa9e4066Sahrens 	},
1711fa9e4066Sahrens 	{
1712fa9e4066Sahrens 		"agg_limit",
1713fa9e4066Sahrens 		"maximum size of aggregated I/Os",
1714fa9e4066Sahrens 		0,
1715fa9e4066Sahrens 		SPA_MAXBLOCKSIZE,
1716fa9e4066Sahrens 		SPA_MAXBLOCKSIZE,
1717fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_agg_limit)
1718fa9e4066Sahrens 	},
1719fa9e4066Sahrens 	{
1720fa9e4066Sahrens 		"time_shift",
1721fa9e4066Sahrens 		"deadline = pri + (lbolt >> time_shift)",
1722fa9e4066Sahrens 		0,
1723fa9e4066Sahrens 		63,
1724fa9e4066Sahrens 		4,
1725fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_time_shift)
1726fa9e4066Sahrens 	},
1727fa9e4066Sahrens 	{
1728fa9e4066Sahrens 		"ramp_rate",
1729fa9e4066Sahrens 		"exponential I/O issue ramp-up rate",
1730fa9e4066Sahrens 		1,
1731fa9e4066Sahrens 		10000,
1732fa9e4066Sahrens 		2,
1733fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
1734fa9e4066Sahrens 	},
1735fa9e4066Sahrens };
1736fa9e4066Sahrens 
1737fa9e4066Sahrens vdev_knob_t *
1738fa9e4066Sahrens vdev_knob_next(vdev_knob_t *vk)
1739fa9e4066Sahrens {
1740fa9e4066Sahrens 	if (vk == NULL)
1741fa9e4066Sahrens 		return (vdev_knob);
1742fa9e4066Sahrens 
1743fa9e4066Sahrens 	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
1744fa9e4066Sahrens 		return (NULL);
1745fa9e4066Sahrens 
1746fa9e4066Sahrens 	return (vk);
1747fa9e4066Sahrens }
1748fa9e4066Sahrens 
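/*
 * Illustrative usage (not part of the original source): the knob table is
 * walked with vdev_knob_next(), starting from NULL, so adding a knob only
 * requires a new table entry above:
 *
 *	vdev_knob_t *vk;
 *
 *	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk))
 *		... report the entry's name, description, min/max/default,
 *		    or adjust the value stored at its offset in struct vdev ...
 */
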
1749fa9e4066Sahrens /*
1750fa9e4066Sahrens  * Mark a top-level vdev's config as dirty, placing it on the dirty list
1751fa9e4066Sahrens  * so that it will be written out next time the vdev configuration is synced.
1752fa9e4066Sahrens  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
1753fa9e4066Sahrens  */
1754fa9e4066Sahrens void
1755fa9e4066Sahrens vdev_config_dirty(vdev_t *vd)
1756fa9e4066Sahrens {
1757fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1758fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
1759fa9e4066Sahrens 	int c;
1760fa9e4066Sahrens 
1761fa9e4066Sahrens 	if (vd == rvd) {
1762fa9e4066Sahrens 		for (c = 0; c < rvd->vdev_children; c++)
1763fa9e4066Sahrens 			vdev_config_dirty(rvd->vdev_child[c]);
1764fa9e4066Sahrens 	} else {
1765fa9e4066Sahrens 		ASSERT(vd == vd->vdev_top);
1766fa9e4066Sahrens 
1767fa9e4066Sahrens 		if (!vd->vdev_is_dirty) {
1768fa9e4066Sahrens 			list_insert_head(&spa->spa_dirty_list, vd);
1769fa9e4066Sahrens 			vd->vdev_is_dirty = B_TRUE;
1770fa9e4066Sahrens 		}
1771fa9e4066Sahrens 	}
1772fa9e4066Sahrens }
1773fa9e4066Sahrens 
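/*
 * Illustrative note (not part of the original source): spa_dirty_list is the
 * work list for config syncing; a top-level vdev stays on it until the
 * updated vdev configuration has been written out, at which point
 * vdev_config_clean() below takes it back off.
 */
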
1774fa9e4066Sahrens void
1775fa9e4066Sahrens vdev_config_clean(vdev_t *vd)
1776fa9e4066Sahrens {
1777fa9e4066Sahrens 	ASSERT(vd->vdev_is_dirty);
1778fa9e4066Sahrens 
1779fa9e4066Sahrens 	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
1780fa9e4066Sahrens 	vd->vdev_is_dirty = B_FALSE;
1781fa9e4066Sahrens }
1782fa9e4066Sahrens 
1783fa9e4066Sahrens /*
1784fa9e4066Sahrens  * Set a vdev's state, updating any parent's state as well.
1785fa9e4066Sahrens  */
1786fa9e4066Sahrens void
1787fa9e4066Sahrens vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
1788fa9e4066Sahrens {
1789fa9e4066Sahrens 	if (state == vd->vdev_state)
1790fa9e4066Sahrens 		return;
1791fa9e4066Sahrens 
1792fa9e4066Sahrens 	vd->vdev_state = state;
1793fa9e4066Sahrens 	vd->vdev_stat.vs_aux = aux;
1794fa9e4066Sahrens 
1795fa9e4066Sahrens 	if (vd->vdev_parent != NULL) {
1796fa9e4066Sahrens 		int c;
1797fa9e4066Sahrens 		int degraded = 0, faulted = 0;
1798fa9e4066Sahrens 		vdev_t *parent, *child;
1799fa9e4066Sahrens 
1800fa9e4066Sahrens 		parent = vd->vdev_parent;
1801fa9e4066Sahrens 		for (c = 0; c < parent->vdev_children; c++) {
1802fa9e4066Sahrens 			child = parent->vdev_child[c];
1803fa9e4066Sahrens 			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
1804fa9e4066Sahrens 				faulted++;
1805fa9e4066Sahrens 			else if (child->vdev_state == VDEV_STATE_DEGRADED)
1806fa9e4066Sahrens 				degraded++;
1807fa9e4066Sahrens 		}
1808fa9e4066Sahrens 
1809fa9e4066Sahrens 		vd->vdev_parent->vdev_ops->vdev_op_state_change(
1810fa9e4066Sahrens 		    vd->vdev_parent, faulted, degraded);
1811fa9e4066Sahrens 	}
1812fa9e4066Sahrens }
1813