xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev.c (revision 2a79c5fee1dab68e30266ba4356cf60b871aabcf)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5fa9e4066Sahrens  * Common Development and Distribution License, Version 1.0 only
6fa9e4066Sahrens  * (the "License").  You may not use this file except in compliance
7fa9e4066Sahrens  * with the License.
8fa9e4066Sahrens  *
9fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
11fa9e4066Sahrens  * See the License for the specific language governing permissions
12fa9e4066Sahrens  * and limitations under the License.
13fa9e4066Sahrens  *
14fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19fa9e4066Sahrens  *
20fa9e4066Sahrens  * CDDL HEADER END
21fa9e4066Sahrens  */
22fa9e4066Sahrens /*
23fa9e4066Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24fa9e4066Sahrens  * Use is subject to license terms.
25fa9e4066Sahrens  */
26fa9e4066Sahrens 
27fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28fa9e4066Sahrens 
29fa9e4066Sahrens #include <sys/zfs_context.h>
30fa9e4066Sahrens #include <sys/spa.h>
31fa9e4066Sahrens #include <sys/spa_impl.h>
32fa9e4066Sahrens #include <sys/dmu.h>
33fa9e4066Sahrens #include <sys/dmu_tx.h>
34fa9e4066Sahrens #include <sys/vdev_impl.h>
35fa9e4066Sahrens #include <sys/uberblock_impl.h>
36fa9e4066Sahrens #include <sys/metaslab.h>
37fa9e4066Sahrens #include <sys/metaslab_impl.h>
38fa9e4066Sahrens #include <sys/space_map.h>
39fa9e4066Sahrens #include <sys/zio.h>
40fa9e4066Sahrens #include <sys/zap.h>
41fa9e4066Sahrens #include <sys/fs/zfs.h>
42fa9e4066Sahrens 
43fa9e4066Sahrens /*
44fa9e4066Sahrens  * Virtual device management.
45fa9e4066Sahrens  */
46fa9e4066Sahrens 
47fa9e4066Sahrens static vdev_ops_t *vdev_ops_table[] = {
48fa9e4066Sahrens 	&vdev_root_ops,
49fa9e4066Sahrens 	&vdev_raidz_ops,
50fa9e4066Sahrens 	&vdev_mirror_ops,
51fa9e4066Sahrens 	&vdev_replacing_ops,
52fa9e4066Sahrens 	&vdev_disk_ops,
53fa9e4066Sahrens 	&vdev_file_ops,
54fa9e4066Sahrens 	&vdev_missing_ops,
55fa9e4066Sahrens 	NULL
56fa9e4066Sahrens };
57fa9e4066Sahrens 
58fa9e4066Sahrens /*
59fa9e4066Sahrens  * Given a vdev type, return the appropriate ops vector.
60fa9e4066Sahrens  */
61fa9e4066Sahrens static vdev_ops_t *
62fa9e4066Sahrens vdev_getops(const char *type)
63fa9e4066Sahrens {
64fa9e4066Sahrens 	vdev_ops_t *ops, **opspp;
65fa9e4066Sahrens 
66fa9e4066Sahrens 	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
67fa9e4066Sahrens 		if (strcmp(ops->vdev_op_type, type) == 0)
68fa9e4066Sahrens 			break;
69fa9e4066Sahrens 
70fa9e4066Sahrens 	return (ops);
71fa9e4066Sahrens }
72fa9e4066Sahrens 
73fa9e4066Sahrens /*
74fa9e4066Sahrens  * Default asize function: return the MAX of psize with the asize of
75fa9e4066Sahrens  * all children.  This is what's used by anything other than RAID-Z.
76fa9e4066Sahrens  */
77fa9e4066Sahrens uint64_t
78fa9e4066Sahrens vdev_default_asize(vdev_t *vd, uint64_t psize)
79fa9e4066Sahrens {
80fa9e4066Sahrens 	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
81fa9e4066Sahrens 	uint64_t csize;
82fa9e4066Sahrens 	uint64_t c;
83fa9e4066Sahrens 
84fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++) {
85fa9e4066Sahrens 		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
86fa9e4066Sahrens 		asize = MAX(asize, csize);
87fa9e4066Sahrens 	}
88fa9e4066Sahrens 
89fa9e4066Sahrens 	return (asize);
90fa9e4066Sahrens }
91fa9e4066Sahrens 
92*2a79c5feSlling /*
93*2a79c5feSlling  * Get the replaceable or attachable device size.
94*2a79c5feSlling  * If the parent is a mirror or raidz, the replaceable size is the minimum
95*2a79c5feSlling  * psize of all its children.  Otherwise, just return our own psize.
96*2a79c5feSlling  *
97*2a79c5feSlling  * e.g.
98*2a79c5feSlling  *			psize	rsize
99*2a79c5feSlling  * root			-	-
100*2a79c5feSlling  *	mirror/raidz	-	-
101*2a79c5feSlling  *	    disk1	20g	20g
102*2a79c5feSlling  *	    disk2 	40g	20g
103*2a79c5feSlling  *	disk3 		80g	80g
104*2a79c5feSlling  */
105*2a79c5feSlling uint64_t
106*2a79c5feSlling vdev_get_rsize(vdev_t *vd)
107*2a79c5feSlling {
108*2a79c5feSlling 	vdev_t *pvd, *cvd;
109*2a79c5feSlling 	uint64_t c, rsize;
110*2a79c5feSlling 
111*2a79c5feSlling 	pvd = vd->vdev_parent;
112*2a79c5feSlling 
113*2a79c5feSlling 	/*
114*2a79c5feSlling 	 * If our parent is NULL or the root, just return our own psize.
115*2a79c5feSlling 	 */
116*2a79c5feSlling 	if (pvd == NULL || pvd->vdev_parent == NULL)
117*2a79c5feSlling 		return (vd->vdev_psize);
118*2a79c5feSlling 
119*2a79c5feSlling 	rsize = 0;
120*2a79c5feSlling 
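	/*
	 * Note that rsize starts at 0, so on the first pass the unsigned
	 * "rsize - 1" wraps around and the child's psize is always taken;
	 * after that this tracks the minimum child psize.  A child psize
	 * of 0 wraps the same way and leaves rsize unchanged.
	 */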
121*2a79c5feSlling 	for (c = 0; c < pvd->vdev_children; c++) {
122*2a79c5feSlling 		cvd = pvd->vdev_child[c];
123*2a79c5feSlling 		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
124*2a79c5feSlling 	}
125*2a79c5feSlling 
126*2a79c5feSlling 	return (rsize);
127*2a79c5feSlling }
128*2a79c5feSlling 
129fa9e4066Sahrens vdev_t *
130fa9e4066Sahrens vdev_lookup_top(spa_t *spa, uint64_t vdev)
131fa9e4066Sahrens {
132fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
133fa9e4066Sahrens 
134fa9e4066Sahrens 	if (vdev < rvd->vdev_children)
135fa9e4066Sahrens 		return (rvd->vdev_child[vdev]);
136fa9e4066Sahrens 
137fa9e4066Sahrens 	return (NULL);
138fa9e4066Sahrens }
139fa9e4066Sahrens 
140fa9e4066Sahrens vdev_t *
141fa9e4066Sahrens vdev_lookup_by_path(vdev_t *vd, const char *path)
142fa9e4066Sahrens {
143fa9e4066Sahrens 	int c;
144fa9e4066Sahrens 	vdev_t *mvd;
145fa9e4066Sahrens 
146fa9e4066Sahrens 	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
147fa9e4066Sahrens 		return (vd);
148fa9e4066Sahrens 
149fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
150fa9e4066Sahrens 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
151fa9e4066Sahrens 		    NULL)
152fa9e4066Sahrens 			return (mvd);
153fa9e4066Sahrens 
154fa9e4066Sahrens 	return (NULL);
155fa9e4066Sahrens }
156fa9e4066Sahrens 
157fa9e4066Sahrens vdev_t *
158fa9e4066Sahrens vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
159fa9e4066Sahrens {
160fa9e4066Sahrens 	int c;
161fa9e4066Sahrens 	vdev_t *mvd;
162fa9e4066Sahrens 
163fa9e4066Sahrens 	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
164fa9e4066Sahrens 		return (vd);
165fa9e4066Sahrens 
166fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
167fa9e4066Sahrens 		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
168fa9e4066Sahrens 		    NULL)
169fa9e4066Sahrens 			return (mvd);
170fa9e4066Sahrens 
171fa9e4066Sahrens 	return (NULL);
172fa9e4066Sahrens }
173fa9e4066Sahrens 
174fa9e4066Sahrens void
175fa9e4066Sahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd)
176fa9e4066Sahrens {
177fa9e4066Sahrens 	size_t oldsize, newsize;
178fa9e4066Sahrens 	uint64_t id = cvd->vdev_id;
179fa9e4066Sahrens 	vdev_t **newchild;
180fa9e4066Sahrens 
181fa9e4066Sahrens 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
182fa9e4066Sahrens 	ASSERT(cvd->vdev_parent == NULL);
183fa9e4066Sahrens 
184fa9e4066Sahrens 	cvd->vdev_parent = pvd;
185fa9e4066Sahrens 
186fa9e4066Sahrens 	if (pvd == NULL)
187fa9e4066Sahrens 		return;
188fa9e4066Sahrens 
189fa9e4066Sahrens 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
190fa9e4066Sahrens 
191fa9e4066Sahrens 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
192fa9e4066Sahrens 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
193fa9e4066Sahrens 	newsize = pvd->vdev_children * sizeof (vdev_t *);
194fa9e4066Sahrens 
195fa9e4066Sahrens 	newchild = kmem_zalloc(newsize, KM_SLEEP);
196fa9e4066Sahrens 	if (pvd->vdev_child != NULL) {
197fa9e4066Sahrens 		bcopy(pvd->vdev_child, newchild, oldsize);
198fa9e4066Sahrens 		kmem_free(pvd->vdev_child, oldsize);
199fa9e4066Sahrens 	}
200fa9e4066Sahrens 
201fa9e4066Sahrens 	pvd->vdev_child = newchild;
202fa9e4066Sahrens 	pvd->vdev_child[id] = cvd;
203fa9e4066Sahrens 
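	/*
	 * If the parent already has a top-level vdev, inherit it;
	 * otherwise the parent is the root vdev and the child itself
	 * becomes the top-level vdev.
	 */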
204fa9e4066Sahrens 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
205fa9e4066Sahrens 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
206fa9e4066Sahrens 
207fa9e4066Sahrens 	/*
208fa9e4066Sahrens 	 * Walk up all ancestors to update guid sum.
209fa9e4066Sahrens 	 */
210fa9e4066Sahrens 	for (; pvd != NULL; pvd = pvd->vdev_parent)
211fa9e4066Sahrens 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
212fa9e4066Sahrens }
213fa9e4066Sahrens 
214fa9e4066Sahrens void
215fa9e4066Sahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
216fa9e4066Sahrens {
217fa9e4066Sahrens 	int c;
218fa9e4066Sahrens 	uint_t id = cvd->vdev_id;
219fa9e4066Sahrens 
220fa9e4066Sahrens 	ASSERT(cvd->vdev_parent == pvd);
221fa9e4066Sahrens 
222fa9e4066Sahrens 	if (pvd == NULL)
223fa9e4066Sahrens 		return;
224fa9e4066Sahrens 
225fa9e4066Sahrens 	ASSERT(id < pvd->vdev_children);
226fa9e4066Sahrens 	ASSERT(pvd->vdev_child[id] == cvd);
227fa9e4066Sahrens 
228fa9e4066Sahrens 	pvd->vdev_child[id] = NULL;
229fa9e4066Sahrens 	cvd->vdev_parent = NULL;
230fa9e4066Sahrens 
231fa9e4066Sahrens 	for (c = 0; c < pvd->vdev_children; c++)
232fa9e4066Sahrens 		if (pvd->vdev_child[c])
233fa9e4066Sahrens 			break;
234fa9e4066Sahrens 
235fa9e4066Sahrens 	if (c == pvd->vdev_children) {
236fa9e4066Sahrens 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
237fa9e4066Sahrens 		pvd->vdev_child = NULL;
238fa9e4066Sahrens 		pvd->vdev_children = 0;
239fa9e4066Sahrens 	}
240fa9e4066Sahrens 
241fa9e4066Sahrens 	/*
242fa9e4066Sahrens 	 * Walk up all ancestors to update guid sum.
243fa9e4066Sahrens 	 */
244fa9e4066Sahrens 	for (; pvd != NULL; pvd = pvd->vdev_parent)
245fa9e4066Sahrens 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
246fa9e4066Sahrens }
247fa9e4066Sahrens 
248fa9e4066Sahrens /*
249fa9e4066Sahrens  * Remove any holes in the child array.
250fa9e4066Sahrens  */
251fa9e4066Sahrens void
252fa9e4066Sahrens vdev_compact_children(vdev_t *pvd)
253fa9e4066Sahrens {
254fa9e4066Sahrens 	vdev_t **newchild, *cvd;
255fa9e4066Sahrens 	int oldc = pvd->vdev_children;
256fa9e4066Sahrens 	int newc, c;
257fa9e4066Sahrens 
258fa9e4066Sahrens 	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
259fa9e4066Sahrens 
260fa9e4066Sahrens 	for (c = newc = 0; c < oldc; c++)
261fa9e4066Sahrens 		if (pvd->vdev_child[c])
262fa9e4066Sahrens 			newc++;
263fa9e4066Sahrens 
264fa9e4066Sahrens 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
265fa9e4066Sahrens 
266fa9e4066Sahrens 	for (c = newc = 0; c < oldc; c++) {
267fa9e4066Sahrens 		if ((cvd = pvd->vdev_child[c]) != NULL) {
268fa9e4066Sahrens 			newchild[newc] = cvd;
269fa9e4066Sahrens 			cvd->vdev_id = newc++;
270fa9e4066Sahrens 		}
271fa9e4066Sahrens 	}
272fa9e4066Sahrens 
273fa9e4066Sahrens 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
274fa9e4066Sahrens 	pvd->vdev_child = newchild;
275fa9e4066Sahrens 	pvd->vdev_children = newc;
276fa9e4066Sahrens }
277fa9e4066Sahrens 
278fa9e4066Sahrens /*
279fa9e4066Sahrens  * Allocate and minimally initialize a vdev_t.
280fa9e4066Sahrens  */
281fa9e4066Sahrens static vdev_t *
282fa9e4066Sahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
283fa9e4066Sahrens {
284fa9e4066Sahrens 	vdev_t *vd;
285fa9e4066Sahrens 
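	/*
	 * A guid of 0 means the caller wants a freshly generated guid;
	 * loop in case the random source itself returns 0.
	 */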
286fa9e4066Sahrens 	while (guid == 0)
287fa9e4066Sahrens 		guid = spa_get_random(-1ULL);
288fa9e4066Sahrens 
289fa9e4066Sahrens 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
290fa9e4066Sahrens 
291fa9e4066Sahrens 	vd->vdev_spa = spa;
292fa9e4066Sahrens 	vd->vdev_id = id;
293fa9e4066Sahrens 	vd->vdev_guid = guid;
294fa9e4066Sahrens 	vd->vdev_guid_sum = guid;
295fa9e4066Sahrens 	vd->vdev_ops = ops;
296fa9e4066Sahrens 	vd->vdev_state = VDEV_STATE_CLOSED;
297fa9e4066Sahrens 
298fa9e4066Sahrens 	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
299fa9e4066Sahrens 	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
300fa9e4066Sahrens 	list_create(&vd->vdev_io_pending, sizeof (zio_t),
301fa9e4066Sahrens 	    offsetof(zio_t, io_pending));
302fa9e4066Sahrens 	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
303fa9e4066Sahrens 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
304fa9e4066Sahrens 	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
305fa9e4066Sahrens 	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
306fa9e4066Sahrens 	txg_list_create(&vd->vdev_ms_list,
307fa9e4066Sahrens 	    offsetof(struct metaslab, ms_txg_node));
308fa9e4066Sahrens 	txg_list_create(&vd->vdev_dtl_list,
309fa9e4066Sahrens 	    offsetof(struct vdev, vdev_dtl_node));
310fa9e4066Sahrens 	vd->vdev_stat.vs_timestamp = gethrtime();
311fa9e4066Sahrens 
312fa9e4066Sahrens 	return (vd);
313fa9e4066Sahrens }
314fa9e4066Sahrens 
315fa9e4066Sahrens /*
316fa9e4066Sahrens  * Free a vdev_t that has been removed from service.
317fa9e4066Sahrens  */
318fa9e4066Sahrens static void
319fa9e4066Sahrens vdev_free_common(vdev_t *vd)
320fa9e4066Sahrens {
321fa9e4066Sahrens 	if (vd->vdev_path)
322fa9e4066Sahrens 		spa_strfree(vd->vdev_path);
323fa9e4066Sahrens 	if (vd->vdev_devid)
324fa9e4066Sahrens 		spa_strfree(vd->vdev_devid);
325fa9e4066Sahrens 
326fa9e4066Sahrens 	txg_list_destroy(&vd->vdev_ms_list);
327fa9e4066Sahrens 	txg_list_destroy(&vd->vdev_dtl_list);
328fa9e4066Sahrens 	mutex_enter(&vd->vdev_dtl_lock);
329fa9e4066Sahrens 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
330fa9e4066Sahrens 	space_map_destroy(&vd->vdev_dtl_map);
331fa9e4066Sahrens 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
332fa9e4066Sahrens 	space_map_destroy(&vd->vdev_dtl_scrub);
333fa9e4066Sahrens 	mutex_exit(&vd->vdev_dtl_lock);
334fa9e4066Sahrens 	mutex_destroy(&vd->vdev_dtl_lock);
335fa9e4066Sahrens 	mutex_destroy(&vd->vdev_dirty_lock);
336fa9e4066Sahrens 	list_destroy(&vd->vdev_io_pending);
337fa9e4066Sahrens 	mutex_destroy(&vd->vdev_io_lock);
338fa9e4066Sahrens 	cv_destroy(&vd->vdev_io_cv);
339fa9e4066Sahrens 
340fa9e4066Sahrens 	kmem_free(vd, sizeof (vdev_t));
341fa9e4066Sahrens }
342fa9e4066Sahrens 
343fa9e4066Sahrens /*
344fa9e4066Sahrens  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
345fa9e4066Sahrens  * creating a new vdev or loading an existing one - the behavior is slightly
346fa9e4066Sahrens  * different for each case.
347fa9e4066Sahrens  */
348fa9e4066Sahrens vdev_t *
349fa9e4066Sahrens vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
350fa9e4066Sahrens {
351fa9e4066Sahrens 	vdev_ops_t *ops;
352fa9e4066Sahrens 	char *type;
353fa9e4066Sahrens 	uint64_t guid = 0;
354fa9e4066Sahrens 	vdev_t *vd;
355fa9e4066Sahrens 
356fa9e4066Sahrens 	ASSERT(spa_config_held(spa, RW_WRITER));
357fa9e4066Sahrens 
358fa9e4066Sahrens 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
359fa9e4066Sahrens 		return (NULL);
360fa9e4066Sahrens 
361fa9e4066Sahrens 	if ((ops = vdev_getops(type)) == NULL)
362fa9e4066Sahrens 		return (NULL);
363fa9e4066Sahrens 
364fa9e4066Sahrens 	/*
365fa9e4066Sahrens 	 * If this is a load, get the vdev guid from the nvlist.
366fa9e4066Sahrens 	 * Otherwise, vdev_alloc_common() will generate one for us.
367fa9e4066Sahrens 	 */
368fa9e4066Sahrens 	if (alloctype == VDEV_ALLOC_LOAD) {
369fa9e4066Sahrens 		uint64_t label_id;
370fa9e4066Sahrens 
371fa9e4066Sahrens 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
372fa9e4066Sahrens 		    label_id != id)
373fa9e4066Sahrens 			return (NULL);
374fa9e4066Sahrens 
375fa9e4066Sahrens 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
376fa9e4066Sahrens 			return (NULL);
377fa9e4066Sahrens 	}
378fa9e4066Sahrens 
379fa9e4066Sahrens 	vd = vdev_alloc_common(spa, id, guid, ops);
380fa9e4066Sahrens 
381fa9e4066Sahrens 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
382fa9e4066Sahrens 		vd->vdev_path = spa_strdup(vd->vdev_path);
383fa9e4066Sahrens 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
384fa9e4066Sahrens 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
385fa9e4066Sahrens 
386afefbcddSeschrock 	/*
387afefbcddSeschrock 	 * Set the whole_disk property.  If it's not specified, leave the value
388afefbcddSeschrock 	 * as -1.
389afefbcddSeschrock 	 */
390afefbcddSeschrock 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
391afefbcddSeschrock 	    &vd->vdev_wholedisk) != 0)
392afefbcddSeschrock 		vd->vdev_wholedisk = -1ULL;
393afefbcddSeschrock 
394fa9e4066Sahrens 	/*
395fa9e4066Sahrens 	 * If we're a top-level vdev, try to load the allocation parameters.
396fa9e4066Sahrens 	 */
397fa9e4066Sahrens 	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
398fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
399fa9e4066Sahrens 		    &vd->vdev_ms_array);
400fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
401fa9e4066Sahrens 		    &vd->vdev_ms_shift);
402fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
403fa9e4066Sahrens 		    &vd->vdev_ashift);
404fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
405fa9e4066Sahrens 		    &vd->vdev_asize);
406fa9e4066Sahrens 	}
407fa9e4066Sahrens 
408fa9e4066Sahrens 	/*
409fa9e4066Sahrens 	 * If we're a leaf vdev, try to load the DTL object.
410fa9e4066Sahrens 	 */
411fa9e4066Sahrens 	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
412fa9e4066Sahrens 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
413fa9e4066Sahrens 		    &vd->vdev_dtl.smo_object);
414fa9e4066Sahrens 	}
415fa9e4066Sahrens 
416fa9e4066Sahrens 	/*
417fa9e4066Sahrens 	 * Add ourselves to the parent's list of children.
418fa9e4066Sahrens 	 */
419fa9e4066Sahrens 	vdev_add_child(parent, vd);
420fa9e4066Sahrens 
421fa9e4066Sahrens 	return (vd);
422fa9e4066Sahrens }
423fa9e4066Sahrens 
424fa9e4066Sahrens void
425fa9e4066Sahrens vdev_free(vdev_t *vd)
426fa9e4066Sahrens {
427fa9e4066Sahrens 	int c;
428fa9e4066Sahrens 
429fa9e4066Sahrens 	/*
430fa9e4066Sahrens 	 * vdev_free() implies closing the vdev first.  This is simpler than
431fa9e4066Sahrens 	 * trying to ensure complicated semantics for all callers.
432fa9e4066Sahrens 	 */
433fa9e4066Sahrens 	vdev_close(vd);
434fa9e4066Sahrens 
435fa9e4066Sahrens 	/*
436fa9e4066Sahrens 	 * It's possible to free a vdev that's been added to the dirty
437fa9e4066Sahrens 	 * list while in the middle of spa_vdev_add().  Handle that case
438fa9e4066Sahrens 	 * correctly here.
439fa9e4066Sahrens 	 */
440fa9e4066Sahrens 	if (vd->vdev_is_dirty)
441fa9e4066Sahrens 		vdev_config_clean(vd);
442fa9e4066Sahrens 
443fa9e4066Sahrens 	/*
444fa9e4066Sahrens 	 * Free all children.
445fa9e4066Sahrens 	 */
446fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
447fa9e4066Sahrens 		vdev_free(vd->vdev_child[c]);
448fa9e4066Sahrens 
449fa9e4066Sahrens 	ASSERT(vd->vdev_child == NULL);
450fa9e4066Sahrens 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
451fa9e4066Sahrens 
452fa9e4066Sahrens 	/*
453fa9e4066Sahrens 	 * Discard allocation state.
454fa9e4066Sahrens 	 */
455fa9e4066Sahrens 	if (vd == vd->vdev_top)
456fa9e4066Sahrens 		vdev_metaslab_fini(vd);
457fa9e4066Sahrens 
458fa9e4066Sahrens 	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
459fa9e4066Sahrens 	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
460fa9e4066Sahrens 
461fa9e4066Sahrens 	/*
462fa9e4066Sahrens 	 * Remove this vdev from its parent's child list.
463fa9e4066Sahrens 	 */
464fa9e4066Sahrens 	vdev_remove_child(vd->vdev_parent, vd);
465fa9e4066Sahrens 
466fa9e4066Sahrens 	ASSERT(vd->vdev_parent == NULL);
467fa9e4066Sahrens 
468fa9e4066Sahrens 	vdev_free_common(vd);
469fa9e4066Sahrens }
470fa9e4066Sahrens 
471fa9e4066Sahrens /*
472fa9e4066Sahrens  * Transfer top-level vdev state from svd to tvd.
473fa9e4066Sahrens  */
474fa9e4066Sahrens static void
475fa9e4066Sahrens vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
476fa9e4066Sahrens {
477fa9e4066Sahrens 	spa_t *spa = svd->vdev_spa;
478fa9e4066Sahrens 	metaslab_t *msp;
479fa9e4066Sahrens 	vdev_t *vd;
480fa9e4066Sahrens 	int t;
481fa9e4066Sahrens 
482fa9e4066Sahrens 	ASSERT(tvd == tvd->vdev_top);
483fa9e4066Sahrens 
484fa9e4066Sahrens 	tvd->vdev_ms_array = svd->vdev_ms_array;
485fa9e4066Sahrens 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
486fa9e4066Sahrens 	tvd->vdev_ms_count = svd->vdev_ms_count;
487fa9e4066Sahrens 
488fa9e4066Sahrens 	svd->vdev_ms_array = 0;
489fa9e4066Sahrens 	svd->vdev_ms_shift = 0;
490fa9e4066Sahrens 	svd->vdev_ms_count = 0;
491fa9e4066Sahrens 
492fa9e4066Sahrens 	tvd->vdev_mg = svd->vdev_mg;
493fa9e4066Sahrens 	tvd->vdev_mg->mg_vd = tvd;
494fa9e4066Sahrens 	tvd->vdev_ms = svd->vdev_ms;
495fa9e4066Sahrens 	tvd->vdev_smo = svd->vdev_smo;
496fa9e4066Sahrens 
497fa9e4066Sahrens 	svd->vdev_mg = NULL;
498fa9e4066Sahrens 	svd->vdev_ms = NULL;
499fa9e4066Sahrens 	svd->vdev_smo = NULL;
500fa9e4066Sahrens 
501fa9e4066Sahrens 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
502fa9e4066Sahrens 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
503fa9e4066Sahrens 
504fa9e4066Sahrens 	svd->vdev_stat.vs_alloc = 0;
505fa9e4066Sahrens 	svd->vdev_stat.vs_space = 0;
506fa9e4066Sahrens 
507fa9e4066Sahrens 	for (t = 0; t < TXG_SIZE; t++) {
508fa9e4066Sahrens 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
509fa9e4066Sahrens 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
510fa9e4066Sahrens 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
511fa9e4066Sahrens 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
512fa9e4066Sahrens 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
513fa9e4066Sahrens 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
514fa9e4066Sahrens 		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
515fa9e4066Sahrens 		svd->vdev_dirty[t] = 0;
516fa9e4066Sahrens 	}
517fa9e4066Sahrens 
518fa9e4066Sahrens 	if (svd->vdev_is_dirty) {
519fa9e4066Sahrens 		vdev_config_clean(svd);
520fa9e4066Sahrens 		vdev_config_dirty(tvd);
521fa9e4066Sahrens 	}
522fa9e4066Sahrens 
523fa9e4066Sahrens 	ASSERT(svd->vdev_io_retry == NULL);
524fa9e4066Sahrens 	ASSERT(list_is_empty(&svd->vdev_io_pending));
525fa9e4066Sahrens }
526fa9e4066Sahrens 
527fa9e4066Sahrens static void
528fa9e4066Sahrens vdev_top_update(vdev_t *tvd, vdev_t *vd)
529fa9e4066Sahrens {
530fa9e4066Sahrens 	int c;
531fa9e4066Sahrens 
532fa9e4066Sahrens 	if (vd == NULL)
533fa9e4066Sahrens 		return;
534fa9e4066Sahrens 
535fa9e4066Sahrens 	vd->vdev_top = tvd;
536fa9e4066Sahrens 
537fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
538fa9e4066Sahrens 		vdev_top_update(tvd, vd->vdev_child[c]);
539fa9e4066Sahrens }
540fa9e4066Sahrens 
541fa9e4066Sahrens /*
542fa9e4066Sahrens  * Add a mirror/replacing vdev above an existing vdev.
543fa9e4066Sahrens  */
544fa9e4066Sahrens vdev_t *
545fa9e4066Sahrens vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
546fa9e4066Sahrens {
547fa9e4066Sahrens 	spa_t *spa = cvd->vdev_spa;
548fa9e4066Sahrens 	vdev_t *pvd = cvd->vdev_parent;
549fa9e4066Sahrens 	vdev_t *mvd;
550fa9e4066Sahrens 
551fa9e4066Sahrens 	ASSERT(spa_config_held(spa, RW_WRITER));
552fa9e4066Sahrens 
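	/*
	 * The new interior vdev takes over cvd's slot in the original
	 * parent; cvd then becomes child 0 of the new vdev.
	 */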
553fa9e4066Sahrens 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
554fa9e4066Sahrens 	vdev_remove_child(pvd, cvd);
555fa9e4066Sahrens 	vdev_add_child(pvd, mvd);
556fa9e4066Sahrens 	cvd->vdev_id = mvd->vdev_children;
557fa9e4066Sahrens 	vdev_add_child(mvd, cvd);
558fa9e4066Sahrens 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
559fa9e4066Sahrens 
560fa9e4066Sahrens 	mvd->vdev_asize = cvd->vdev_asize;
561fa9e4066Sahrens 	mvd->vdev_ashift = cvd->vdev_ashift;
562fa9e4066Sahrens 	mvd->vdev_state = cvd->vdev_state;
563fa9e4066Sahrens 
564fa9e4066Sahrens 	if (mvd == mvd->vdev_top)
565fa9e4066Sahrens 		vdev_top_transfer(cvd, mvd);
566fa9e4066Sahrens 
567fa9e4066Sahrens 	return (mvd);
568fa9e4066Sahrens }
569fa9e4066Sahrens 
570fa9e4066Sahrens /*
571fa9e4066Sahrens  * Remove a 1-way mirror/replacing vdev from the tree.
572fa9e4066Sahrens  */
573fa9e4066Sahrens void
574fa9e4066Sahrens vdev_remove_parent(vdev_t *cvd)
575fa9e4066Sahrens {
576fa9e4066Sahrens 	vdev_t *mvd = cvd->vdev_parent;
577fa9e4066Sahrens 	vdev_t *pvd = mvd->vdev_parent;
578fa9e4066Sahrens 
579fa9e4066Sahrens 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
580fa9e4066Sahrens 
581fa9e4066Sahrens 	ASSERT(mvd->vdev_children == 1);
582fa9e4066Sahrens 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
583fa9e4066Sahrens 	    mvd->vdev_ops == &vdev_replacing_ops);
584fa9e4066Sahrens 
585fa9e4066Sahrens 	vdev_remove_child(mvd, cvd);
586fa9e4066Sahrens 	vdev_remove_child(pvd, mvd);
587fa9e4066Sahrens 	cvd->vdev_id = mvd->vdev_id;
588fa9e4066Sahrens 	vdev_add_child(pvd, cvd);
589fa9e4066Sahrens 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
590fa9e4066Sahrens 
591fa9e4066Sahrens 	if (cvd == cvd->vdev_top)
592fa9e4066Sahrens 		vdev_top_transfer(mvd, cvd);
593fa9e4066Sahrens 
594fa9e4066Sahrens 	ASSERT(mvd->vdev_children == 0);
595fa9e4066Sahrens 	vdev_free(mvd);
596fa9e4066Sahrens }
597fa9e4066Sahrens 
598fa9e4066Sahrens void
599fa9e4066Sahrens vdev_metaslab_init(vdev_t *vd, uint64_t txg)
600fa9e4066Sahrens {
601fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
602fa9e4066Sahrens 	metaslab_class_t *mc = spa_metaslab_class_select(spa);
603fa9e4066Sahrens 	uint64_t c;
604fa9e4066Sahrens 	uint64_t oldc = vd->vdev_ms_count;
605fa9e4066Sahrens 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
606fa9e4066Sahrens 	space_map_obj_t *smo = vd->vdev_smo;
607fa9e4066Sahrens 	metaslab_t **mspp = vd->vdev_ms;
608fa9e4066Sahrens 
609fa9e4066Sahrens 	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
610fa9e4066Sahrens 
611fa9e4066Sahrens 	ASSERT(oldc <= newc);
612fa9e4066Sahrens 
613fa9e4066Sahrens 	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
614fa9e4066Sahrens 	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
615fa9e4066Sahrens 	vd->vdev_ms_count = newc;
616fa9e4066Sahrens 
617fa9e4066Sahrens 	if (vd->vdev_mg == NULL) {
618fa9e4066Sahrens 		if (txg == 0) {
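			/*
			 * A txg of 0 means we are loading an existing
			 * vdev (see vdev_load()), so read the persistent
			 * metaslab array and each space map header from
			 * its object's bonus buffer.
			 */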
619fa9e4066Sahrens 			dmu_buf_t *db;
620fa9e4066Sahrens 			uint64_t *ms_array;
621fa9e4066Sahrens 
622fa9e4066Sahrens 			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
623fa9e4066Sahrens 			    KM_SLEEP);
624fa9e4066Sahrens 
625fa9e4066Sahrens 			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
626fa9e4066Sahrens 			    0, newc * sizeof (uint64_t), ms_array);
627fa9e4066Sahrens 
628fa9e4066Sahrens 			for (c = 0; c < newc; c++) {
629fa9e4066Sahrens 				if (ms_array[c] == 0)
630fa9e4066Sahrens 					continue;
631fa9e4066Sahrens 				db = dmu_bonus_hold(spa->spa_meta_objset,
632fa9e4066Sahrens 				    ms_array[c]);
633fa9e4066Sahrens 				dmu_buf_read(db);
634fa9e4066Sahrens 				ASSERT3U(db->db_size, ==, sizeof (*smo));
635fa9e4066Sahrens 				bcopy(db->db_data, &vd->vdev_smo[c],
636fa9e4066Sahrens 				    db->db_size);
637fa9e4066Sahrens 				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
638fa9e4066Sahrens 				    ms_array[c]);
639fa9e4066Sahrens 				dmu_buf_rele(db);
640fa9e4066Sahrens 			}
641fa9e4066Sahrens 			kmem_free(ms_array, newc * sizeof (uint64_t));
642fa9e4066Sahrens 		}
643fa9e4066Sahrens 		vd->vdev_mg = metaslab_group_create(mc, vd);
644fa9e4066Sahrens 	}
645fa9e4066Sahrens 
646fa9e4066Sahrens 	for (c = 0; c < oldc; c++) {
647fa9e4066Sahrens 		vd->vdev_smo[c] = smo[c];
648fa9e4066Sahrens 		vd->vdev_ms[c] = mspp[c];
649fa9e4066Sahrens 		mspp[c]->ms_smo = &vd->vdev_smo[c];
650fa9e4066Sahrens 	}
651fa9e4066Sahrens 
652fa9e4066Sahrens 	for (c = oldc; c < newc; c++)
653fa9e4066Sahrens 		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
654fa9e4066Sahrens 		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
655fa9e4066Sahrens 
656fa9e4066Sahrens 	if (oldc != 0) {
657fa9e4066Sahrens 		kmem_free(smo, oldc * sizeof (*smo));
658fa9e4066Sahrens 		kmem_free(mspp, oldc * sizeof (*mspp));
659fa9e4066Sahrens 	}
660fa9e4066Sahrens 
661fa9e4066Sahrens }
662fa9e4066Sahrens 
663fa9e4066Sahrens void
664fa9e4066Sahrens vdev_metaslab_fini(vdev_t *vd)
665fa9e4066Sahrens {
666fa9e4066Sahrens 	uint64_t m;
667fa9e4066Sahrens 	uint64_t count = vd->vdev_ms_count;
668fa9e4066Sahrens 
669fa9e4066Sahrens 	if (vd->vdev_ms != NULL) {
670fa9e4066Sahrens 		for (m = 0; m < count; m++)
671fa9e4066Sahrens 			metaslab_fini(vd->vdev_ms[m]);
672fa9e4066Sahrens 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
673fa9e4066Sahrens 		vd->vdev_ms = NULL;
674fa9e4066Sahrens 	}
675fa9e4066Sahrens 
676fa9e4066Sahrens 	if (vd->vdev_smo != NULL) {
677fa9e4066Sahrens 		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
678fa9e4066Sahrens 		vd->vdev_smo = NULL;
679fa9e4066Sahrens 	}
680fa9e4066Sahrens }
681fa9e4066Sahrens 
682fa9e4066Sahrens /*
683fa9e4066Sahrens  * Prepare a virtual device for access.
684fa9e4066Sahrens  */
685fa9e4066Sahrens int
686fa9e4066Sahrens vdev_open(vdev_t *vd)
687fa9e4066Sahrens {
688fa9e4066Sahrens 	int error;
689fa9e4066Sahrens 	vdev_knob_t *vk;
690fa9e4066Sahrens 	int c;
691fa9e4066Sahrens 	uint64_t osize = 0;
692fa9e4066Sahrens 	uint64_t asize, psize;
693fa9e4066Sahrens 	uint64_t ashift = -1ULL;
694fa9e4066Sahrens 
695fa9e4066Sahrens 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
696fa9e4066Sahrens 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
697fa9e4066Sahrens 	    vd->vdev_state == VDEV_STATE_OFFLINE);
698fa9e4066Sahrens 
699fa9e4066Sahrens 	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
700fa9e4066Sahrens 		vd->vdev_fault_arg >>= 1;
701fa9e4066Sahrens 	else
702fa9e4066Sahrens 		vd->vdev_fault_mode = VDEV_FAULT_NONE;
703fa9e4066Sahrens 
704fa9e4066Sahrens 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
705fa9e4066Sahrens 
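	/*
	 * Reset each vdev knob to its default value, clamped to its
	 * [min, max] range.
	 */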
706fa9e4066Sahrens 	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
707fa9e4066Sahrens 		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
708fa9e4066Sahrens 
709fa9e4066Sahrens 		*valp = vk->vk_default;
710fa9e4066Sahrens 		*valp = MAX(*valp, vk->vk_min);
711fa9e4066Sahrens 		*valp = MIN(*valp, vk->vk_max);
712fa9e4066Sahrens 	}
713fa9e4066Sahrens 
714fa9e4066Sahrens 	if (vd->vdev_ops->vdev_op_leaf) {
715fa9e4066Sahrens 		vdev_cache_init(vd);
716fa9e4066Sahrens 		vdev_queue_init(vd);
717fa9e4066Sahrens 		vd->vdev_cache_active = B_TRUE;
718fa9e4066Sahrens 	}
719fa9e4066Sahrens 
720fa9e4066Sahrens 	if (vd->vdev_offline) {
721fa9e4066Sahrens 		ASSERT(vd->vdev_children == 0);
722fa9e4066Sahrens 		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
723fa9e4066Sahrens 		vd->vdev_state = VDEV_STATE_OFFLINE;
724fa9e4066Sahrens 		return (ENXIO);
725fa9e4066Sahrens 	}
726fa9e4066Sahrens 
727fa9e4066Sahrens 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
728fa9e4066Sahrens 
729fa9e4066Sahrens 	dprintf("%s = %d, osize %llu, state = %d\n",
730fa9e4066Sahrens 	    vdev_description(vd), error, osize, vd->vdev_state);
731fa9e4066Sahrens 
732fa9e4066Sahrens 	if (error) {
733fa9e4066Sahrens 		dprintf("%s in %s failed to open, error %d, aux %d\n",
734fa9e4066Sahrens 		    vdev_description(vd),
735fa9e4066Sahrens 		    vdev_description(vd->vdev_parent),
736fa9e4066Sahrens 		    error,
737fa9e4066Sahrens 		    vd->vdev_stat.vs_aux);
738fa9e4066Sahrens 
739fa9e4066Sahrens 		vd->vdev_state = VDEV_STATE_CANT_OPEN;
740fa9e4066Sahrens 		return (error);
741fa9e4066Sahrens 	}
742fa9e4066Sahrens 
743fa9e4066Sahrens 	vd->vdev_state = VDEV_STATE_HEALTHY;
744fa9e4066Sahrens 
745fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
746fa9e4066Sahrens 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
747fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_DEGRADED;
748fa9e4066Sahrens 
749fa9e4066Sahrens 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
750fa9e4066Sahrens 
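	/*
	 * For a leaf vdev, psize is the raw device size and asize is what
	 * remains after reserving the front and back label regions.  For
	 * an interior vdev, the size reported by its ops vector already
	 * accounts for the children, so use it directly; psize is unknown.
	 */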
751fa9e4066Sahrens 	if (vd->vdev_children == 0) {
752fa9e4066Sahrens 		if (osize < SPA_MINDEVSIZE) {
753fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
754fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
755fa9e4066Sahrens 			return (EOVERFLOW);
756fa9e4066Sahrens 		}
757fa9e4066Sahrens 		psize = osize;
758fa9e4066Sahrens 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
759fa9e4066Sahrens 	} else {
760fa9e4066Sahrens 		if (osize < SPA_MINDEVSIZE -
761fa9e4066Sahrens 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
762fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
763fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
764fa9e4066Sahrens 			return (EOVERFLOW);
765fa9e4066Sahrens 		}
766fa9e4066Sahrens 		psize = 0;
767fa9e4066Sahrens 		asize = osize;
768fa9e4066Sahrens 	}
769fa9e4066Sahrens 
770fa9e4066Sahrens 	vd->vdev_psize = psize;
771fa9e4066Sahrens 
772fa9e4066Sahrens 	if (vd->vdev_asize == 0) {
773fa9e4066Sahrens 		/*
774fa9e4066Sahrens 		 * This is the first-ever open, so use the computed values.
775fa9e4066Sahrens 		 */
776fa9e4066Sahrens 		vd->vdev_asize = asize;
777fa9e4066Sahrens 		vd->vdev_ashift = ashift;
778fa9e4066Sahrens 	} else {
779fa9e4066Sahrens 		/*
780fa9e4066Sahrens 		 * Make sure the alignment requirement hasn't increased.
781fa9e4066Sahrens 		 */
782fa9e4066Sahrens 		if (ashift > vd->vdev_ashift) {
783fa9e4066Sahrens 			dprintf("%s: ashift grew\n", vdev_description(vd));
784fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
785fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
786fa9e4066Sahrens 			return (EINVAL);
787fa9e4066Sahrens 		}
788fa9e4066Sahrens 
789fa9e4066Sahrens 		/*
790fa9e4066Sahrens 		 * Make sure the device hasn't shrunk.
791fa9e4066Sahrens 		 */
792fa9e4066Sahrens 		if (asize < vd->vdev_asize) {
793fa9e4066Sahrens 			dprintf("%s: device shrank\n", vdev_description(vd));
794fa9e4066Sahrens 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
795fa9e4066Sahrens 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
796fa9e4066Sahrens 			return (EINVAL);
797fa9e4066Sahrens 		}
798fa9e4066Sahrens 
799fa9e4066Sahrens 		/*
800fa9e4066Sahrens 		 * If all children are healthy and the asize has increased,
801fa9e4066Sahrens 		 * then we've experienced dynamic LUN growth.
802fa9e4066Sahrens 		 */
803fa9e4066Sahrens 		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
804fa9e4066Sahrens 		    asize > vd->vdev_asize) {
805fa9e4066Sahrens 			dprintf("%s: device grew\n", vdev_description(vd));
806fa9e4066Sahrens 			vd->vdev_asize = asize;
807fa9e4066Sahrens 		}
808fa9e4066Sahrens 	}
809fa9e4066Sahrens 
810fa9e4066Sahrens 	return (0);
811fa9e4066Sahrens }
812fa9e4066Sahrens 
813fa9e4066Sahrens /*
814fa9e4066Sahrens  * Close a virtual device.
815fa9e4066Sahrens  */
816fa9e4066Sahrens void
817fa9e4066Sahrens vdev_close(vdev_t *vd)
818fa9e4066Sahrens {
819fa9e4066Sahrens 	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
820fa9e4066Sahrens 
821fa9e4066Sahrens 	vd->vdev_ops->vdev_op_close(vd);
822fa9e4066Sahrens 
823fa9e4066Sahrens 	if (vd->vdev_cache_active) {
824fa9e4066Sahrens 		vdev_cache_fini(vd);
825fa9e4066Sahrens 		vdev_queue_fini(vd);
826fa9e4066Sahrens 		vd->vdev_cache_active = B_FALSE;
827fa9e4066Sahrens 	}
828fa9e4066Sahrens 
829fa9e4066Sahrens 	if (vd->vdev_offline)
830fa9e4066Sahrens 		vd->vdev_state = VDEV_STATE_OFFLINE;
831fa9e4066Sahrens 	else
832fa9e4066Sahrens 		vd->vdev_state = VDEV_STATE_CLOSED;
833fa9e4066Sahrens }
834fa9e4066Sahrens 
835fa9e4066Sahrens void
836fa9e4066Sahrens vdev_reopen(vdev_t *vd, zio_t **rq)
837fa9e4066Sahrens {
838fa9e4066Sahrens 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
839fa9e4066Sahrens 	int c;
840fa9e4066Sahrens 
841fa9e4066Sahrens 	if (vd == rvd) {
842fa9e4066Sahrens 		ASSERT(rq == NULL);
843fa9e4066Sahrens 		for (c = 0; c < rvd->vdev_children; c++)
844fa9e4066Sahrens 			vdev_reopen(rvd->vdev_child[c], NULL);
845fa9e4066Sahrens 		return;
846fa9e4066Sahrens 	}
847fa9e4066Sahrens 
848fa9e4066Sahrens 	/* only valid for top-level vdevs */
849fa9e4066Sahrens 	ASSERT3P(vd, ==, vd->vdev_top);
850fa9e4066Sahrens 
851fa9e4066Sahrens 	/*
852fa9e4066Sahrens 	 * vdev_state can change when spa_config_lock is held as writer,
853fa9e4066Sahrens 	 * or when it's held as reader and we're doing a vdev_reopen().
854fa9e4066Sahrens 	 * To handle the latter case, we grab rvd's io_lock to serialize
855fa9e4066Sahrens 	 * reopens.  This ensures that there's never more than one vdev
856fa9e4066Sahrens 	 * state changer active at a time.
857fa9e4066Sahrens 	 */
858fa9e4066Sahrens 	mutex_enter(&rvd->vdev_io_lock);
859fa9e4066Sahrens 
860fa9e4066Sahrens 	mutex_enter(&vd->vdev_io_lock);
861fa9e4066Sahrens 	while (list_head(&vd->vdev_io_pending) != NULL)
862fa9e4066Sahrens 		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
863fa9e4066Sahrens 	vdev_close(vd);
864fa9e4066Sahrens 	(void) vdev_open(vd);
865fa9e4066Sahrens 	if (rq != NULL) {
866fa9e4066Sahrens 		*rq = vd->vdev_io_retry;
867fa9e4066Sahrens 		vd->vdev_io_retry = NULL;
868fa9e4066Sahrens 	}
869fa9e4066Sahrens 	mutex_exit(&vd->vdev_io_lock);
870fa9e4066Sahrens 
871fa9e4066Sahrens 	/*
872fa9e4066Sahrens 	 * Reassess root vdev's health.
873fa9e4066Sahrens 	 */
874fa9e4066Sahrens 	rvd->vdev_state = VDEV_STATE_HEALTHY;
875fa9e4066Sahrens 	for (c = 0; c < rvd->vdev_children; c++) {
876fa9e4066Sahrens 		uint64_t state = rvd->vdev_child[c]->vdev_state;
877fa9e4066Sahrens 		rvd->vdev_state = MIN(rvd->vdev_state, state);
878fa9e4066Sahrens 	}
879fa9e4066Sahrens 
880fa9e4066Sahrens 	mutex_exit(&rvd->vdev_io_lock);
881fa9e4066Sahrens }
882fa9e4066Sahrens 
883fa9e4066Sahrens int
884fa9e4066Sahrens vdev_create(vdev_t *vd, uint64_t txg)
885fa9e4066Sahrens {
886fa9e4066Sahrens 	int error;
887fa9e4066Sahrens 
888fa9e4066Sahrens 	/*
889fa9e4066Sahrens 	 * Normally, partial opens (e.g. of a mirror) are allowed.
890fa9e4066Sahrens 	 * For a create, however, we want to fail the request if
891fa9e4066Sahrens 	 * there are any components we can't open.
892fa9e4066Sahrens 	 */
893fa9e4066Sahrens 	error = vdev_open(vd);
894fa9e4066Sahrens 
895fa9e4066Sahrens 	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
896fa9e4066Sahrens 		vdev_close(vd);
897fa9e4066Sahrens 		return (error ? error : ENXIO);
898fa9e4066Sahrens 	}
899fa9e4066Sahrens 
900fa9e4066Sahrens 	/*
901fa9e4066Sahrens 	 * Recursively initialize all labels.
902fa9e4066Sahrens 	 */
903fa9e4066Sahrens 	if ((error = vdev_label_init(vd, txg)) != 0) {
904fa9e4066Sahrens 		vdev_close(vd);
905fa9e4066Sahrens 		return (error);
906fa9e4066Sahrens 	}
907fa9e4066Sahrens 
908fa9e4066Sahrens 	return (0);
909fa9e4066Sahrens }
910fa9e4066Sahrens 
911fa9e4066Sahrens /*
912fa9e4066Sahrens  * This is the latter half of vdev_create().  It is distinct because it
913fa9e4066Sahrens  * involves initiating transactions in order to do metaslab creation.
914fa9e4066Sahrens  * For creation, we want to try to create all vdevs at once and then undo it
915fa9e4066Sahrens  * if anything fails; this is much harder if we have pending transactions.
916fa9e4066Sahrens  */
917fa9e4066Sahrens void
918fa9e4066Sahrens vdev_init(vdev_t *vd, uint64_t txg)
919fa9e4066Sahrens {
920fa9e4066Sahrens 	/*
921fa9e4066Sahrens 	 * Aim for roughly 200 metaslabs per vdev.
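	 * The shift is clamped below so that no metaslab is smaller than
	 * SPA_MAXBLOCKSIZE.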
922fa9e4066Sahrens 	 */
923fa9e4066Sahrens 	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
924fa9e4066Sahrens 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
925fa9e4066Sahrens 
926fa9e4066Sahrens 	/*
927fa9e4066Sahrens 	 * Initialize the vdev's metaslabs.
928fa9e4066Sahrens 	 */
929fa9e4066Sahrens 	vdev_metaslab_init(vd, txg);
930fa9e4066Sahrens }
931fa9e4066Sahrens 
932fa9e4066Sahrens void
933fa9e4066Sahrens vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
934fa9e4066Sahrens {
935fa9e4066Sahrens 	vdev_t *tvd = vd->vdev_top;
936fa9e4066Sahrens 
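	/*
	 * Dirty flags are kept per top-level vdev in a TXG_SIZE-deep ring;
	 * whenever a new flag bit is set for this txg, the top-level vdev
	 * is also added to the spa's per-txg sync list.
	 */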
937fa9e4066Sahrens 	mutex_enter(&tvd->vdev_dirty_lock);
938fa9e4066Sahrens 	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
939fa9e4066Sahrens 		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
940fa9e4066Sahrens 		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
941fa9e4066Sahrens 		    tvd, txg);
942fa9e4066Sahrens 	}
943fa9e4066Sahrens 	mutex_exit(&tvd->vdev_dirty_lock);
944fa9e4066Sahrens }
945fa9e4066Sahrens 
946fa9e4066Sahrens void
947fa9e4066Sahrens vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
948fa9e4066Sahrens {
949fa9e4066Sahrens 	mutex_enter(sm->sm_lock);
950fa9e4066Sahrens 	if (!space_map_contains(sm, txg, size))
951fa9e4066Sahrens 		space_map_add(sm, txg, size);
952fa9e4066Sahrens 	mutex_exit(sm->sm_lock);
953fa9e4066Sahrens }
954fa9e4066Sahrens 
955fa9e4066Sahrens int
956fa9e4066Sahrens vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
957fa9e4066Sahrens {
958fa9e4066Sahrens 	int dirty;
959fa9e4066Sahrens 
960fa9e4066Sahrens 	/*
961fa9e4066Sahrens 	 * Quick test without the lock -- covers the common case that
962fa9e4066Sahrens 	 * there are no dirty time segments.
963fa9e4066Sahrens 	 */
964fa9e4066Sahrens 	if (sm->sm_space == 0)
965fa9e4066Sahrens 		return (0);
966fa9e4066Sahrens 
967fa9e4066Sahrens 	mutex_enter(sm->sm_lock);
968fa9e4066Sahrens 	dirty = space_map_contains(sm, txg, size);
969fa9e4066Sahrens 	mutex_exit(sm->sm_lock);
970fa9e4066Sahrens 
971fa9e4066Sahrens 	return (dirty);
972fa9e4066Sahrens }
973fa9e4066Sahrens 
974fa9e4066Sahrens /*
975fa9e4066Sahrens  * Reassess DTLs after a config change or scrub completion.
976fa9e4066Sahrens  */
977fa9e4066Sahrens void
978fa9e4066Sahrens vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
979fa9e4066Sahrens {
980fa9e4066Sahrens 	int c;
981fa9e4066Sahrens 
982fa9e4066Sahrens 	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
983fa9e4066Sahrens 
984fa9e4066Sahrens 	if (vd->vdev_children == 0) {
985fa9e4066Sahrens 		mutex_enter(&vd->vdev_dtl_lock);
986fa9e4066Sahrens 		/*
987fa9e4066Sahrens 		 * We've successfully scrubbed everything up to scrub_txg.
988fa9e4066Sahrens 		 * Therefore, excise all old DTLs up to that point, then
989fa9e4066Sahrens 		 * fold in the DTLs for everything we couldn't scrub.
990fa9e4066Sahrens 		 */
991fa9e4066Sahrens 		if (scrub_txg != 0) {
992fa9e4066Sahrens 			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
993fa9e4066Sahrens 			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
994fa9e4066Sahrens 		}
995fa9e4066Sahrens 		if (scrub_done)
996fa9e4066Sahrens 			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
997fa9e4066Sahrens 		mutex_exit(&vd->vdev_dtl_lock);
998fa9e4066Sahrens 		if (txg != 0) {
999fa9e4066Sahrens 			vdev_t *tvd = vd->vdev_top;
1000fa9e4066Sahrens 			vdev_dirty(tvd, VDD_DTL, txg);
1001fa9e4066Sahrens 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1002fa9e4066Sahrens 		}
1003fa9e4066Sahrens 		return;
1004fa9e4066Sahrens 	}
1005fa9e4066Sahrens 
1006fa9e4066Sahrens 	mutex_enter(&vd->vdev_dtl_lock);
1007fa9e4066Sahrens 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
1008fa9e4066Sahrens 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
1009fa9e4066Sahrens 	mutex_exit(&vd->vdev_dtl_lock);
1010fa9e4066Sahrens 
1011fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++) {
1012fa9e4066Sahrens 		vdev_t *cvd = vd->vdev_child[c];
1013fa9e4066Sahrens 		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
1014fa9e4066Sahrens 		mutex_enter(&vd->vdev_dtl_lock);
1015fa9e4066Sahrens 		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
1016fa9e4066Sahrens 		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
1017fa9e4066Sahrens 		mutex_exit(&vd->vdev_dtl_lock);
1018fa9e4066Sahrens 	}
1019fa9e4066Sahrens }
1020fa9e4066Sahrens 
1021fa9e4066Sahrens static int
1022fa9e4066Sahrens vdev_dtl_load(vdev_t *vd)
1023fa9e4066Sahrens {
1024fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1025fa9e4066Sahrens 	space_map_obj_t *smo = &vd->vdev_dtl;
1026fa9e4066Sahrens 	dmu_buf_t *db;
1027fa9e4066Sahrens 	int error;
1028fa9e4066Sahrens 
1029fa9e4066Sahrens 	ASSERT(vd->vdev_children == 0);
1030fa9e4066Sahrens 
1031fa9e4066Sahrens 	if (smo->smo_object == 0)
1032fa9e4066Sahrens 		return (0);
1033fa9e4066Sahrens 
1034fa9e4066Sahrens 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1035fa9e4066Sahrens 	dmu_buf_read(db);
1036fa9e4066Sahrens 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1037fa9e4066Sahrens 	bcopy(db->db_data, smo, db->db_size);
1038fa9e4066Sahrens 	dmu_buf_rele(db);
1039fa9e4066Sahrens 
1040fa9e4066Sahrens 	mutex_enter(&vd->vdev_dtl_lock);
1041fa9e4066Sahrens 	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
1042fa9e4066Sahrens 	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
1043fa9e4066Sahrens 	mutex_exit(&vd->vdev_dtl_lock);
1044fa9e4066Sahrens 
1045fa9e4066Sahrens 	return (error);
1046fa9e4066Sahrens }
1047fa9e4066Sahrens 
1048fa9e4066Sahrens void
1049fa9e4066Sahrens vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1050fa9e4066Sahrens {
1051fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1052fa9e4066Sahrens 	space_map_obj_t *smo = &vd->vdev_dtl;
1053fa9e4066Sahrens 	space_map_t *sm = &vd->vdev_dtl_map;
1054fa9e4066Sahrens 	space_map_t smsync;
1055fa9e4066Sahrens 	kmutex_t smlock;
1056fa9e4066Sahrens 	avl_tree_t *t = &sm->sm_root;
1057fa9e4066Sahrens 	space_seg_t *ss;
1058fa9e4066Sahrens 	dmu_buf_t *db;
1059fa9e4066Sahrens 	dmu_tx_t *tx;
1060fa9e4066Sahrens 
1061fa9e4066Sahrens 	dprintf("%s in txg %llu pass %d\n",
1062fa9e4066Sahrens 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1063fa9e4066Sahrens 
1064fa9e4066Sahrens 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1065fa9e4066Sahrens 
1066fa9e4066Sahrens 	if (vd->vdev_detached) {
1067fa9e4066Sahrens 		if (smo->smo_object != 0) {
1068fa9e4066Sahrens 			int err = dmu_object_free(spa->spa_meta_objset,
1069fa9e4066Sahrens 			    smo->smo_object, tx);
1070fa9e4066Sahrens 			ASSERT3U(err, ==, 0);
1071fa9e4066Sahrens 			smo->smo_object = 0;
1072fa9e4066Sahrens 		}
1073fa9e4066Sahrens 		dmu_tx_commit(tx);
1074fa9e4066Sahrens 		return;
1075fa9e4066Sahrens 	}
1076fa9e4066Sahrens 
1077fa9e4066Sahrens 	if (smo->smo_object == 0) {
1078fa9e4066Sahrens 		ASSERT(smo->smo_objsize == 0);
1079fa9e4066Sahrens 		ASSERT(smo->smo_alloc == 0);
1080fa9e4066Sahrens 		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
1081fa9e4066Sahrens 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1082fa9e4066Sahrens 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1083fa9e4066Sahrens 		ASSERT(smo->smo_object != 0);
1084fa9e4066Sahrens 		vdev_config_dirty(vd->vdev_top);
1085fa9e4066Sahrens 	}
1086fa9e4066Sahrens 
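	/*
	 * Rewrite the on-disk DTL from scratch: free the object's current
	 * contents, copy the in-core DTL into a private space map under
	 * its own lock, and sync that into the now-empty object.
	 */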
1087fa9e4066Sahrens 	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
1088fa9e4066Sahrens 	    0, smo->smo_objsize, tx);
1089fa9e4066Sahrens 
1090fa9e4066Sahrens 	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1091fa9e4066Sahrens 
1092fa9e4066Sahrens 	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1093fa9e4066Sahrens 	    &smlock);
1094fa9e4066Sahrens 
1095fa9e4066Sahrens 	mutex_enter(&smlock);
1096fa9e4066Sahrens 
1097fa9e4066Sahrens 	mutex_enter(&vd->vdev_dtl_lock);
1098fa9e4066Sahrens 	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
1099fa9e4066Sahrens 		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
1100fa9e4066Sahrens 	mutex_exit(&vd->vdev_dtl_lock);
1101fa9e4066Sahrens 
1102fa9e4066Sahrens 	smo->smo_objsize = 0;
1103fa9e4066Sahrens 	smo->smo_alloc = smsync.sm_space;
1104fa9e4066Sahrens 
1105fa9e4066Sahrens 	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
1106fa9e4066Sahrens 	space_map_destroy(&smsync);
1107fa9e4066Sahrens 
1108fa9e4066Sahrens 	mutex_exit(&smlock);
1109fa9e4066Sahrens 	mutex_destroy(&smlock);
1110fa9e4066Sahrens 
1111fa9e4066Sahrens 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1112fa9e4066Sahrens 	dmu_buf_will_dirty(db, tx);
1113fa9e4066Sahrens 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1114fa9e4066Sahrens 	bcopy(smo, db->db_data, db->db_size);
1115fa9e4066Sahrens 	dmu_buf_rele(db);
1116fa9e4066Sahrens 
1117fa9e4066Sahrens 	dmu_tx_commit(tx);
1118fa9e4066Sahrens }
1119fa9e4066Sahrens 
1120fa9e4066Sahrens int
1121fa9e4066Sahrens vdev_load(vdev_t *vd, int import)
1122fa9e4066Sahrens {
1123fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1124fa9e4066Sahrens 	int c, error;
1125fa9e4066Sahrens 	nvlist_t *label;
1126fa9e4066Sahrens 	uint64_t guid, state;
1127fa9e4066Sahrens 
1128fa9e4066Sahrens 	dprintf("loading %s\n", vdev_description(vd));
1129fa9e4066Sahrens 
1130fa9e4066Sahrens 	/*
1131fa9e4066Sahrens 	 * Recursively load all children.
1132fa9e4066Sahrens 	 */
1133fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
1134fa9e4066Sahrens 		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
1135fa9e4066Sahrens 			return (error);
1136fa9e4066Sahrens 
1137fa9e4066Sahrens 	/*
1138fa9e4066Sahrens 	 * If this is a leaf vdev, make sure it agrees with its disk labels.
1139fa9e4066Sahrens 	 */
1140fa9e4066Sahrens 	if (vd->vdev_ops->vdev_op_leaf) {
1141fa9e4066Sahrens 
1142fa9e4066Sahrens 		if (vdev_is_dead(vd))
1143fa9e4066Sahrens 			return (0);
1144fa9e4066Sahrens 
1145fa9e4066Sahrens 		/*
1146fa9e4066Sahrens 		 * XXX state transitions don't propagate to parent here.
1147fa9e4066Sahrens 		 * Also, merely setting the state isn't sufficient because
1148fa9e4066Sahrens 		 * it's not persistent; a vdev_reopen() would make us
1149fa9e4066Sahrens 		 * forget all about it.
1150fa9e4066Sahrens 		 */
1151fa9e4066Sahrens 		if ((label = vdev_label_read_config(vd)) == NULL) {
1152fa9e4066Sahrens 			dprintf("can't load label config\n");
1153fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1154fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1155fa9e4066Sahrens 			return (0);
1156fa9e4066Sahrens 		}
1157fa9e4066Sahrens 
1158fa9e4066Sahrens 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
1159fa9e4066Sahrens 		    &guid) != 0 || guid != spa_guid(spa)) {
1160fa9e4066Sahrens 			dprintf("bad or missing pool GUID (%llu)\n", guid);
1161fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1162fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1163fa9e4066Sahrens 			nvlist_free(label);
1164fa9e4066Sahrens 			return (0);
1165fa9e4066Sahrens 		}
1166fa9e4066Sahrens 
1167fa9e4066Sahrens 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
1168fa9e4066Sahrens 		    guid != vd->vdev_guid) {
1169fa9e4066Sahrens 			dprintf("bad or missing vdev guid (%llu != %llu)\n",
1170fa9e4066Sahrens 			    guid, vd->vdev_guid);
1171fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1172fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1173fa9e4066Sahrens 			nvlist_free(label);
1174fa9e4066Sahrens 			return (0);
1175fa9e4066Sahrens 		}
1176fa9e4066Sahrens 
1177fa9e4066Sahrens 		/*
1178fa9e4066Sahrens 		 * If we find a vdev with a matching pool guid and vdev guid,
1179fa9e4066Sahrens 		 * but the pool state is not active, it indicates that the user
1180fa9e4066Sahrens 		 * exported or destroyed the pool without affecting the config
1181fa9e4066Sahrens 		 * cache (if / was mounted readonly, for example).  In this
1182fa9e4066Sahrens 		 * case, immediately return EBADF so the caller can remove it
1183fa9e4066Sahrens 		 * from the config.
1184fa9e4066Sahrens 		 */
1185fa9e4066Sahrens 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1186fa9e4066Sahrens 		    &state)) {
1187fa9e4066Sahrens 			dprintf("missing pool state\n");
1188fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1189fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1190fa9e4066Sahrens 			nvlist_free(label);
1191fa9e4066Sahrens 			return (0);
1192fa9e4066Sahrens 		}
1193fa9e4066Sahrens 
1194fa9e4066Sahrens 		if (state != POOL_STATE_ACTIVE &&
1195fa9e4066Sahrens 		    (!import || state != POOL_STATE_EXPORTED)) {
1196fa9e4066Sahrens 			dprintf("pool state not active (%llu)\n", state);
1197fa9e4066Sahrens 			nvlist_free(label);
1198fa9e4066Sahrens 			return (EBADF);
1199fa9e4066Sahrens 		}
1200fa9e4066Sahrens 
1201fa9e4066Sahrens 		nvlist_free(label);
1202fa9e4066Sahrens 	}
1203fa9e4066Sahrens 
1204fa9e4066Sahrens 	/*
1205fa9e4066Sahrens 	 * If this is a top-level vdev, make sure its allocation parameters
1206fa9e4066Sahrens 	 * exist and initialize its metaslabs.
1207fa9e4066Sahrens 	 */
1208fa9e4066Sahrens 	if (vd == vd->vdev_top) {
1209fa9e4066Sahrens 
1210fa9e4066Sahrens 		if (vd->vdev_ms_array == 0 ||
1211fa9e4066Sahrens 		    vd->vdev_ms_shift == 0 ||
1212fa9e4066Sahrens 		    vd->vdev_ashift == 0 ||
1213fa9e4066Sahrens 		    vd->vdev_asize == 0) {
1214fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1215fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1216fa9e4066Sahrens 			return (0);
1217fa9e4066Sahrens 		}
1218fa9e4066Sahrens 
1219fa9e4066Sahrens 		vdev_metaslab_init(vd, 0);
1220fa9e4066Sahrens 	}
1221fa9e4066Sahrens 
1222fa9e4066Sahrens 	/*
1223fa9e4066Sahrens 	 * If this is a leaf vdev, load its DTL.
1224fa9e4066Sahrens 	 */
1225fa9e4066Sahrens 	if (vd->vdev_ops->vdev_op_leaf) {
1226fa9e4066Sahrens 		error = vdev_dtl_load(vd);
1227fa9e4066Sahrens 		if (error) {
1228fa9e4066Sahrens 			dprintf("can't load DTL for %s, error %d\n",
1229fa9e4066Sahrens 			    vdev_description(vd), error);
1230fa9e4066Sahrens 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1231fa9e4066Sahrens 			    VDEV_AUX_CORRUPT_DATA);
1232fa9e4066Sahrens 			return (0);
1233fa9e4066Sahrens 		}
1234fa9e4066Sahrens 	}
1235fa9e4066Sahrens 
1236fa9e4066Sahrens 	return (0);
1237fa9e4066Sahrens }
1238fa9e4066Sahrens 
1239fa9e4066Sahrens void
1240fa9e4066Sahrens vdev_sync_done(vdev_t *vd, uint64_t txg)
1241fa9e4066Sahrens {
1242fa9e4066Sahrens 	metaslab_t *msp;
1243fa9e4066Sahrens 
1244fa9e4066Sahrens 	dprintf("%s txg %llu\n", vdev_description(vd), txg);
1245fa9e4066Sahrens 
1246fa9e4066Sahrens 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
1247fa9e4066Sahrens 		metaslab_sync_done(msp, txg);
1248fa9e4066Sahrens }
1249fa9e4066Sahrens 
1250fa9e4066Sahrens void
1251fa9e4066Sahrens vdev_add_sync(vdev_t *vd, uint64_t txg)
1252fa9e4066Sahrens {
1253fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1254fa9e4066Sahrens 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1255fa9e4066Sahrens 
1256fa9e4066Sahrens 	ASSERT(vd == vd->vdev_top);
1257fa9e4066Sahrens 
1258fa9e4066Sahrens 	if (vd->vdev_ms_array == 0)
1259fa9e4066Sahrens 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
1260fa9e4066Sahrens 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
1261fa9e4066Sahrens 
1262fa9e4066Sahrens 	ASSERT(vd->vdev_ms_array != 0);
1263fa9e4066Sahrens 
1264fa9e4066Sahrens 	vdev_config_dirty(vd);
1265fa9e4066Sahrens 
1266fa9e4066Sahrens 	dmu_tx_commit(tx);
1267fa9e4066Sahrens }
1268fa9e4066Sahrens 
1269fa9e4066Sahrens void
1270fa9e4066Sahrens vdev_sync(vdev_t *vd, uint64_t txg)
1271fa9e4066Sahrens {
1272fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1273fa9e4066Sahrens 	vdev_t *lvd;
1274fa9e4066Sahrens 	metaslab_t *msp;
1275fa9e4066Sahrens 	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
1276fa9e4066Sahrens 	uint8_t dirty = *dirtyp;
1277fa9e4066Sahrens 
1278fa9e4066Sahrens 	mutex_enter(&vd->vdev_dirty_lock);
1279fa9e4066Sahrens 	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
1280fa9e4066Sahrens 	mutex_exit(&vd->vdev_dirty_lock);
1281fa9e4066Sahrens 
1282fa9e4066Sahrens 	dprintf("%s txg %llu pass %d\n",
1283fa9e4066Sahrens 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1284fa9e4066Sahrens 
1285fa9e4066Sahrens 	if (dirty & VDD_ADD)
1286fa9e4066Sahrens 		vdev_add_sync(vd, txg);
1287fa9e4066Sahrens 
1288fa9e4066Sahrens 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
1289fa9e4066Sahrens 		metaslab_sync(msp, txg);
1290fa9e4066Sahrens 
1291fa9e4066Sahrens 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
1292fa9e4066Sahrens 		vdev_dtl_sync(lvd, txg);
1293fa9e4066Sahrens 
1294fa9e4066Sahrens 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
1295fa9e4066Sahrens }
1296fa9e4066Sahrens 
1297fa9e4066Sahrens uint64_t
1298fa9e4066Sahrens vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
1299fa9e4066Sahrens {
1300fa9e4066Sahrens 	return (vd->vdev_ops->vdev_op_asize(vd, psize));
1301fa9e4066Sahrens }
1302fa9e4066Sahrens 
1303fa9e4066Sahrens void
1304fa9e4066Sahrens vdev_io_start(zio_t *zio)
1305fa9e4066Sahrens {
1306fa9e4066Sahrens 	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
1307fa9e4066Sahrens }
1308fa9e4066Sahrens 
1309fa9e4066Sahrens void
1310fa9e4066Sahrens vdev_io_done(zio_t *zio)
1311fa9e4066Sahrens {
1312fa9e4066Sahrens 	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
1313fa9e4066Sahrens }
1314fa9e4066Sahrens 
1315fa9e4066Sahrens const char *
1316fa9e4066Sahrens vdev_description(vdev_t *vd)
1317fa9e4066Sahrens {
1318fa9e4066Sahrens 	if (vd == NULL || vd->vdev_ops == NULL)
1319fa9e4066Sahrens 		return ("<unknown>");
1320fa9e4066Sahrens 
1321fa9e4066Sahrens 	if (vd->vdev_path != NULL)
1322fa9e4066Sahrens 		return (vd->vdev_path);
1323fa9e4066Sahrens 
1324fa9e4066Sahrens 	if (vd->vdev_parent == NULL)
1325fa9e4066Sahrens 		return (spa_name(vd->vdev_spa));
1326fa9e4066Sahrens 
1327fa9e4066Sahrens 	return (vd->vdev_ops->vdev_op_type);
1328fa9e4066Sahrens }
1329fa9e4066Sahrens 
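/*
 * Bring a device back online: clear its offline flag and error counts,
 * reopen its top-level vdev, and kick off a resilver to repair anything
 * that was written while the device was away.
 */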
1330fa9e4066Sahrens int
1331fa9e4066Sahrens vdev_online(spa_t *spa, const char *path)
1332fa9e4066Sahrens {
1333fa9e4066Sahrens 	vdev_t *vd;
1334fa9e4066Sahrens 
1335fa9e4066Sahrens 	spa_config_enter(spa, RW_WRITER);
1336fa9e4066Sahrens 
1337fa9e4066Sahrens 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1338fa9e4066Sahrens 		spa_config_exit(spa);
1339fa9e4066Sahrens 		return (ENODEV);
1340fa9e4066Sahrens 	}
1341fa9e4066Sahrens 
1342fa9e4066Sahrens 	dprintf("ONLINE: %s\n", vdev_description(vd));
1343fa9e4066Sahrens 
1344fa9e4066Sahrens 	vd->vdev_offline = B_FALSE;
1345fa9e4066Sahrens 
1346fa9e4066Sahrens 	/*
1347fa9e4066Sahrens 	 * Clear the error counts.  The idea is that you expect to see all
1348fa9e4066Sahrens 	 * zeroes when everything is working, so if you've just onlined a
1349fa9e4066Sahrens 	 * device, you don't want to keep hearing about errors from before.
1350fa9e4066Sahrens 	 */
1351fa9e4066Sahrens 	vd->vdev_stat.vs_read_errors = 0;
1352fa9e4066Sahrens 	vd->vdev_stat.vs_write_errors = 0;
1353fa9e4066Sahrens 	vd->vdev_stat.vs_checksum_errors = 0;
1354fa9e4066Sahrens 
1355fa9e4066Sahrens 	vdev_reopen(vd->vdev_top, NULL);
1356fa9e4066Sahrens 
1357fa9e4066Sahrens 	spa_config_exit(spa);
1358fa9e4066Sahrens 
1359fa9e4066Sahrens 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1360fa9e4066Sahrens 
1361fa9e4066Sahrens 	return (0);
1362fa9e4066Sahrens }
1363fa9e4066Sahrens 
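/*
 * Take a device offline, but only if the remaining devices in its
 * top-level vdev can still provide all of the data (i.e. the top-level
 * DTL is empty and the top-level vdev stays usable after the reopen).
 */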
1364fa9e4066Sahrens int
1365fa9e4066Sahrens vdev_offline(spa_t *spa, const char *path)
1366fa9e4066Sahrens {
1367fa9e4066Sahrens 	vdev_t *vd;
1368fa9e4066Sahrens 
1369fa9e4066Sahrens 	spa_config_enter(spa, RW_WRITER);
1370fa9e4066Sahrens 
1371fa9e4066Sahrens 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1372fa9e4066Sahrens 		spa_config_exit(spa);
1373fa9e4066Sahrens 		return (ENODEV);
1374fa9e4066Sahrens 	}
1375fa9e4066Sahrens 
1376fa9e4066Sahrens 	dprintf("OFFLINE: %s\n", vdev_description(vd));
1377fa9e4066Sahrens 
1378fa9e4066Sahrens 	/*
1379fa9e4066Sahrens 	 * If this device's top-level vdev has a non-empty DTL,
1380fa9e4066Sahrens 	 * don't allow the device to be offlined.
1381fa9e4066Sahrens 	 *
1382fa9e4066Sahrens 	 * XXX -- we should make this more precise by allowing the offline
1383fa9e4066Sahrens 	 * as long as the remaining devices don't have any DTL holes.
1384fa9e4066Sahrens 	 */
1385fa9e4066Sahrens 	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
1386fa9e4066Sahrens 		spa_config_exit(spa);
1387fa9e4066Sahrens 		return (EBUSY);
1388fa9e4066Sahrens 	}
1389fa9e4066Sahrens 
1390fa9e4066Sahrens 	/*
1391fa9e4066Sahrens 	 * Set this device to offline state and reopen its top-level vdev.
1392fa9e4066Sahrens 	 * If this action results in the top-level vdev becoming unusable,
1393fa9e4066Sahrens 	 * undo it and fail the request.
1394fa9e4066Sahrens 	 */
1395fa9e4066Sahrens 	vd->vdev_offline = B_TRUE;
1396fa9e4066Sahrens 	vdev_reopen(vd->vdev_top, NULL);
1397fa9e4066Sahrens 	if (vdev_is_dead(vd->vdev_top)) {
1398fa9e4066Sahrens 		vd->vdev_offline = B_FALSE;
1399fa9e4066Sahrens 		vdev_reopen(vd->vdev_top, NULL);
1400fa9e4066Sahrens 		spa_config_exit(spa);
1401fa9e4066Sahrens 		return (EBUSY);
1402fa9e4066Sahrens 	}
1403fa9e4066Sahrens 
1404fa9e4066Sahrens 	spa_config_exit(spa);
1405fa9e4066Sahrens 
1406fa9e4066Sahrens 	return (0);
1407fa9e4066Sahrens }
1408fa9e4066Sahrens 
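/*
 * Arm fault injection on the device at the given path; see
 * vdev_error_inject() for how the mode, mask, and arg are interpreted.
 */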
1409fa9e4066Sahrens int
1410fa9e4066Sahrens vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
1411fa9e4066Sahrens {
1412fa9e4066Sahrens 	vdev_t *vd;
1413fa9e4066Sahrens 
1414fa9e4066Sahrens 	spa_config_enter(spa, RW_WRITER);
1415fa9e4066Sahrens 
1416fa9e4066Sahrens 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1417fa9e4066Sahrens 		spa_config_exit(spa);
1418fa9e4066Sahrens 		return (ENODEV);
1419fa9e4066Sahrens 	}
1420fa9e4066Sahrens 
1421fa9e4066Sahrens 	vd->vdev_fault_mode = mode;
1422fa9e4066Sahrens 	vd->vdev_fault_mask = mask;
1423fa9e4066Sahrens 	vd->vdev_fault_arg = arg;
1424fa9e4066Sahrens 
1425fa9e4066Sahrens 	spa_config_exit(spa);
1426fa9e4066Sahrens 
1427fa9e4066Sahrens 	return (0);
1428fa9e4066Sahrens }
1429fa9e4066Sahrens 
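/*
 * A vdev is considered dead if it can't be opened.
 */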
1430fa9e4066Sahrens int
1431fa9e4066Sahrens vdev_is_dead(vdev_t *vd)
1432fa9e4066Sahrens {
1433fa9e4066Sahrens 	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
1434fa9e4066Sahrens }
1435fa9e4066Sahrens 
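/*
 * Decide whether to inject an error into the given zio, for I/O types
 * selected by the fault mask: VDEV_FAULT_RANDOM fails roughly one I/O
 * in 'arg', VDEV_FAULT_COUNT fails the next 'arg' I/Os.
 */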
1436fa9e4066Sahrens int
1437fa9e4066Sahrens vdev_error_inject(vdev_t *vd, zio_t *zio)
1438fa9e4066Sahrens {
1439fa9e4066Sahrens 	int error = 0;
1440fa9e4066Sahrens 
1441fa9e4066Sahrens 	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
1442fa9e4066Sahrens 		return (0);
1443fa9e4066Sahrens 
1444fa9e4066Sahrens 	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
1445fa9e4066Sahrens 		return (0);
1446fa9e4066Sahrens 
1447fa9e4066Sahrens 	switch (vd->vdev_fault_mode) {
1448fa9e4066Sahrens 	case VDEV_FAULT_RANDOM:
1449fa9e4066Sahrens 		if (spa_get_random(vd->vdev_fault_arg) == 0)
1450fa9e4066Sahrens 			error = EIO;
1451fa9e4066Sahrens 		break;
1452fa9e4066Sahrens 
1453fa9e4066Sahrens 	case VDEV_FAULT_COUNT:
1454fa9e4066Sahrens 		if ((int64_t)--vd->vdev_fault_arg <= 0)
1455fa9e4066Sahrens 			vd->vdev_fault_mode = VDEV_FAULT_NONE;
1456fa9e4066Sahrens 		error = EIO;
1457fa9e4066Sahrens 		break;
1458fa9e4066Sahrens 	}
1459fa9e4066Sahrens 
1460fa9e4066Sahrens 	if (error != 0) {
1461fa9e4066Sahrens 		dprintf("returning %d for type %d on %s state %d offset %llx\n",
1462fa9e4066Sahrens 		    error, zio->io_type, vdev_description(vd),
1463fa9e4066Sahrens 		    vd->vdev_state, (u_longlong_t)zio->io_offset);
1464fa9e4066Sahrens 	}
1465fa9e4066Sahrens 
1466fa9e4066Sahrens 	return (error);
1467fa9e4066Sahrens }
1468fa9e4066Sahrens 
1469fa9e4066Sahrens /*
1470fa9e4066Sahrens  * Get statistics for the given vdev.
1471fa9e4066Sahrens  */
1472fa9e4066Sahrens void
1473fa9e4066Sahrens vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
1474fa9e4066Sahrens {
1475fa9e4066Sahrens 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
1476fa9e4066Sahrens 	int c, t;
1477fa9e4066Sahrens 
1478fa9e4066Sahrens 	mutex_enter(&vd->vdev_stat_lock);
1479fa9e4066Sahrens 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
1480fa9e4066Sahrens 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
1481fa9e4066Sahrens 	vs->vs_state = vd->vdev_state;
1482*2a79c5feSlling 	vs->vs_rsize = vdev_get_rsize(vd);
1483fa9e4066Sahrens 	mutex_exit(&vd->vdev_stat_lock);
1484fa9e4066Sahrens 
1485fa9e4066Sahrens 	/*
1486fa9e4066Sahrens 	 * If we're getting stats on the root vdev, aggregate the I/O counts
1487fa9e4066Sahrens 	 * over all top-level vdevs (i.e. the direct children of the root).
1488fa9e4066Sahrens 	 */
1489fa9e4066Sahrens 	if (vd == rvd) {
1490fa9e4066Sahrens 		for (c = 0; c < rvd->vdev_children; c++) {
1491fa9e4066Sahrens 			vdev_t *cvd = rvd->vdev_child[c];
1492fa9e4066Sahrens 			vdev_stat_t *cvs = &cvd->vdev_stat;
1493fa9e4066Sahrens 
1494fa9e4066Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1495fa9e4066Sahrens 			for (t = 0; t < ZIO_TYPES; t++) {
1496fa9e4066Sahrens 				vs->vs_ops[t] += cvs->vs_ops[t];
1497fa9e4066Sahrens 				vs->vs_bytes[t] += cvs->vs_bytes[t];
1498fa9e4066Sahrens 			}
1499fa9e4066Sahrens 			vs->vs_read_errors += cvs->vs_read_errors;
1500fa9e4066Sahrens 			vs->vs_write_errors += cvs->vs_write_errors;
1501fa9e4066Sahrens 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
1502fa9e4066Sahrens 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
1503fa9e4066Sahrens 			vs->vs_scrub_errors += cvs->vs_scrub_errors;
1504fa9e4066Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1505fa9e4066Sahrens 		}
1506fa9e4066Sahrens 	}
1507fa9e4066Sahrens }
1508fa9e4066Sahrens 
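/*
 * Update I/O statistics for the given zio on completion.  Successful I/Os
 * bump the op, byte, and repair counters; failed reads and writes bump the
 * error counts, and failed writes also dirty the DTLs of the affected vdevs
 * so the damage can be resilvered later.
 */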
1509fa9e4066Sahrens void
1510fa9e4066Sahrens vdev_stat_update(zio_t *zio)
1511fa9e4066Sahrens {
1512fa9e4066Sahrens 	vdev_t *vd = zio->io_vd;
1513fa9e4066Sahrens 	vdev_t *pvd;
1514fa9e4066Sahrens 	uint64_t txg = zio->io_txg;
1515fa9e4066Sahrens 	vdev_stat_t *vs = &vd->vdev_stat;
1516fa9e4066Sahrens 	zio_type_t type = zio->io_type;
1517fa9e4066Sahrens 	int flags = zio->io_flags;
1518fa9e4066Sahrens 
1519fa9e4066Sahrens 	if (zio->io_error == 0) {
1520fa9e4066Sahrens 		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
1521fa9e4066Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1522fa9e4066Sahrens 			vs->vs_ops[type]++;
1523fa9e4066Sahrens 			vs->vs_bytes[type] += zio->io_size;
1524fa9e4066Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1525fa9e4066Sahrens 		}
1526fa9e4066Sahrens 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
1527fa9e4066Sahrens 		    zio->io_delegate_list == NULL) {
1528fa9e4066Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1529fa9e4066Sahrens 			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
1530fa9e4066Sahrens 				vs->vs_scrub_repaired += zio->io_size;
1531fa9e4066Sahrens 			else
1532fa9e4066Sahrens 				vs->vs_self_healed += zio->io_size;
1533fa9e4066Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1534fa9e4066Sahrens 		}
1535fa9e4066Sahrens 		return;
1536fa9e4066Sahrens 	}
1537fa9e4066Sahrens 
1538fa9e4066Sahrens 	if (flags & ZIO_FLAG_SPECULATIVE)
1539fa9e4066Sahrens 		return;
1540fa9e4066Sahrens 
1541fa9e4066Sahrens 	if (!vdev_is_dead(vd)) {
1542fa9e4066Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1543fa9e4066Sahrens 		if (type == ZIO_TYPE_READ) {
1544fa9e4066Sahrens 			if (zio->io_error == ECKSUM)
1545fa9e4066Sahrens 				vs->vs_checksum_errors++;
1546fa9e4066Sahrens 			else
1547fa9e4066Sahrens 				vs->vs_read_errors++;
1548fa9e4066Sahrens 		}
1549fa9e4066Sahrens 		if (type == ZIO_TYPE_WRITE)
1550fa9e4066Sahrens 			vs->vs_write_errors++;
1551fa9e4066Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1552fa9e4066Sahrens 	}
1553fa9e4066Sahrens 
1554fa9e4066Sahrens 	if (type == ZIO_TYPE_WRITE) {
1555fa9e4066Sahrens 		if (txg == 0 || vd->vdev_children != 0)
1556fa9e4066Sahrens 			return;
1557fa9e4066Sahrens 		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1558fa9e4066Sahrens 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
1559fa9e4066Sahrens 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1560fa9e4066Sahrens 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
1561fa9e4066Sahrens 		}
1562fa9e4066Sahrens 		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
1563fa9e4066Sahrens 			vdev_t *tvd = vd->vdev_top;
1564fa9e4066Sahrens 			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
1565fa9e4066Sahrens 				return;
1566fa9e4066Sahrens 			vdev_dirty(tvd, VDD_DTL, txg);
1567fa9e4066Sahrens 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1568fa9e4066Sahrens 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1569fa9e4066Sahrens 				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
1570fa9e4066Sahrens 		}
1571fa9e4066Sahrens 	}
1572fa9e4066Sahrens }
1573fa9e4066Sahrens 
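/*
 * Update scrub/resilver statistics for this vdev and all of its children.
 * POOL_SCRUB_NONE records completion; anything else resets the counters
 * for the start of a new scrub.
 */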
1574fa9e4066Sahrens void
1575fa9e4066Sahrens vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
1576fa9e4066Sahrens {
1577fa9e4066Sahrens 	int c;
1578fa9e4066Sahrens 	vdev_stat_t *vs = &vd->vdev_stat;
1579fa9e4066Sahrens 
1580fa9e4066Sahrens 	for (c = 0; c < vd->vdev_children; c++)
1581fa9e4066Sahrens 		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
1582fa9e4066Sahrens 
1583fa9e4066Sahrens 	mutex_enter(&vd->vdev_stat_lock);
1584fa9e4066Sahrens 
1585fa9e4066Sahrens 	if (type == POOL_SCRUB_NONE) {
1586fa9e4066Sahrens 		/*
1587fa9e4066Sahrens 		 * Update completion and end time.  Leave everything else alone
1588fa9e4066Sahrens 		 * so we can report what happened during the previous scrub.
1589fa9e4066Sahrens 		 */
1590fa9e4066Sahrens 		vs->vs_scrub_complete = complete;
1591fa9e4066Sahrens 		vs->vs_scrub_end = gethrestime_sec();
1592fa9e4066Sahrens 	} else {
1593fa9e4066Sahrens 		vs->vs_scrub_type = type;
1594fa9e4066Sahrens 		vs->vs_scrub_complete = 0;
1595fa9e4066Sahrens 		vs->vs_scrub_examined = 0;
1596fa9e4066Sahrens 		vs->vs_scrub_repaired = 0;
1597fa9e4066Sahrens 		vs->vs_scrub_errors = 0;
1598fa9e4066Sahrens 		vs->vs_scrub_start = gethrestime_sec();
1599fa9e4066Sahrens 		vs->vs_scrub_end = 0;
1600fa9e4066Sahrens 	}
1601fa9e4066Sahrens 
1602fa9e4066Sahrens 	mutex_exit(&vd->vdev_stat_lock);
1603fa9e4066Sahrens }
1604fa9e4066Sahrens 
1605fa9e4066Sahrens /*
1606fa9e4066Sahrens  * Report checksum errors that a vdev didn't realize it made.
1607fa9e4066Sahrens  * This can happen, for example, when RAID-Z combinatorial reconstruction
1608fa9e4066Sahrens  * infers that one of its components returned bad data.
1609fa9e4066Sahrens  */
1610fa9e4066Sahrens void
1611fa9e4066Sahrens vdev_checksum_error(zio_t *zio, vdev_t *vd)
1612fa9e4066Sahrens {
1613fa9e4066Sahrens 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
1614fa9e4066Sahrens 	    vdev_description(vd));
1615fa9e4066Sahrens 
1616fa9e4066Sahrens 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1617fa9e4066Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1618fa9e4066Sahrens 		vd->vdev_stat.vs_checksum_errors++;
1619fa9e4066Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1620fa9e4066Sahrens 	}
1621fa9e4066Sahrens }
1622fa9e4066Sahrens 
1623fa9e4066Sahrens /*
1624fa9e4066Sahrens  * Update the in-core space usage stats for this vdev and the root vdev.
1625fa9e4066Sahrens  */
1626fa9e4066Sahrens void
1627fa9e4066Sahrens vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
1628fa9e4066Sahrens {
1629fa9e4066Sahrens 	ASSERT(vd == vd->vdev_top);
1630fa9e4066Sahrens 
1631fa9e4066Sahrens 	do {
1632fa9e4066Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1633fa9e4066Sahrens 		vd->vdev_stat.vs_space += space_delta;
1634fa9e4066Sahrens 		vd->vdev_stat.vs_alloc += alloc_delta;
1635fa9e4066Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1636fa9e4066Sahrens 	} while ((vd = vd->vdev_parent) != NULL);
1637fa9e4066Sahrens }
1638fa9e4066Sahrens 
1639fa9e4066Sahrens /*
1640fa9e4066Sahrens  * Various knobs to tune a vdev.
1641fa9e4066Sahrens  */
1642fa9e4066Sahrens static vdev_knob_t vdev_knob[] = {
1643fa9e4066Sahrens 	{
1644fa9e4066Sahrens 		"cache_size",
1645fa9e4066Sahrens 		"size of the read-ahead cache",
1646fa9e4066Sahrens 		0,
1647fa9e4066Sahrens 		1ULL << 30,
1648fa9e4066Sahrens 		10ULL << 20,
1649fa9e4066Sahrens 		offsetof(struct vdev, vdev_cache.vc_size)
1650fa9e4066Sahrens 	},
1651fa9e4066Sahrens 	{
1652fa9e4066Sahrens 		"cache_bshift",
1653fa9e4066Sahrens 		"log2 of cache blocksize",
1654fa9e4066Sahrens 		SPA_MINBLOCKSHIFT,
1655fa9e4066Sahrens 		SPA_MAXBLOCKSHIFT,
1656fa9e4066Sahrens 		16,
1657fa9e4066Sahrens 		offsetof(struct vdev, vdev_cache.vc_bshift)
1658fa9e4066Sahrens 	},
1659fa9e4066Sahrens 	{
1660fa9e4066Sahrens 		"cache_max",
1661fa9e4066Sahrens 		"largest block size to cache",
1662fa9e4066Sahrens 		0,
1663fa9e4066Sahrens 		SPA_MAXBLOCKSIZE,
1664fa9e4066Sahrens 		1ULL << 14,
1665fa9e4066Sahrens 		offsetof(struct vdev, vdev_cache.vc_max)
1666fa9e4066Sahrens 	},
1667fa9e4066Sahrens 	{
1668fa9e4066Sahrens 		"min_pending",
1669fa9e4066Sahrens 		"minimum pending I/Os to the disk",
1670fa9e4066Sahrens 		1,
1671fa9e4066Sahrens 		10000,
1672fa9e4066Sahrens 		2,
1673fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_min_pending)
1674fa9e4066Sahrens 	},
1675fa9e4066Sahrens 	{
1676fa9e4066Sahrens 		"max_pending",
1677fa9e4066Sahrens 		"maximum pending I/Os to the disk",
1678fa9e4066Sahrens 		1,
1679fa9e4066Sahrens 		10000,
1680fa9e4066Sahrens 		35,
1681fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_max_pending)
1682fa9e4066Sahrens 	},
1683fa9e4066Sahrens 	{
1684fa9e4066Sahrens 		"agg_limit",
1685fa9e4066Sahrens 		"maximum size of aggregated I/Os",
1686fa9e4066Sahrens 		0,
1687fa9e4066Sahrens 		SPA_MAXBLOCKSIZE,
1688fa9e4066Sahrens 		SPA_MAXBLOCKSIZE,
1689fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_agg_limit)
1690fa9e4066Sahrens 	},
1691fa9e4066Sahrens 	{
1692fa9e4066Sahrens 		"time_shift",
1693fa9e4066Sahrens 		"deadline = pri + (lbolt >> time_shift)",
1694fa9e4066Sahrens 		0,
1695fa9e4066Sahrens 		63,
1696fa9e4066Sahrens 		4,
1697fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_time_shift)
1698fa9e4066Sahrens 	},
1699fa9e4066Sahrens 	{
1700fa9e4066Sahrens 		"ramp_rate",
1701fa9e4066Sahrens 		"exponential I/O issue ramp-up rate",
1702fa9e4066Sahrens 		1,
1703fa9e4066Sahrens 		10000,
1704fa9e4066Sahrens 		2,
1705fa9e4066Sahrens 		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
1706fa9e4066Sahrens 	},
1707fa9e4066Sahrens };
1708fa9e4066Sahrens 
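/*
 * Iterate over the knob table: pass NULL to get the first knob, or a
 * previous return value to get the next; returns NULL past the end.
 */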
1709fa9e4066Sahrens vdev_knob_t *
1710fa9e4066Sahrens vdev_knob_next(vdev_knob_t *vk)
1711fa9e4066Sahrens {
1712fa9e4066Sahrens 	if (vk == NULL)
1713fa9e4066Sahrens 		return (vdev_knob);
1714fa9e4066Sahrens 
1715fa9e4066Sahrens 	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
1716fa9e4066Sahrens 		return (NULL);
1717fa9e4066Sahrens 
1718fa9e4066Sahrens 	return (vk);
1719fa9e4066Sahrens }
1720fa9e4066Sahrens 
1721fa9e4066Sahrens /*
1722fa9e4066Sahrens  * Mark a top-level vdev's config as dirty, placing it on the dirty list
1723fa9e4066Sahrens  * so that it will be written out next time the vdev configuration is synced.
1724fa9e4066Sahrens  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
1725fa9e4066Sahrens  */
1726fa9e4066Sahrens void
1727fa9e4066Sahrens vdev_config_dirty(vdev_t *vd)
1728fa9e4066Sahrens {
1729fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
1730fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
1731fa9e4066Sahrens 	int c;
1732fa9e4066Sahrens 
1733fa9e4066Sahrens 	if (vd == rvd) {
1734fa9e4066Sahrens 		for (c = 0; c < rvd->vdev_children; c++)
1735fa9e4066Sahrens 			vdev_config_dirty(rvd->vdev_child[c]);
1736fa9e4066Sahrens 	} else {
1737fa9e4066Sahrens 		ASSERT(vd == vd->vdev_top);
1738fa9e4066Sahrens 
1739fa9e4066Sahrens 		if (!vd->vdev_is_dirty) {
1740fa9e4066Sahrens 			list_insert_head(&spa->spa_dirty_list, vd);
1741fa9e4066Sahrens 			vd->vdev_is_dirty = B_TRUE;
1742fa9e4066Sahrens 		}
1743fa9e4066Sahrens 	}
1744fa9e4066Sahrens }
1745fa9e4066Sahrens 
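/*
 * Remove a top-level vdev from the dirty list once its configuration
 * has been written out.
 */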
1746fa9e4066Sahrens void
1747fa9e4066Sahrens vdev_config_clean(vdev_t *vd)
1748fa9e4066Sahrens {
1749fa9e4066Sahrens 	ASSERT(vd->vdev_is_dirty);
1750fa9e4066Sahrens 
1751fa9e4066Sahrens 	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
1752fa9e4066Sahrens 	vd->vdev_is_dirty = B_FALSE;
1753fa9e4066Sahrens }
1754fa9e4066Sahrens 
1755fa9e4066Sahrens /*
1756fa9e4066Sahrens  * Set a vdev's state, updating any parent's state as well.
1757fa9e4066Sahrens  */
1758fa9e4066Sahrens void
1759fa9e4066Sahrens vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
1760fa9e4066Sahrens {
1761fa9e4066Sahrens 	if (state == vd->vdev_state)
1762fa9e4066Sahrens 		return;
1763fa9e4066Sahrens 
1764fa9e4066Sahrens 	vd->vdev_state = state;
1765fa9e4066Sahrens 	vd->vdev_stat.vs_aux = aux;
1766fa9e4066Sahrens 
1767fa9e4066Sahrens 	if (vd->vdev_parent != NULL) {
1768fa9e4066Sahrens 		int c;
1769fa9e4066Sahrens 		int degraded = 0, faulted = 0;
1770fa9e4066Sahrens 		vdev_t *parent, *child;
1771fa9e4066Sahrens 
1772fa9e4066Sahrens 		parent = vd->vdev_parent;
1773fa9e4066Sahrens 		for (c = 0; c < parent->vdev_children; c++) {
1774fa9e4066Sahrens 			child = parent->vdev_child[c];
1775fa9e4066Sahrens 			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
1776fa9e4066Sahrens 				faulted++;
1777fa9e4066Sahrens 			else if (child->vdev_state == VDEV_STATE_DEGRADED)
1778fa9e4066Sahrens 				degraded++;
1779fa9e4066Sahrens 		}
1780fa9e4066Sahrens 
1781fa9e4066Sahrens 		vd->vdev_parent->vdev_ops->vdev_op_state_change(
1782fa9e4066Sahrens 		    vd->vdev_parent, faulted, degraded);
1783fa9e4066Sahrens 	}
1784fa9e4066Sahrens }
1785