xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev.c (revision afefbcddfd8caf5f3b2da510d9439471ab225040)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/spa_impl.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/uberblock_impl.h>
36 #include <sys/metaslab.h>
37 #include <sys/metaslab_impl.h>
38 #include <sys/space_map.h>
39 #include <sys/zio.h>
40 #include <sys/zap.h>
41 #include <sys/fs/zfs.h>
42 
43 /*
44  * Virtual device management.
45  */
46 
47 static vdev_ops_t *vdev_ops_table[] = {
48 	&vdev_root_ops,
49 	&vdev_raidz_ops,
50 	&vdev_mirror_ops,
51 	&vdev_replacing_ops,
52 	&vdev_disk_ops,
53 	&vdev_file_ops,
54 	&vdev_missing_ops,
55 	NULL
56 };
57 
58 /*
59  * Given a vdev type, return the appropriate ops vector.
60  */
61 static vdev_ops_t *
62 vdev_getops(const char *type)
63 {
64 	vdev_ops_t *ops, **opspp;
65 
66 	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
67 		if (strcmp(ops->vdev_op_type, type) == 0)
68 			break;
69 
70 	return (ops);
71 }
72 
73 /*
74  * Default asize function: return the MAX of psize with the asize of
75  * all children.  This is what's used by anything other than RAID-Z.
76  */
77 uint64_t
78 vdev_default_asize(vdev_t *vd, uint64_t psize)
79 {
80 	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
81 	uint64_t csize;
82 	uint64_t c;
83 
84 	for (c = 0; c < vd->vdev_children; c++) {
85 		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
86 		asize = MAX(asize, csize);
87 	}
88 
89 	return (asize);
90 }
91 
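/*
 * Look up a top-level vdev by its index beneath the root vdev.
 * Returns NULL if the index is out of range.
 */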
92 vdev_t *
93 vdev_lookup_top(spa_t *spa, uint64_t vdev)
94 {
95 	vdev_t *rvd = spa->spa_root_vdev;
96 
97 	if (vdev < rvd->vdev_children)
98 		return (rvd->vdev_child[vdev]);
99 
100 	return (NULL);
101 }
102 
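/*
 * Recursively search the vdev tree rooted at 'vd' for a vdev whose
 * path matches 'path'.
 */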
103 vdev_t *
104 vdev_lookup_by_path(vdev_t *vd, const char *path)
105 {
106 	int c;
107 	vdev_t *mvd;
108 
109 	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
110 		return (vd);
111 
112 	for (c = 0; c < vd->vdev_children; c++)
113 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
114 		    NULL)
115 			return (mvd);
116 
117 	return (NULL);
118 }
119 
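/*
 * Recursively search the vdev tree rooted at 'vd' for a leaf vdev
 * whose guid matches 'guid'.
 */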
120 vdev_t *
121 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
122 {
123 	int c;
124 	vdev_t *mvd;
125 
126 	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
127 		return (vd);
128 
129 	for (c = 0; c < vd->vdev_children; c++)
130 		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
131 		    NULL)
132 			return (mvd);
133 
134 	return (NULL);
135 }
136 
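/*
 * Link a child vdev into its parent's child array, growing the array
 * if necessary, and add the child's guid sum to every ancestor's.
 */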
137 void
138 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
139 {
140 	size_t oldsize, newsize;
141 	uint64_t id = cvd->vdev_id;
142 	vdev_t **newchild;
143 
144 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
145 	ASSERT(cvd->vdev_parent == NULL);
146 
147 	cvd->vdev_parent = pvd;
148 
149 	if (pvd == NULL)
150 		return;
151 
152 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
153 
154 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
155 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
156 	newsize = pvd->vdev_children * sizeof (vdev_t *);
157 
158 	newchild = kmem_zalloc(newsize, KM_SLEEP);
159 	if (pvd->vdev_child != NULL) {
160 		bcopy(pvd->vdev_child, newchild, oldsize);
161 		kmem_free(pvd->vdev_child, oldsize);
162 	}
163 
164 	pvd->vdev_child = newchild;
165 	pvd->vdev_child[id] = cvd;
166 
167 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
168 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
169 
170 	/*
171 	 * Walk up all ancestors to update guid sum.
172 	 */
173 	for (; pvd != NULL; pvd = pvd->vdev_parent)
174 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
175 }
176 
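/*
 * Unlink a child vdev from its parent, freeing the child array if it
 * becomes empty, and subtract the child's guid sum from every ancestor's.
 */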
177 void
178 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
179 {
180 	int c;
181 	uint_t id = cvd->vdev_id;
182 
183 	ASSERT(cvd->vdev_parent == pvd);
184 
185 	if (pvd == NULL)
186 		return;
187 
188 	ASSERT(id < pvd->vdev_children);
189 	ASSERT(pvd->vdev_child[id] == cvd);
190 
191 	pvd->vdev_child[id] = NULL;
192 	cvd->vdev_parent = NULL;
193 
194 	for (c = 0; c < pvd->vdev_children; c++)
195 		if (pvd->vdev_child[c])
196 			break;
197 
198 	if (c == pvd->vdev_children) {
199 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
200 		pvd->vdev_child = NULL;
201 		pvd->vdev_children = 0;
202 	}
203 
204 	/*
205 	 * Walk up all ancestors to update guid sum.
206 	 */
207 	for (; pvd != NULL; pvd = pvd->vdev_parent)
208 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
209 }
210 
211 /*
212  * Remove any holes in the child array.
213  */
214 void
215 vdev_compact_children(vdev_t *pvd)
216 {
217 	vdev_t **newchild, *cvd;
218 	int oldc = pvd->vdev_children;
219 	int newc, c;
220 
221 	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
222 
223 	for (c = newc = 0; c < oldc; c++)
224 		if (pvd->vdev_child[c])
225 			newc++;
226 
227 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
228 
229 	for (c = newc = 0; c < oldc; c++) {
230 		if ((cvd = pvd->vdev_child[c]) != NULL) {
231 			newchild[newc] = cvd;
232 			cvd->vdev_id = newc++;
233 		}
234 	}
235 
236 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
237 	pvd->vdev_child = newchild;
238 	pvd->vdev_children = newc;
239 }
240 
241 /*
242  * Allocate and minimally initialize a vdev_t.
243  */
244 static vdev_t *
245 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
246 {
247 	vdev_t *vd;
248 
249 	while (guid == 0)
250 		guid = spa_get_random(-1ULL);
251 
252 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
253 
254 	vd->vdev_spa = spa;
255 	vd->vdev_id = id;
256 	vd->vdev_guid = guid;
257 	vd->vdev_guid_sum = guid;
258 	vd->vdev_ops = ops;
259 	vd->vdev_state = VDEV_STATE_CLOSED;
260 
261 	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
262 	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
263 	list_create(&vd->vdev_io_pending, sizeof (zio_t),
264 	    offsetof(zio_t, io_pending));
265 	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
266 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
267 	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
268 	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
269 	txg_list_create(&vd->vdev_ms_list,
270 	    offsetof(struct metaslab, ms_txg_node));
271 	txg_list_create(&vd->vdev_dtl_list,
272 	    offsetof(struct vdev, vdev_dtl_node));
273 	vd->vdev_stat.vs_timestamp = gethrtime();
274 
275 	return (vd);
276 }
277 
278 /*
279  * Free a vdev_t that has been removed from service.
280  */
281 static void
282 vdev_free_common(vdev_t *vd)
283 {
284 	if (vd->vdev_path)
285 		spa_strfree(vd->vdev_path);
286 	if (vd->vdev_devid)
287 		spa_strfree(vd->vdev_devid);
288 
289 	txg_list_destroy(&vd->vdev_ms_list);
290 	txg_list_destroy(&vd->vdev_dtl_list);
291 	mutex_enter(&vd->vdev_dtl_lock);
292 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
293 	space_map_destroy(&vd->vdev_dtl_map);
294 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
295 	space_map_destroy(&vd->vdev_dtl_scrub);
296 	mutex_exit(&vd->vdev_dtl_lock);
297 	mutex_destroy(&vd->vdev_dtl_lock);
298 	mutex_destroy(&vd->vdev_dirty_lock);
299 	list_destroy(&vd->vdev_io_pending);
300 	mutex_destroy(&vd->vdev_io_lock);
301 	cv_destroy(&vd->vdev_io_cv);
302 
303 	kmem_free(vd, sizeof (vdev_t));
304 }
305 
306 /*
307  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
308  * creating a new vdev or loading an existing one - the behavior is slightly
309  * different for each case.
310  */
311 vdev_t *
312 vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
313 {
314 	vdev_ops_t *ops;
315 	char *type;
316 	uint64_t guid = 0;
317 	vdev_t *vd;
318 
319 	ASSERT(spa_config_held(spa, RW_WRITER));
320 
321 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
322 		return (NULL);
323 
324 	if ((ops = vdev_getops(type)) == NULL)
325 		return (NULL);
326 
327 	/*
328 	 * If this is a load, get the vdev guid from the nvlist.
329 	 * Otherwise, vdev_alloc_common() will generate one for us.
330 	 */
331 	if (alloctype == VDEV_ALLOC_LOAD) {
332 		uint64_t label_id;
333 
334 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
335 		    label_id != id)
336 			return (NULL);
337 
338 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
339 			return (NULL);
340 	}
341 
342 	vd = vdev_alloc_common(spa, id, guid, ops);
343 
344 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
345 		vd->vdev_path = spa_strdup(vd->vdev_path);
346 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
347 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
348 
349 	/*
350 	 * Set the whole_disk property.  If it's not specified, leave the value
351 	 * as -1.
352 	 */
353 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
354 	    &vd->vdev_wholedisk) != 0)
355 		vd->vdev_wholedisk = -1ULL;
356 
357 	/*
358 	 * If we're a top-level vdev, try to load the allocation parameters.
359 	 */
360 	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
361 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
362 		    &vd->vdev_ms_array);
363 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
364 		    &vd->vdev_ms_shift);
365 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
366 		    &vd->vdev_ashift);
367 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
368 		    &vd->vdev_asize);
369 	}
370 
371 	/*
372 	 * If we're a leaf vdev, try to load the DTL object.
373 	 */
374 	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
375 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
376 		    &vd->vdev_dtl.smo_object);
377 	}
378 
379 	/*
380 	 * Add ourselves to the parent's list of children.
381 	 */
382 	vdev_add_child(parent, vd);
383 
384 	return (vd);
385 }
386 
387 void
388 vdev_free(vdev_t *vd)
389 {
390 	int c;
391 
392 	/*
393 	 * vdev_free() implies closing the vdev first.  This is simpler than
394 	 * trying to ensure complicated semantics for all callers.
395 	 */
396 	vdev_close(vd);
397 
398 	/*
399 	 * It's possible to free a vdev that's been added to the dirty
400 	 * list when in the middle of spa_vdev_add().  Handle that case
401 	 * correctly here.
402 	 */
403 	if (vd->vdev_is_dirty)
404 		vdev_config_clean(vd);
405 
406 	/*
407 	 * Free all children.
408 	 */
409 	for (c = 0; c < vd->vdev_children; c++)
410 		vdev_free(vd->vdev_child[c]);
411 
412 	ASSERT(vd->vdev_child == NULL);
413 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
414 
415 	/*
416 	 * Discard allocation state.
417 	 */
418 	if (vd == vd->vdev_top)
419 		vdev_metaslab_fini(vd);
420 
421 	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
422 	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
423 
424 	/*
425 	 * Remove this vdev from its parent's child list.
426 	 */
427 	vdev_remove_child(vd->vdev_parent, vd);
428 
429 	ASSERT(vd->vdev_parent == NULL);
430 
431 	vdev_free_common(vd);
432 }
433 
434 /*
435  * Transfer top-level vdev state from svd to tvd.
436  */
437 static void
438 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
439 {
440 	spa_t *spa = svd->vdev_spa;
441 	metaslab_t *msp;
442 	vdev_t *vd;
443 	int t;
444 
445 	ASSERT(tvd == tvd->vdev_top);
446 
447 	tvd->vdev_ms_array = svd->vdev_ms_array;
448 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
449 	tvd->vdev_ms_count = svd->vdev_ms_count;
450 
451 	svd->vdev_ms_array = 0;
452 	svd->vdev_ms_shift = 0;
453 	svd->vdev_ms_count = 0;
454 
455 	tvd->vdev_mg = svd->vdev_mg;
456 	tvd->vdev_mg->mg_vd = tvd;
457 	tvd->vdev_ms = svd->vdev_ms;
458 	tvd->vdev_smo = svd->vdev_smo;
459 
460 	svd->vdev_mg = NULL;
461 	svd->vdev_ms = NULL;
462 	svd->vdev_smo = NULL;
463 
464 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
465 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
466 
467 	svd->vdev_stat.vs_alloc = 0;
468 	svd->vdev_stat.vs_space = 0;
469 
470 	for (t = 0; t < TXG_SIZE; t++) {
471 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
472 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
473 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
474 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
475 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
476 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
477 		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
478 		svd->vdev_dirty[t] = 0;
479 	}
480 
481 	if (svd->vdev_is_dirty) {
482 		vdev_config_clean(svd);
483 		vdev_config_dirty(tvd);
484 	}
485 
486 	ASSERT(svd->vdev_io_retry == NULL);
487 	ASSERT(list_is_empty(&svd->vdev_io_pending));
488 }
489 
490 static void
491 vdev_top_update(vdev_t *tvd, vdev_t *vd)
492 {
493 	int c;
494 
495 	if (vd == NULL)
496 		return;
497 
498 	vd->vdev_top = tvd;
499 
500 	for (c = 0; c < vd->vdev_children; c++)
501 		vdev_top_update(tvd, vd->vdev_child[c]);
502 }
503 
504 /*
505  * Add a mirror/replacing vdev above an existing vdev.
506  */
507 vdev_t *
508 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
509 {
510 	spa_t *spa = cvd->vdev_spa;
511 	vdev_t *pvd = cvd->vdev_parent;
512 	vdev_t *mvd;
513 
514 	ASSERT(spa_config_held(spa, RW_WRITER));
515 
516 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
517 	vdev_remove_child(pvd, cvd);
518 	vdev_add_child(pvd, mvd);
519 	cvd->vdev_id = mvd->vdev_children;
520 	vdev_add_child(mvd, cvd);
521 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
522 
523 	mvd->vdev_asize = cvd->vdev_asize;
524 	mvd->vdev_ashift = cvd->vdev_ashift;
525 	mvd->vdev_state = cvd->vdev_state;
526 
527 	if (mvd == mvd->vdev_top)
528 		vdev_top_transfer(cvd, mvd);
529 
530 	return (mvd);
531 }
532 
533 /*
534  * Remove a 1-way mirror/replacing vdev from the tree.
535  */
536 void
537 vdev_remove_parent(vdev_t *cvd)
538 {
539 	vdev_t *mvd = cvd->vdev_parent;
540 	vdev_t *pvd = mvd->vdev_parent;
541 
542 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
543 
544 	ASSERT(mvd->vdev_children == 1);
545 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
546 	    mvd->vdev_ops == &vdev_replacing_ops);
547 
548 	vdev_remove_child(mvd, cvd);
549 	vdev_remove_child(pvd, mvd);
550 	cvd->vdev_id = mvd->vdev_id;
551 	vdev_add_child(pvd, cvd);
552 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
553 
554 	if (cvd == cvd->vdev_top)
555 		vdev_top_transfer(mvd, cvd);
556 
557 	ASSERT(mvd->vdev_children == 0);
558 	vdev_free(mvd);
559 }
560 
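/*
 * Create or grow a top-level vdev's metaslab array.  When loading an
 * existing pool (txg == 0), the space map headers are read from the
 * on-disk metaslab array object; existing metaslabs are carried over
 * and any new ones are initialized.
 */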
561 void
562 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
563 {
564 	spa_t *spa = vd->vdev_spa;
565 	metaslab_class_t *mc = spa_metaslab_class_select(spa);
566 	uint64_t c;
567 	uint64_t oldc = vd->vdev_ms_count;
568 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
569 	space_map_obj_t *smo = vd->vdev_smo;
570 	metaslab_t **mspp = vd->vdev_ms;
571 
572 	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
573 
574 	ASSERT(oldc <= newc);
575 
576 	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
577 	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
578 	vd->vdev_ms_count = newc;
579 
580 	if (vd->vdev_mg == NULL) {
581 		if (txg == 0) {
582 			dmu_buf_t *db;
583 			uint64_t *ms_array;
584 
585 			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
586 			    KM_SLEEP);
587 
588 			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
589 			    0, newc * sizeof (uint64_t), ms_array);
590 
591 			for (c = 0; c < newc; c++) {
592 				if (ms_array[c] == 0)
593 					continue;
594 				db = dmu_bonus_hold(spa->spa_meta_objset,
595 				    ms_array[c]);
596 				dmu_buf_read(db);
597 				ASSERT3U(db->db_size, ==, sizeof (*smo));
598 				bcopy(db->db_data, &vd->vdev_smo[c],
599 				    db->db_size);
600 				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
601 				    ms_array[c]);
602 				dmu_buf_rele(db);
603 			}
604 			kmem_free(ms_array, newc * sizeof (uint64_t));
605 		}
606 		vd->vdev_mg = metaslab_group_create(mc, vd);
607 	}
608 
609 	for (c = 0; c < oldc; c++) {
610 		vd->vdev_smo[c] = smo[c];
611 		vd->vdev_ms[c] = mspp[c];
612 		mspp[c]->ms_smo = &vd->vdev_smo[c];
613 	}
614 
615 	for (c = oldc; c < newc; c++)
616 		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
617 		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
618 
619 	if (oldc != 0) {
620 		kmem_free(smo, oldc * sizeof (*smo));
621 		kmem_free(mspp, oldc * sizeof (*mspp));
622 	}
623 
624 }
625 
626 void
627 vdev_metaslab_fini(vdev_t *vd)
628 {
629 	uint64_t m;
630 	uint64_t count = vd->vdev_ms_count;
631 
632 	if (vd->vdev_ms != NULL) {
633 		for (m = 0; m < count; m++)
634 			metaslab_fini(vd->vdev_ms[m]);
635 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
636 		vd->vdev_ms = NULL;
637 	}
638 
639 	if (vd->vdev_smo != NULL) {
640 		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
641 		vd->vdev_smo = NULL;
642 	}
643 }
644 
645 /*
646  * Prepare a virtual device for access.
647  */
648 int
649 vdev_open(vdev_t *vd)
650 {
651 	int error;
652 	vdev_knob_t *vk;
653 	int c;
654 	uint64_t osize = 0;
655 	uint64_t asize, psize;
656 	uint64_t ashift = -1ULL;
657 
658 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
659 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
660 	    vd->vdev_state == VDEV_STATE_OFFLINE);
661 
662 	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
663 		vd->vdev_fault_arg >>= 1;
664 	else
665 		vd->vdev_fault_mode = VDEV_FAULT_NONE;
666 
667 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
668 
669 	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
670 		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
671 
672 		*valp = vk->vk_default;
673 		*valp = MAX(*valp, vk->vk_min);
674 		*valp = MIN(*valp, vk->vk_max);
675 	}
676 
677 	if (vd->vdev_ops->vdev_op_leaf) {
678 		vdev_cache_init(vd);
679 		vdev_queue_init(vd);
680 		vd->vdev_cache_active = B_TRUE;
681 	}
682 
683 	if (vd->vdev_offline) {
684 		ASSERT(vd->vdev_children == 0);
685 		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
686 		vd->vdev_state = VDEV_STATE_OFFLINE;
687 		return (ENXIO);
688 	}
689 
690 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
691 
692 	dprintf("%s = %d, osize %llu, state = %d\n",
693 	    vdev_description(vd), error, osize, vd->vdev_state);
694 
695 	if (error) {
696 		dprintf("%s in %s failed to open, error %d, aux %d\n",
697 		    vdev_description(vd),
698 		    vdev_description(vd->vdev_parent),
699 		    error,
700 		    vd->vdev_stat.vs_aux);
701 
702 		vd->vdev_state = VDEV_STATE_CANT_OPEN;
703 		return (error);
704 	}
705 
706 	vd->vdev_state = VDEV_STATE_HEALTHY;
707 
708 	for (c = 0; c < vd->vdev_children; c++)
709 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
710 			vd->vdev_state = VDEV_STATE_DEGRADED;
711 
712 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
713 
714 	if (vd->vdev_children == 0) {
715 		if (osize < SPA_MINDEVSIZE) {
716 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
717 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
718 			return (EOVERFLOW);
719 		}
720 		psize = osize;
721 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
722 	} else {
723 		if (osize < SPA_MINDEVSIZE -
724 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
725 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
726 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
727 			return (EOVERFLOW);
728 		}
729 		psize = 0;
730 		asize = osize;
731 	}
732 
733 	vd->vdev_psize = psize;
734 
735 	if (vd->vdev_asize == 0) {
736 		/*
737 		 * This is the first-ever open, so use the computed values.
738 		 */
739 		vd->vdev_asize = asize;
740 		vd->vdev_ashift = ashift;
741 	} else {
742 		/*
743 		 * Make sure the alignment requirement hasn't increased.
744 		 */
745 		if (ashift > vd->vdev_ashift) {
746 			dprintf("%s: ashift grew\n", vdev_description(vd));
747 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
748 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
749 			return (EINVAL);
750 		}
751 
752 		/*
753 		 * Make sure the device hasn't shrunk.
754 		 */
755 		if (asize < vd->vdev_asize) {
756 			dprintf("%s: device shrank\n", vdev_description(vd));
757 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
758 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
759 			return (EINVAL);
760 		}
761 
762 		/*
763 		 * If all children are healthy and the asize has increased,
764 		 * then we've experienced dynamic LUN growth.
765 		 */
766 		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
767 		    asize > vd->vdev_asize) {
768 			dprintf("%s: device grew\n", vdev_description(vd));
769 			vd->vdev_asize = asize;
770 		}
771 	}
772 
773 	return (0);
774 }
775 
776 /*
777  * Close a virtual device.
778  */
779 void
780 vdev_close(vdev_t *vd)
781 {
782 	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
783 
784 	vd->vdev_ops->vdev_op_close(vd);
785 
786 	if (vd->vdev_cache_active) {
787 		vdev_cache_fini(vd);
788 		vdev_queue_fini(vd);
789 		vd->vdev_cache_active = B_FALSE;
790 	}
791 
792 	if (vd->vdev_offline)
793 		vd->vdev_state = VDEV_STATE_OFFLINE;
794 	else
795 		vd->vdev_state = VDEV_STATE_CLOSED;
796 }
797 
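/*
 * Close and reopen a top-level vdev (or, for the root vdev, every
 * top-level vdev), waiting for pending I/O to drain and optionally
 * handing back the retry list via 'rq'.
 */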
798 void
799 vdev_reopen(vdev_t *vd, zio_t **rq)
800 {
801 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
802 	int c;
803 
804 	if (vd == rvd) {
805 		ASSERT(rq == NULL);
806 		for (c = 0; c < rvd->vdev_children; c++)
807 			vdev_reopen(rvd->vdev_child[c], NULL);
808 		return;
809 	}
810 
811 	/* only valid for top-level vdevs */
812 	ASSERT3P(vd, ==, vd->vdev_top);
813 
814 	/*
815 	 * vdev_state can change when spa_config_lock is held as writer,
816 	 * or when it's held as reader and we're doing a vdev_reopen().
817 	 * To handle the latter case, we grab rvd's io_lock to serialize
818 	 * reopens.  This ensures that there's never more than one vdev
819 	 * state changer active at a time.
820 	 */
821 	mutex_enter(&rvd->vdev_io_lock);
822 
823 	mutex_enter(&vd->vdev_io_lock);
824 	while (list_head(&vd->vdev_io_pending) != NULL)
825 		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
826 	vdev_close(vd);
827 	(void) vdev_open(vd);
828 	if (rq != NULL) {
829 		*rq = vd->vdev_io_retry;
830 		vd->vdev_io_retry = NULL;
831 	}
832 	mutex_exit(&vd->vdev_io_lock);
833 
834 	/*
835 	 * Reassess root vdev's health.
836 	 */
837 	rvd->vdev_state = VDEV_STATE_HEALTHY;
838 	for (c = 0; c < rvd->vdev_children; c++) {
839 		uint64_t state = rvd->vdev_child[c]->vdev_state;
840 		rvd->vdev_state = MIN(rvd->vdev_state, state);
841 	}
842 
843 	mutex_exit(&rvd->vdev_io_lock);
844 }
845 
846 int
847 vdev_create(vdev_t *vd, uint64_t txg)
848 {
849 	int error;
850 
851 	/*
852 	 * Normally, partial opens (e.g. of a mirror) are allowed.
853 	 * For a create, however, we want to fail the request if
854 	 * there are any components we can't open.
855 	 */
856 	error = vdev_open(vd);
857 
858 	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
859 		vdev_close(vd);
860 		return (error ? error : ENXIO);
861 	}
862 
863 	/*
864 	 * Recursively initialize all labels.
865 	 */
866 	if ((error = vdev_label_init(vd, txg)) != 0) {
867 		vdev_close(vd);
868 		return (error);
869 	}
870 
871 	return (0);
872 }
873 
874 /*
875  * This is the latter half of vdev_create().  It is distinct because it
876  * involves initiating transactions in order to do metaslab creation.
877  * For creation, we want to try to create all vdevs at once and then undo it
878  * if anything fails; this is much harder if we have pending transactions.
879  */
880 void
881 vdev_init(vdev_t *vd, uint64_t txg)
882 {
883 	/*
884 	 * Aim for roughly 200 metaslabs per vdev.
885 	 */
886 	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
887 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
888 
889 	/*
890 	 * Initialize the vdev's metaslabs.
891 	 */
892 	vdev_metaslab_init(vd, txg);
893 }
894 
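/*
 * Mark the given flags dirty on vd's top-level vdev for this txg and
 * place that vdev on the pool's per-txg dirty vdev list.
 */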
895 void
896 vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
897 {
898 	vdev_t *tvd = vd->vdev_top;
899 
900 	mutex_enter(&tvd->vdev_dirty_lock);
901 	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
902 		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
903 		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
904 		    tvd, txg);
905 	}
906 	mutex_exit(&tvd->vdev_dirty_lock);
907 }
908 
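/*
 * Add the segment [txg, txg + size) to a DTL space map unless it is
 * already contained.
 */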
909 void
910 vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
911 {
912 	mutex_enter(sm->sm_lock);
913 	if (!space_map_contains(sm, txg, size))
914 		space_map_add(sm, txg, size);
915 	mutex_exit(sm->sm_lock);
916 }
917 
918 int
919 vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
920 {
921 	int dirty;
922 
923 	/*
924 	 * Quick test without the lock -- covers the common case that
925 	 * there are no dirty time segments.
926 	 */
927 	if (sm->sm_space == 0)
928 		return (0);
929 
930 	mutex_enter(sm->sm_lock);
931 	dirty = space_map_contains(sm, txg, size);
932 	mutex_exit(sm->sm_lock);
933 
934 	return (dirty);
935 }
936 
937 /*
938  * Reassess DTLs after a config change or scrub completion.
939  */
940 void
941 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
942 {
943 	int c;
944 
945 	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
946 
947 	if (vd->vdev_children == 0) {
948 		mutex_enter(&vd->vdev_dtl_lock);
949 		/*
950 		 * We've successfully scrubbed everything up to scrub_txg.
951 		 * Therefore, excise all old DTLs up to that point, then
952 		 * fold in the DTLs for everything we couldn't scrub.
953 		 */
954 		if (scrub_txg != 0) {
955 			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
956 			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
957 		}
958 		if (scrub_done)
959 			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
960 		mutex_exit(&vd->vdev_dtl_lock);
961 		if (txg != 0) {
962 			vdev_t *tvd = vd->vdev_top;
963 			vdev_dirty(tvd, VDD_DTL, txg);
964 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
965 		}
966 		return;
967 	}
968 
969 	mutex_enter(&vd->vdev_dtl_lock);
970 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
971 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
972 	mutex_exit(&vd->vdev_dtl_lock);
973 
974 	for (c = 0; c < vd->vdev_children; c++) {
975 		vdev_t *cvd = vd->vdev_child[c];
976 		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
977 		mutex_enter(&vd->vdev_dtl_lock);
978 		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
979 		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
980 		mutex_exit(&vd->vdev_dtl_lock);
981 	}
982 }
983 
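/*
 * Read a leaf vdev's DTL space map object from the MOS and load its
 * allocated segments into the in-core DTL map.
 */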
984 static int
985 vdev_dtl_load(vdev_t *vd)
986 {
987 	spa_t *spa = vd->vdev_spa;
988 	space_map_obj_t *smo = &vd->vdev_dtl;
989 	dmu_buf_t *db;
990 	int error;
991 
992 	ASSERT(vd->vdev_children == 0);
993 
994 	if (smo->smo_object == 0)
995 		return (0);
996 
997 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
998 	dmu_buf_read(db);
999 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1000 	bcopy(db->db_data, smo, db->db_size);
1001 	dmu_buf_rele(db);
1002 
1003 	mutex_enter(&vd->vdev_dtl_lock);
1004 	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
1005 	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
1006 	mutex_exit(&vd->vdev_dtl_lock);
1007 
1008 	return (error);
1009 }
1010 
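/*
 * Write a leaf vdev's DTL to disk for the given txg: snapshot the
 * in-core DTL into a private space map, free the old on-disk contents,
 * and sync the new segments and header.  If the vdev has been detached,
 * free the space map object instead.
 */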
1011 void
1012 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1013 {
1014 	spa_t *spa = vd->vdev_spa;
1015 	space_map_obj_t *smo = &vd->vdev_dtl;
1016 	space_map_t *sm = &vd->vdev_dtl_map;
1017 	space_map_t smsync;
1018 	kmutex_t smlock;
1019 	avl_tree_t *t = &sm->sm_root;
1020 	space_seg_t *ss;
1021 	dmu_buf_t *db;
1022 	dmu_tx_t *tx;
1023 
1024 	dprintf("%s in txg %llu pass %d\n",
1025 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1026 
1027 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1028 
1029 	if (vd->vdev_detached) {
1030 		if (smo->smo_object != 0) {
1031 			int err = dmu_object_free(spa->spa_meta_objset,
1032 			    smo->smo_object, tx);
1033 			ASSERT3U(err, ==, 0);
1034 			smo->smo_object = 0;
1035 		}
1036 		dmu_tx_commit(tx);
1037 		return;
1038 	}
1039 
1040 	if (smo->smo_object == 0) {
1041 		ASSERT(smo->smo_objsize == 0);
1042 		ASSERT(smo->smo_alloc == 0);
1043 		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
1044 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1045 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1046 		ASSERT(smo->smo_object != 0);
1047 		vdev_config_dirty(vd->vdev_top);
1048 	}
1049 
1050 	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
1051 	    0, smo->smo_objsize, tx);
1052 
1053 	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1054 
1055 	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1056 	    &smlock);
1057 
1058 	mutex_enter(&smlock);
1059 
1060 	mutex_enter(&vd->vdev_dtl_lock);
1061 	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
1062 		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
1063 	mutex_exit(&vd->vdev_dtl_lock);
1064 
1065 	smo->smo_objsize = 0;
1066 	smo->smo_alloc = smsync.sm_space;
1067 
1068 	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
1069 	space_map_destroy(&smsync);
1070 
1071 	mutex_exit(&smlock);
1072 	mutex_destroy(&smlock);
1073 
1074 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1075 	dmu_buf_will_dirty(db, tx);
1076 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1077 	bcopy(smo, db->db_data, db->db_size);
1078 	dmu_buf_rele(db);
1079 
1080 	dmu_tx_commit(tx);
1081 }
1082 
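/*
 * Recursively load a vdev subtree: validate leaf vdevs against their
 * on-disk labels, initialize metaslabs for top-level vdevs, and load
 * leaf DTLs.  Most failures mark the vdev VDEV_STATE_CANT_OPEN rather
 * than returning an error.
 */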
1083 int
1084 vdev_load(vdev_t *vd, int import)
1085 {
1086 	spa_t *spa = vd->vdev_spa;
1087 	int c, error;
1088 	nvlist_t *label;
1089 	uint64_t guid, state;
1090 
1091 	dprintf("loading %s\n", vdev_description(vd));
1092 
1093 	/*
1094 	 * Recursively load all children.
1095 	 */
1096 	for (c = 0; c < vd->vdev_children; c++)
1097 		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
1098 			return (error);
1099 
1100 	/*
1101 	 * If this is a leaf vdev, make sure it agrees with its disk labels.
1102 	 */
1103 	if (vd->vdev_ops->vdev_op_leaf) {
1104 
1105 		if (vdev_is_dead(vd))
1106 			return (0);
1107 
1108 		/*
1109 		 * XXX state transitions don't propagate to parent here.
1110 		 * Also, merely setting the state isn't sufficient because
1111 		 * it's not persistent; a vdev_reopen() would make us
1112 		 * forget all about it.
1113 		 */
1114 		if ((label = vdev_label_read_config(vd)) == NULL) {
1115 			dprintf("can't load label config\n");
1116 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1117 			    VDEV_AUX_CORRUPT_DATA);
1118 			return (0);
1119 		}
1120 
1121 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
1122 		    &guid) != 0 || guid != spa_guid(spa)) {
1123 			dprintf("bad or missing pool GUID (%llu)\n", guid);
1124 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1125 			    VDEV_AUX_CORRUPT_DATA);
1126 			nvlist_free(label);
1127 			return (0);
1128 		}
1129 
1130 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
1131 		    guid != vd->vdev_guid) {
1132 			dprintf("bad or missing vdev guid (%llu != %llu)\n",
1133 			    guid, vd->vdev_guid);
1134 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1135 			    VDEV_AUX_CORRUPT_DATA);
1136 			nvlist_free(label);
1137 			return (0);
1138 		}
1139 
1140 		/*
1141 		 * If we find a vdev with a matching pool guid and vdev guid,
1142 		 * but the pool state is not active, it indicates that the user
1143 		 * exported or destroyed the pool without affecting the config
1144 		 * cache (if / was mounted readonly, for example).  In this
1145 		 * case, immediately return EBADF so the caller can remove it
1146 		 * from the config.
1147 		 */
1148 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1149 		    &state)) {
1150 			dprintf("missing pool state\n");
1151 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1152 			    VDEV_AUX_CORRUPT_DATA);
1153 			nvlist_free(label);
1154 			return (0);
1155 		}
1156 
1157 		if (state != POOL_STATE_ACTIVE &&
1158 		    (!import || state != POOL_STATE_EXPORTED)) {
1159 			dprintf("pool state not active (%llu)\n", state);
1160 			nvlist_free(label);
1161 			return (EBADF);
1162 		}
1163 
1164 		nvlist_free(label);
1165 	}
1166 
1167 	/*
1168 	 * If this is a top-level vdev, make sure its allocation parameters
1169 	 * exist and initialize its metaslabs.
1170 	 */
1171 	if (vd == vd->vdev_top) {
1172 
1173 		if (vd->vdev_ms_array == 0 ||
1174 		    vd->vdev_ms_shift == 0 ||
1175 		    vd->vdev_ashift == 0 ||
1176 		    vd->vdev_asize == 0) {
1177 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1178 			    VDEV_AUX_CORRUPT_DATA);
1179 			return (0);
1180 		}
1181 
1182 		vdev_metaslab_init(vd, 0);
1183 	}
1184 
1185 	/*
1186 	 * If this is a leaf vdev, load its DTL.
1187 	 */
1188 	if (vd->vdev_ops->vdev_op_leaf) {
1189 		error = vdev_dtl_load(vd);
1190 		if (error) {
1191 			dprintf("can't load DTL for %s, error %d\n",
1192 			    vdev_description(vd), error);
1193 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1194 			    VDEV_AUX_CORRUPT_DATA);
1195 			return (0);
1196 		}
1197 	}
1198 
1199 	return (0);
1200 }
1201 
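/*
 * Called after a txg has synced: run metaslab_sync_done() on every
 * metaslab that was dirty in that txg.
 */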
1202 void
1203 vdev_sync_done(vdev_t *vd, uint64_t txg)
1204 {
1205 	metaslab_t *msp;
1206 
1207 	dprintf("%s txg %llu\n", vdev_description(vd), txg);
1208 
1209 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
1210 		metaslab_sync_done(msp, txg);
1211 }
1212 
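/*
 * Allocate the metaslab array object for a newly added top-level vdev
 * and dirty its config so the new object is recorded on disk.
 */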
1213 void
1214 vdev_add_sync(vdev_t *vd, uint64_t txg)
1215 {
1216 	spa_t *spa = vd->vdev_spa;
1217 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1218 
1219 	ASSERT(vd == vd->vdev_top);
1220 
1221 	if (vd->vdev_ms_array == 0)
1222 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
1223 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
1224 
1225 	ASSERT(vd->vdev_ms_array != 0);
1226 
1227 	vdev_config_dirty(vd);
1228 
1229 	dmu_tx_commit(tx);
1230 }
1231 
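/*
 * Sync a top-level vdev for the given txg: clear its dirty flags,
 * handle a pending vdev addition, sync all dirty metaslabs and DTLs,
 * and reschedule the vdev for post-sync cleanup.
 */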
1232 void
1233 vdev_sync(vdev_t *vd, uint64_t txg)
1234 {
1235 	spa_t *spa = vd->vdev_spa;
1236 	vdev_t *lvd;
1237 	metaslab_t *msp;
1238 	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
1239 	uint8_t dirty = *dirtyp;
1240 
1241 	mutex_enter(&vd->vdev_dirty_lock);
1242 	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
1243 	mutex_exit(&vd->vdev_dirty_lock);
1244 
1245 	dprintf("%s txg %llu pass %d\n",
1246 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1247 
1248 	if (dirty & VDD_ADD)
1249 		vdev_add_sync(vd, txg);
1250 
1251 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
1252 		metaslab_sync(msp, txg);
1253 
1254 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
1255 		vdev_dtl_sync(lvd, txg);
1256 
1257 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
1258 }
1259 
1260 uint64_t
1261 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
1262 {
1263 	return (vd->vdev_ops->vdev_op_asize(vd, psize));
1264 }
1265 
1266 void
1267 vdev_io_start(zio_t *zio)
1268 {
1269 	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
1270 }
1271 
1272 void
1273 vdev_io_done(zio_t *zio)
1274 {
1275 	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
1276 }
1277 
1278 const char *
1279 vdev_description(vdev_t *vd)
1280 {
1281 	if (vd == NULL || vd->vdev_ops == NULL)
1282 		return ("<unknown>");
1283 
1284 	if (vd->vdev_path != NULL)
1285 		return (vd->vdev_path);
1286 
1287 	if (vd->vdev_parent == NULL)
1288 		return (spa_name(vd->vdev_spa));
1289 
1290 	return (vd->vdev_ops->vdev_op_type);
1291 }
1292 
1293 int
1294 vdev_online(spa_t *spa, const char *path)
1295 {
1296 	vdev_t *vd;
1297 
1298 	spa_config_enter(spa, RW_WRITER);
1299 
1300 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1301 		spa_config_exit(spa);
1302 		return (ENODEV);
1303 	}
1304 
1305 	dprintf("ONLINE: %s\n", vdev_description(vd));
1306 
1307 	vd->vdev_offline = B_FALSE;
1308 
1309 	/*
1310 	 * Clear the error counts.  The idea is that you expect to see all
1311 	 * zeroes when everything is working, so if you've just onlined a
1312 	 * device, you don't want to keep hearing about errors from before.
1313 	 */
1314 	vd->vdev_stat.vs_read_errors = 0;
1315 	vd->vdev_stat.vs_write_errors = 0;
1316 	vd->vdev_stat.vs_checksum_errors = 0;
1317 
1318 	vdev_reopen(vd->vdev_top, NULL);
1319 
1320 	spa_config_exit(spa);
1321 
1322 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1323 
1324 	return (0);
1325 }
1326 
1327 int
1328 vdev_offline(spa_t *spa, const char *path)
1329 {
1330 	vdev_t *vd;
1331 
1332 	spa_config_enter(spa, RW_WRITER);
1333 
1334 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1335 		spa_config_exit(spa);
1336 		return (ENODEV);
1337 	}
1338 
1339 	dprintf("OFFLINE: %s\n", vdev_description(vd));
1340 
1341 	/*
1342 	 * If this device's top-level vdev has a non-empty DTL,
1343 	 * don't allow the device to be offlined.
1344 	 *
1345 	 * XXX -- we should make this more precise by allowing the offline
1346 	 * as long as the remaining devices don't have any DTL holes.
1347 	 */
1348 	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
1349 		spa_config_exit(spa);
1350 		return (EBUSY);
1351 	}
1352 
1353 	/*
1354 	 * Set this device to offline state and reopen its top-level vdev.
1355 	 * If this action results in the top-level vdev becoming unusable,
1356 	 * undo it and fail the request.
1357 	 */
1358 	vd->vdev_offline = B_TRUE;
1359 	vdev_reopen(vd->vdev_top, NULL);
1360 	if (vdev_is_dead(vd->vdev_top)) {
1361 		vd->vdev_offline = B_FALSE;
1362 		vdev_reopen(vd->vdev_top, NULL);
1363 		spa_config_exit(spa);
1364 		return (EBUSY);
1365 	}
1366 
1367 	spa_config_exit(spa);
1368 
1369 	return (0);
1370 }
1371 
1372 int
1373 vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
1374 {
1375 	vdev_t *vd;
1376 
1377 	spa_config_enter(spa, RW_WRITER);
1378 
1379 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1380 		spa_config_exit(spa);
1381 		return (ENODEV);
1382 	}
1383 
1384 	vd->vdev_fault_mode = mode;
1385 	vd->vdev_fault_mask = mask;
1386 	vd->vdev_fault_arg = arg;
1387 
1388 	spa_config_exit(spa);
1389 
1390 	return (0);
1391 }
1392 
1393 int
1394 vdev_is_dead(vdev_t *vd)
1395 {
1396 	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
1397 }
1398 
1399 int
1400 vdev_error_inject(vdev_t *vd, zio_t *zio)
1401 {
1402 	int error = 0;
1403 
1404 	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
1405 		return (0);
1406 
1407 	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
1408 		return (0);
1409 
1410 	switch (vd->vdev_fault_mode) {
1411 	case VDEV_FAULT_RANDOM:
1412 		if (spa_get_random(vd->vdev_fault_arg) == 0)
1413 			error = EIO;
1414 		break;
1415 
1416 	case VDEV_FAULT_COUNT:
1417 		if ((int64_t)--vd->vdev_fault_arg <= 0)
1418 			vd->vdev_fault_mode = VDEV_FAULT_NONE;
1419 		error = EIO;
1420 		break;
1421 	}
1422 
1423 	if (error != 0) {
1424 		dprintf("returning %d for type %d on %s state %d offset %llx\n",
1425 		    error, zio->io_type, vdev_description(vd),
1426 		    vd->vdev_state, zio->io_offset);
1427 	}
1428 
1429 	return (error);
1430 }
1431 
1432 /*
1433  * Get statistics for the given vdev.
1434  */
1435 void
1436 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
1437 {
1438 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
1439 	int c, t;
1440 
1441 	mutex_enter(&vd->vdev_stat_lock);
1442 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
1443 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
1444 	vs->vs_state = vd->vdev_state;
1445 	mutex_exit(&vd->vdev_stat_lock);
1446 
1447 	/*
1448 	 * If we're getting stats on the root vdev, aggregate the I/O counts
1449 	 * over all top-level vdevs (i.e. the direct children of the root).
1450 	 */
1451 	if (vd == rvd) {
1452 		for (c = 0; c < rvd->vdev_children; c++) {
1453 			vdev_t *cvd = rvd->vdev_child[c];
1454 			vdev_stat_t *cvs = &cvd->vdev_stat;
1455 
1456 			mutex_enter(&vd->vdev_stat_lock);
1457 			for (t = 0; t < ZIO_TYPES; t++) {
1458 				vs->vs_ops[t] += cvs->vs_ops[t];
1459 				vs->vs_bytes[t] += cvs->vs_bytes[t];
1460 			}
1461 			vs->vs_read_errors += cvs->vs_read_errors;
1462 			vs->vs_write_errors += cvs->vs_write_errors;
1463 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
1464 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
1465 			vs->vs_scrub_errors += cvs->vs_scrub_errors;
1466 			mutex_exit(&vd->vdev_stat_lock);
1467 		}
1468 	}
1469 }
1470 
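/*
 * Update I/O statistics for the vdev that serviced 'zio'.  Successful
 * I/Os bump the op, byte, and repair counters; failures bump the
 * appropriate error counter, and failed writes are recorded in the
 * DTLs of the leaf vdev and its ancestors.
 */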
1471 void
1472 vdev_stat_update(zio_t *zio)
1473 {
1474 	vdev_t *vd = zio->io_vd;
1475 	vdev_t *pvd;
1476 	uint64_t txg = zio->io_txg;
1477 	vdev_stat_t *vs = &vd->vdev_stat;
1478 	zio_type_t type = zio->io_type;
1479 	int flags = zio->io_flags;
1480 
1481 	if (zio->io_error == 0) {
1482 		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
1483 			mutex_enter(&vd->vdev_stat_lock);
1484 			vs->vs_ops[type]++;
1485 			vs->vs_bytes[type] += zio->io_size;
1486 			mutex_exit(&vd->vdev_stat_lock);
1487 		}
1488 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
1489 		    zio->io_delegate_list == NULL) {
1490 			mutex_enter(&vd->vdev_stat_lock);
1491 			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
1492 				vs->vs_scrub_repaired += zio->io_size;
1493 			else
1494 				vs->vs_self_healed += zio->io_size;
1495 			mutex_exit(&vd->vdev_stat_lock);
1496 		}
1497 		return;
1498 	}
1499 
1500 	if (flags & ZIO_FLAG_SPECULATIVE)
1501 		return;
1502 
1503 	if (!vdev_is_dead(vd)) {
1504 		mutex_enter(&vd->vdev_stat_lock);
1505 		if (type == ZIO_TYPE_READ) {
1506 			if (zio->io_error == ECKSUM)
1507 				vs->vs_checksum_errors++;
1508 			else
1509 				vs->vs_read_errors++;
1510 		}
1511 		if (type == ZIO_TYPE_WRITE)
1512 			vs->vs_write_errors++;
1513 		mutex_exit(&vd->vdev_stat_lock);
1514 	}
1515 
1516 	if (type == ZIO_TYPE_WRITE) {
1517 		if (txg == 0 || vd->vdev_children != 0)
1518 			return;
1519 		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1520 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
1521 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1522 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
1523 		}
1524 		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
1525 			vdev_t *tvd = vd->vdev_top;
1526 			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
1527 				return;
1528 			vdev_dirty(tvd, VDD_DTL, txg);
1529 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1530 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1531 				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
1532 		}
1533 	}
1534 }
1535 
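/*
 * Recursively update scrub statistics for this vdev subtree: record the
 * completion state when a scrub ends (POOL_SCRUB_NONE) or reset the
 * counters when a new scrub or resilver starts.
 */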
1536 void
1537 vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
1538 {
1539 	int c;
1540 	vdev_stat_t *vs = &vd->vdev_stat;
1541 
1542 	for (c = 0; c < vd->vdev_children; c++)
1543 		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
1544 
1545 	mutex_enter(&vd->vdev_stat_lock);
1546 
1547 	if (type == POOL_SCRUB_NONE) {
1548 		/*
1549 		 * Update completion and end time.  Leave everything else alone
1550 		 * so we can report what happened during the previous scrub.
1551 		 */
1552 		vs->vs_scrub_complete = complete;
1553 		vs->vs_scrub_end = gethrestime_sec();
1554 	} else {
1555 		vs->vs_scrub_type = type;
1556 		vs->vs_scrub_complete = 0;
1557 		vs->vs_scrub_examined = 0;
1558 		vs->vs_scrub_repaired = 0;
1559 		vs->vs_scrub_errors = 0;
1560 		vs->vs_scrub_start = gethrestime_sec();
1561 		vs->vs_scrub_end = 0;
1562 	}
1563 
1564 	mutex_exit(&vd->vdev_stat_lock);
1565 }
1566 
1567 /*
1568  * Report checksum errors that a vdev didn't realize it made.
1569  * This can happen, for example, when RAID-Z combinatorial reconstruction
1570  * infers that one of its components returned bad data.
1571  */
1572 void
1573 vdev_checksum_error(zio_t *zio, vdev_t *vd)
1574 {
1575 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
1576 	    vdev_description(vd));
1577 
1578 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1579 		mutex_enter(&vd->vdev_stat_lock);
1580 		vd->vdev_stat.vs_checksum_errors++;
1581 		mutex_exit(&vd->vdev_stat_lock);
1582 	}
1583 }
1584 
1585 /*
1586  * Update the in-core space usage stats for this vdev and the root vdev.
1587  */
1588 void
1589 vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
1590 {
1591 	ASSERT(vd == vd->vdev_top);
1592 
1593 	do {
1594 		mutex_enter(&vd->vdev_stat_lock);
1595 		vd->vdev_stat.vs_space += space_delta;
1596 		vd->vdev_stat.vs_alloc += alloc_delta;
1597 		mutex_exit(&vd->vdev_stat_lock);
1598 	} while ((vd = vd->vdev_parent) != NULL);
1599 }
1600 
1601 /*
1602  * Various knobs to tune a vdev.
1603  */
1604 static vdev_knob_t vdev_knob[] = {
1605 	{
1606 		"cache_size",
1607 		"size of the read-ahead cache",
1608 		0,
1609 		1ULL << 30,
1610 		10ULL << 20,
1611 		offsetof(struct vdev, vdev_cache.vc_size)
1612 	},
1613 	{
1614 		"cache_bshift",
1615 		"log2 of cache blocksize",
1616 		SPA_MINBLOCKSHIFT,
1617 		SPA_MAXBLOCKSHIFT,
1618 		16,
1619 		offsetof(struct vdev, vdev_cache.vc_bshift)
1620 	},
1621 	{
1622 		"cache_max",
1623 		"largest block size to cache",
1624 		0,
1625 		SPA_MAXBLOCKSIZE,
1626 		1ULL << 14,
1627 		offsetof(struct vdev, vdev_cache.vc_max)
1628 	},
1629 	{
1630 		"min_pending",
1631 		"minimum pending I/Os to the disk",
1632 		1,
1633 		10000,
1634 		2,
1635 		offsetof(struct vdev, vdev_queue.vq_min_pending)
1636 	},
1637 	{
1638 		"max_pending",
1639 		"maximum pending I/Os to the disk",
1640 		1,
1641 		10000,
1642 		35,
1643 		offsetof(struct vdev, vdev_queue.vq_max_pending)
1644 	},
1645 	{
1646 		"agg_limit",
1647 		"maximum size of aggregated I/Os",
1648 		0,
1649 		SPA_MAXBLOCKSIZE,
1650 		SPA_MAXBLOCKSIZE,
1651 		offsetof(struct vdev, vdev_queue.vq_agg_limit)
1652 	},
1653 	{
1654 		"time_shift",
1655 		"deadline = pri + (lbolt >> time_shift)",
1656 		0,
1657 		63,
1658 		4,
1659 		offsetof(struct vdev, vdev_queue.vq_time_shift)
1660 	},
1661 	{
1662 		"ramp_rate",
1663 		"exponential I/O issue ramp-up rate",
1664 		1,
1665 		10000,
1666 		2,
1667 		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
1668 	},
1669 };
1670 
1671 vdev_knob_t *
1672 vdev_knob_next(vdev_knob_t *vk)
1673 {
1674 	if (vk == NULL)
1675 		return (vdev_knob);
1676 
1677 	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
1678 		return (NULL);
1679 
1680 	return (vk);
1681 }
1682 
1683 /*
1684  * Mark a top-level vdev's config as dirty, placing it on the dirty list
1685  * so that it will be written out next time the vdev configuration is synced.
1686  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
1687  */
1688 void
1689 vdev_config_dirty(vdev_t *vd)
1690 {
1691 	spa_t *spa = vd->vdev_spa;
1692 	vdev_t *rvd = spa->spa_root_vdev;
1693 	int c;
1694 
1695 	if (vd == rvd) {
1696 		for (c = 0; c < rvd->vdev_children; c++)
1697 			vdev_config_dirty(rvd->vdev_child[c]);
1698 	} else {
1699 		ASSERT(vd == vd->vdev_top);
1700 
1701 		if (!vd->vdev_is_dirty) {
1702 			list_insert_head(&spa->spa_dirty_list, vd);
1703 			vd->vdev_is_dirty = B_TRUE;
1704 		}
1705 	}
1706 }
1707 
1708 void
1709 vdev_config_clean(vdev_t *vd)
1710 {
1711 	ASSERT(vd->vdev_is_dirty);
1712 
1713 	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
1714 	vd->vdev_is_dirty = B_FALSE;
1715 }
1716 
1717 /*
1718  * Set a vdev's state, updating any parent's state as well.
1719  */
1720 void
1721 vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
1722 {
1723 	if (state == vd->vdev_state)
1724 		return;
1725 
1726 	vd->vdev_state = state;
1727 	vd->vdev_stat.vs_aux = aux;
1728 
1729 	if (vd->vdev_parent != NULL) {
1730 		int c;
1731 		int degraded = 0, faulted = 0;
1732 		vdev_t *parent, *child;
1733 
1734 		parent = vd->vdev_parent;
1735 		for (c = 0; c < parent->vdev_children; c++) {
1736 			child = parent->vdev_child[c];
1737 			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
1738 				faulted++;
1739 			else if (child->vdev_state == VDEV_STATE_DEGRADED)
1740 				degraded++;
1741 		}
1742 
1743 		vd->vdev_parent->vdev_ops->vdev_op_state_change(
1744 		    vd->vdev_parent, faulted, degraded);
1745 }
1746 }
1747