spa.c revision b7b97454b9b1f6625e7e655e9651e744a8dee09d
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * This file contains all the routines used when modifying on-disk SPA state.
31 * This includes opening, importing, destroying, exporting a pool, and syncing a
32 * pool.
33 */
34
35#include <sys/zfs_context.h>
36#include <sys/fm/fs/zfs.h>
37#include <sys/spa_impl.h>
38#include <sys/zio.h>
39#include <sys/zio_checksum.h>
40#include <sys/zio_compress.h>
41#include <sys/dmu.h>
42#include <sys/dmu_tx.h>
43#include <sys/zap.h>
44#include <sys/zil.h>
45#include <sys/vdev_impl.h>
46#include <sys/metaslab.h>
47#include <sys/uberblock_impl.h>
48#include <sys/txg.h>
49#include <sys/avl.h>
50#include <sys/dmu_traverse.h>
51#include <sys/dmu_objset.h>
52#include <sys/unique.h>
53#include <sys/dsl_pool.h>
54#include <sys/dsl_dataset.h>
55#include <sys/dsl_dir.h>
56#include <sys/dsl_prop.h>
57#include <sys/dsl_synctask.h>
58#include <sys/fs/zfs.h>
59#include <sys/arc.h>
60#include <sys/callb.h>
61#include <sys/systeminfo.h>
62#include <sys/sunddi.h>
63
64#include "zfs_prop.h"
65#include "zfs_comutil.h"
66
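/*
 * Tunable: the number of worker threads created for each per-I/O-type
 * issue and interrupt taskq in spa_activate() below.
 */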
67int zio_taskq_threads = 8;
68
69static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
70
71/*
72 * ==========================================================================
73 * SPA properties routines
74 * ==========================================================================
75 */
76
77/*
78 * Add a (source=src, propname=propval) list to an nvlist.
79 */
80static int
81spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
82    uint64_t intval, zprop_source_t src)
83{
84	const char *propname = zpool_prop_to_name(prop);
85	nvlist_t *propval;
86	int err = 0;
87
88	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
89		return (err);
90
91	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
92		goto out;
93
94	if (strval != NULL) {
95		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
96			goto out;
97	} else {
98		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
99			goto out;
100	}
101
102	err = nvlist_add_nvlist(nvl, propname, propval);
103out:
104	nvlist_free(propval);
105	return (err);
106}
107
108/*
109 * Get property values from the spa configuration.
110 */
111static int
112spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
113{
114	uint64_t size = spa_get_space(spa);
115	uint64_t used = spa_get_alloc(spa);
116	uint64_t cap, version;
117	zprop_source_t src = ZPROP_SRC_NONE;
118	int err;
119	char *cachefile;
120	size_t len;
121
122	/*
123	 * readonly properties
124	 */
125	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
126	    0, src))
127		return (err);
128
129	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
130		return (err);
131
132	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
133		return (err);
134
135	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
136	    size - used, src))
137		return (err);
138
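	/*
	 * Capacity is reported as the percentage of the pool's space that is
	 * allocated, guarding against division by zero for an empty pool.
	 */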
139	cap = (size == 0) ? 0 : (used * 100 / size);
140	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
141		return (err);
142
143	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
144	    spa_guid(spa), src))
145		return (err);
146
147	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
148	    spa->spa_root_vdev->vdev_state, src))
149		return (err);
150
151	/*
152	 * settable properties that are not stored in the pool property object.
153	 */
154	version = spa_version(spa);
155	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
156		src = ZPROP_SRC_DEFAULT;
157	else
158		src = ZPROP_SRC_LOCAL;
159	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
160	    version, src))
161		return (err);
162
163	if (spa->spa_root != NULL) {
164		src = ZPROP_SRC_LOCAL;
165		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
166		    spa->spa_root, 0, src))
167			return (err);
168	}
169
170	if (spa->spa_config_dir != NULL) {
171		if (strcmp(spa->spa_config_dir, "none") == 0) {
172			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
173			    spa->spa_config_dir, 0, ZPROP_SRC_LOCAL);
174		} else {
175			len = strlen(spa->spa_config_dir) +
176			    strlen(spa->spa_config_file) + 2;
177			cachefile = kmem_alloc(len, KM_SLEEP);
178			(void) snprintf(cachefile, len, "%s/%s",
179			    spa->spa_config_dir, spa->spa_config_file);
180			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
181			    cachefile, 0, ZPROP_SRC_LOCAL);
182			kmem_free(cachefile, len);
183		}
184
185		if (err)
186			return (err);
187	}
188
189	return (0);
190}
191
192/*
193 * Get zpool property values.
194 */
195int
196spa_prop_get(spa_t *spa, nvlist_t **nvp)
197{
198	zap_cursor_t zc;
199	zap_attribute_t za;
200	objset_t *mos = spa->spa_meta_objset;
201	int err;
202
203	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
204		return (err);
205
206	/*
207	 * Get properties from the spa config.
208	 */
209	if (err = spa_prop_get_config(spa, nvp))
210		goto out;
211
212	mutex_enter(&spa->spa_props_lock);
213	/* If there is no pool property object, there are no more props to get. */
214	if (spa->spa_pool_props_object == 0) {
215		mutex_exit(&spa->spa_props_lock);
216		return (0);
217	}
218
219	/*
220	 * Get properties from the MOS pool property object.
221	 */
222	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
223	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
224	    zap_cursor_advance(&zc)) {
225		uint64_t intval = 0;
226		char *strval = NULL;
227		zprop_source_t src = ZPROP_SRC_DEFAULT;
228		zpool_prop_t prop;
229
230		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
231			continue;
232
233		switch (za.za_integer_length) {
234		case 8:
235			/* integer property */
236			if (za.za_first_integer !=
237			    zpool_prop_default_numeric(prop))
238				src = ZPROP_SRC_LOCAL;
239
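			/*
			 * bootfs is stored as a dataset object number;
			 * translate it back to a dataset name for the caller.
			 */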
240			if (prop == ZPOOL_PROP_BOOTFS) {
241				dsl_pool_t *dp;
242				dsl_dataset_t *ds = NULL;
243
244				dp = spa_get_dsl(spa);
245				rw_enter(&dp->dp_config_rwlock, RW_READER);
246				if (err = dsl_dataset_open_obj(dp,
247				    za.za_first_integer, NULL, DS_MODE_NONE,
248				    FTAG, &ds)) {
249					rw_exit(&dp->dp_config_rwlock);
250					break;
251				}
252
253				strval = kmem_alloc(
254				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
255				    KM_SLEEP);
256				dsl_dataset_name(ds, strval);
257				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
258				rw_exit(&dp->dp_config_rwlock);
259			} else {
260				strval = NULL;
261				intval = za.za_first_integer;
262			}
263
264			err = spa_prop_add_list(*nvp, prop, strval,
265			    intval, src);
266
267			if (strval != NULL)
268				kmem_free(strval,
269				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
270
271			break;
272
273		case 1:
274			/* string property */
275			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
276			err = zap_lookup(mos, spa->spa_pool_props_object,
277			    za.za_name, 1, za.za_num_integers, strval);
278			if (err) {
279				kmem_free(strval, za.za_num_integers);
280				break;
281			}
282			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
283			kmem_free(strval, za.za_num_integers);
284			break;
285
286		default:
287			break;
288		}
289	}
290	zap_cursor_fini(&zc);
291	mutex_exit(&spa->spa_props_lock);
292out:
293	if (err && err != ENOENT) {
294		nvlist_free(*nvp);
295		return (err);
296	}
297
298	return (0);
299}
300
301/*
302 * Validate the given pool properties nvlist and modify the list
303 * for the property values to be set.
304 */
305static int
306spa_prop_validate(spa_t *spa, nvlist_t *props)
307{
308	nvpair_t *elem;
309	int error = 0, reset_bootfs = 0;
310	uint64_t objnum;
311
312	elem = NULL;
313	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
314		zpool_prop_t prop;
315		char *propname, *strval;
316		uint64_t intval;
317		vdev_t *rvdev;
318		char *vdev_type;
319		objset_t *os;
320		char *slash;
321
322		propname = nvpair_name(elem);
323
324		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
325			return (EINVAL);
326
327		switch (prop) {
328		case ZPOOL_PROP_VERSION:
329			error = nvpair_value_uint64(elem, &intval);
330			if (!error &&
331			    (intval < spa_version(spa) || intval > SPA_VERSION))
332				error = EINVAL;
333			break;
334
335		case ZPOOL_PROP_DELEGATION:
336		case ZPOOL_PROP_AUTOREPLACE:
337			error = nvpair_value_uint64(elem, &intval);
338			if (!error && intval > 1)
339				error = EINVAL;
340			break;
341
342		case ZPOOL_PROP_BOOTFS:
343			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
344				error = ENOTSUP;
345				break;
346			}
347
348			/*
349	 * A bootable filesystem cannot be on a RAIDZ pool
350	 * or on a striped pool with more than one device.
351			 */
352			rvdev = spa->spa_root_vdev;
353			vdev_type =
354			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
355			if (rvdev->vdev_children > 1 ||
356			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
357			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
358				error = ENOTSUP;
359				break;
360			}
361
362			reset_bootfs = 1;
363
364			error = nvpair_value_string(elem, &strval);
365
366			if (!error) {
367				if (strval == NULL || strval[0] == '\0') {
368					objnum = zpool_prop_default_numeric(
369					    ZPOOL_PROP_BOOTFS);
370					break;
371				}
372
373				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
374				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
375					break;
376				objnum = dmu_objset_id(os);
377				dmu_objset_close(os);
378			}
379			break;
380		case ZPOOL_PROP_FAILUREMODE:
381			error = nvpair_value_uint64(elem, &intval);
382			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
383			    intval > ZIO_FAILURE_MODE_PANIC))
384				error = EINVAL;
385
386			/*
387			 * This is a special case which only occurs when
388			 * the pool has completely failed. This allows
389			 * the user to change the in-core failmode property
390			 * without syncing it out to disk (I/Os might
391			 * currently be blocked). We do this by returning
392			 * EIO to the caller (spa_prop_set) to trick it
393			 * into thinking we encountered a property validation
394			 * error.
395			 */
396			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
397				spa->spa_failmode = intval;
398				error = EIO;
399			}
400			break;
401
402		case ZPOOL_PROP_CACHEFILE:
403			if ((error = nvpair_value_string(elem, &strval)) != 0)
404				break;
405
406			if (strval[0] == '\0')
407				break;
408
409			if (strcmp(strval, "none") == 0)
410				break;
411
412			if (strval[0] != '/') {
413				error = EINVAL;
414				break;
415			}
416
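			/*
			 * Reject cachefile paths whose final component is
			 * empty (trailing slash), "." or "..", since such a
			 * path cannot name a file.
			 */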
417			slash = strrchr(strval, '/');
418			ASSERT(slash != NULL);
419
420			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
421			    strcmp(slash, "/..") == 0)
422				error = EINVAL;
423			break;
424		}
425
426		if (error)
427			break;
428	}
429
430	if (!error && reset_bootfs) {
431		error = nvlist_remove(props,
432		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
433
434		if (!error) {
435			error = nvlist_add_uint64(props,
436			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
437		}
438	}
439
440	return (error);
441}
442
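/*
 * Set pool properties: validate the nvlist in open context, then apply it in
 * syncing context via the spa_sync_props() sync task.
 */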
443int
444spa_prop_set(spa_t *spa, nvlist_t *nvp)
445{
446	int error;
447
448	if ((error = spa_prop_validate(spa, nvp)) != 0)
449		return (error);
450
451	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
452	    spa, nvp, 3));
453}
454
455/*
456 * If the bootfs property value is dsobj, clear it.
457 */
458void
459spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
460{
461	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
462		VERIFY(zap_remove(spa->spa_meta_objset,
463		    spa->spa_pool_props_object,
464		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
465		spa->spa_bootfs = 0;
466	}
467}
468
469/*
470 * ==========================================================================
471 * SPA state manipulation (open/create/destroy/import/export)
472 * ==========================================================================
473 */
474
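/*
 * AVL comparison function: orders error-log entries by a raw memory compare
 * of their zbookmarks.
 */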
475static int
476spa_error_entry_compare(const void *a, const void *b)
477{
478	spa_error_entry_t *sa = (spa_error_entry_t *)a;
479	spa_error_entry_t *sb = (spa_error_entry_t *)b;
480	int ret;
481
482	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
483	    sizeof (zbookmark_t));
484
485	if (ret < 0)
486		return (-1);
487	else if (ret > 0)
488		return (1);
489	else
490		return (0);
491}
492
493/*
494 * Utility function which retrieves copies of the current logs and
495 * re-initializes them in the process.
496 */
497void
498spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
499{
500	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
501
502	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
503	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
504
505	avl_create(&spa->spa_errlist_scrub,
506	    spa_error_entry_compare, sizeof (spa_error_entry_t),
507	    offsetof(spa_error_entry_t, se_avl));
508	avl_create(&spa->spa_errlist_last,
509	    spa_error_entry_compare, sizeof (spa_error_entry_t),
510	    offsetof(spa_error_entry_t, se_avl));
511}
512
513/*
514 * Activate an uninitialized pool.
515 */
516static void
517spa_activate(spa_t *spa)
518{
519	int t;
520
521	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
522
523	spa->spa_state = POOL_STATE_ACTIVE;
524
525	spa->spa_normal_class = metaslab_class_create();
526	spa->spa_log_class = metaslab_class_create();
527
528	for (t = 0; t < ZIO_TYPES; t++) {
529		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
530		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
531		    TASKQ_PREPOPULATE);
532		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
533		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
534		    TASKQ_PREPOPULATE);
535	}
536
537	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
538	    offsetof(vdev_t, vdev_dirty_node));
539	list_create(&spa->spa_zio_list, sizeof (zio_t),
540	    offsetof(zio_t, zio_link_node));
541
542	txg_list_create(&spa->spa_vdev_txg_list,
543	    offsetof(struct vdev, vdev_txg_node));
544
545	avl_create(&spa->spa_errlist_scrub,
546	    spa_error_entry_compare, sizeof (spa_error_entry_t),
547	    offsetof(spa_error_entry_t, se_avl));
548	avl_create(&spa->spa_errlist_last,
549	    spa_error_entry_compare, sizeof (spa_error_entry_t),
550	    offsetof(spa_error_entry_t, se_avl));
551}
552
553/*
554 * Opposite of spa_activate().
555 */
556static void
557spa_deactivate(spa_t *spa)
558{
559	int t;
560
561	ASSERT(spa->spa_sync_on == B_FALSE);
562	ASSERT(spa->spa_dsl_pool == NULL);
563	ASSERT(spa->spa_root_vdev == NULL);
564
565	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
566
567	txg_list_destroy(&spa->spa_vdev_txg_list);
568
569	list_destroy(&spa->spa_dirty_list);
570	list_destroy(&spa->spa_zio_list);
571
572	for (t = 0; t < ZIO_TYPES; t++) {
573		taskq_destroy(spa->spa_zio_issue_taskq[t]);
574		taskq_destroy(spa->spa_zio_intr_taskq[t]);
575		spa->spa_zio_issue_taskq[t] = NULL;
576		spa->spa_zio_intr_taskq[t] = NULL;
577	}
578
579	metaslab_class_destroy(spa->spa_normal_class);
580	spa->spa_normal_class = NULL;
581
582	metaslab_class_destroy(spa->spa_log_class);
583	spa->spa_log_class = NULL;
584
585	/*
586	 * If this was part of an import or the open otherwise failed, we may
587	 * still have errors left in the queues.  Empty them just in case.
588	 */
589	spa_errlog_drain(spa);
590
591	avl_destroy(&spa->spa_errlist_scrub);
592	avl_destroy(&spa->spa_errlist_last);
593
594	spa->spa_state = POOL_STATE_UNINITIALIZED;
595}
596
597/*
598 * Verify a pool configuration, and construct the vdev tree appropriately.  This
599 * will create all the necessary vdevs in the appropriate layout, with each vdev
600 * in the CLOSED state.  This will prep the pool before open/creation/import.
601 * All vdev validation is done by the vdev_alloc() routine.
602 */
603static int
604spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
605    uint_t id, int atype)
606{
607	nvlist_t **child;
608	uint_t c, children;
609	int error;
610
611	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
612		return (error);
613
614	if ((*vdp)->vdev_ops->vdev_op_leaf)
615		return (0);
616
617	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
618	    &child, &children) != 0) {
619		vdev_free(*vdp);
620		*vdp = NULL;
621		return (EINVAL);
622	}
623
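	/*
	 * Recursively construct a vdev for each child of this interior vdev.
	 */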
624	for (c = 0; c < children; c++) {
625		vdev_t *vd;
626		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
627		    atype)) != 0) {
628			vdev_free(*vdp);
629			*vdp = NULL;
630			return (error);
631		}
632	}
633
634	ASSERT(*vdp != NULL);
635
636	return (0);
637}
638
639/*
640 * Opposite of spa_load().
641 */
642static void
643spa_unload(spa_t *spa)
644{
645	int i;
646
647	/*
648	 * Stop async tasks.
649	 */
650	spa_async_suspend(spa);
651
652	/*
653	 * Stop syncing.
654	 */
655	if (spa->spa_sync_on) {
656		txg_sync_stop(spa->spa_dsl_pool);
657		spa->spa_sync_on = B_FALSE;
658	}
659
660	/*
661	 * Wait for any outstanding prefetch I/O to complete.
662	 */
663	spa_config_enter(spa, RW_WRITER, FTAG);
664	spa_config_exit(spa, FTAG);
665
666	/*
667	 * Drop and purge level 2 cache
668	 */
669	spa_l2cache_drop(spa);
670
671	/*
672	 * Close the dsl pool.
673	 */
674	if (spa->spa_dsl_pool) {
675		dsl_pool_close(spa->spa_dsl_pool);
676		spa->spa_dsl_pool = NULL;
677	}
678
679	/*
680	 * Close all vdevs.
681	 */
682	if (spa->spa_root_vdev)
683		vdev_free(spa->spa_root_vdev);
684	ASSERT(spa->spa_root_vdev == NULL);
685
686	for (i = 0; i < spa->spa_spares.sav_count; i++)
687		vdev_free(spa->spa_spares.sav_vdevs[i]);
688	if (spa->spa_spares.sav_vdevs) {
689		kmem_free(spa->spa_spares.sav_vdevs,
690		    spa->spa_spares.sav_count * sizeof (void *));
691		spa->spa_spares.sav_vdevs = NULL;
692	}
693	if (spa->spa_spares.sav_config) {
694		nvlist_free(spa->spa_spares.sav_config);
695		spa->spa_spares.sav_config = NULL;
696	}
697
698	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
699		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
700	if (spa->spa_l2cache.sav_vdevs) {
701		kmem_free(spa->spa_l2cache.sav_vdevs,
702		    spa->spa_l2cache.sav_count * sizeof (void *));
703		spa->spa_l2cache.sav_vdevs = NULL;
704	}
705	if (spa->spa_l2cache.sav_config) {
706		nvlist_free(spa->spa_l2cache.sav_config);
707		spa->spa_l2cache.sav_config = NULL;
708	}
709
710	spa->spa_async_suspended = 0;
711}
712
713/*
714 * Load (or re-load) the current list of vdevs describing the active spares for
715 * this pool.  When this is called, we have some form of basic information in
716 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
717 * then re-generate a more complete list including status information.
718 */
719static void
720spa_load_spares(spa_t *spa)
721{
722	nvlist_t **spares;
723	uint_t nspares;
724	int i;
725	vdev_t *vd, *tvd;
726
727	/*
728	 * First, close and free any existing spare vdevs.
729	 */
730	for (i = 0; i < spa->spa_spares.sav_count; i++) {
731		vd = spa->spa_spares.sav_vdevs[i];
732
733		/* Undo the call to spa_activate() below */
734		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
735		    tvd->vdev_isspare)
736			spa_spare_remove(tvd);
737		vdev_close(vd);
738		vdev_free(vd);
739	}
740
741	if (spa->spa_spares.sav_vdevs)
742		kmem_free(spa->spa_spares.sav_vdevs,
743		    spa->spa_spares.sav_count * sizeof (void *));
744
745	if (spa->spa_spares.sav_config == NULL)
746		nspares = 0;
747	else
748		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
749		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
750
751	spa->spa_spares.sav_count = (int)nspares;
752	spa->spa_spares.sav_vdevs = NULL;
753
754	if (nspares == 0)
755		return;
756
757	/*
758	 * Construct the array of vdevs, opening them to get status in the
759	 * process.  For each spare, there are potentially two different vdev_t
760	 * structures associated with it: one in the list of spares (used only
761	 * for basic validation purposes) and one in the active vdev
762	 * configuration (if it's spared in).  During this phase we open and
763	 * validate each vdev on the spare list.  If the vdev also exists in the
764	 * active configuration, then we also mark this vdev as an active spare.
765	 */
766	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
767	    KM_SLEEP);
768	for (i = 0; i < spa->spa_spares.sav_count; i++) {
769		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
770		    VDEV_ALLOC_SPARE) == 0);
771		ASSERT(vd != NULL);
772
773		spa->spa_spares.sav_vdevs[i] = vd;
774
775		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
776			if (!tvd->vdev_isspare)
777				spa_spare_add(tvd);
778
779			/*
780			 * We only mark the spare active if we were successfully
781			 * able to load the vdev.  Otherwise, importing a pool
782			 * with a bad active spare would result in strange
783			 * behavior, because multiple pool would think the spare
784	 * behavior, because multiple pools would think the spare
785			 *
786			 * There is a vulnerability here to an equally bizarre
787			 * circumstance, where a dead active spare is later
788			 * brought back to life (onlined or otherwise).  Given
789			 * the rarity of this scenario, and the extra complexity
790			 * it adds, we ignore the possibility.
791			 */
792			if (!vdev_is_dead(tvd))
793				spa_spare_activate(tvd);
794		}
795
796		if (vdev_open(vd) != 0)
797			continue;
798
799		vd->vdev_top = vd;
800		if (vdev_validate_aux(vd) == 0)
801			spa_spare_add(vd);
802	}
803
804	/*
805	 * Recompute the stashed list of spares, with status information
806	 * this time.
807	 */
808	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
809	    DATA_TYPE_NVLIST_ARRAY) == 0);
810
811	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
812	    KM_SLEEP);
813	for (i = 0; i < spa->spa_spares.sav_count; i++)
814		spares[i] = vdev_config_generate(spa,
815		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
816	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
817	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
818	for (i = 0; i < spa->spa_spares.sav_count; i++)
819		nvlist_free(spares[i]);
820	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
821}
822
823/*
824 * Load (or re-load) the current list of vdevs describing the active l2cache for
825 * this pool.  When this is called, we have some form of basic information in
826 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
827 * then re-generate a more complete list including status information.
828 * Devices which are already active have their details maintained, and are
829 * not re-opened.
830 */
831static void
832spa_load_l2cache(spa_t *spa)
833{
834	nvlist_t **l2cache;
835	uint_t nl2cache;
836	int i, j, oldnvdevs;
837	uint64_t guid;
838	vdev_t *vd, **oldvdevs, **newvdevs;
839	spa_aux_vdev_t *sav = &spa->spa_l2cache;
840
841	if (sav->sav_config != NULL) {
842		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
843		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
844		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
845	} else {
846		nl2cache = 0;
847	}
848
849	oldvdevs = sav->sav_vdevs;
850	oldnvdevs = sav->sav_count;
851	sav->sav_vdevs = NULL;
852	sav->sav_count = 0;
853
854	/*
855	 * Process new nvlist of vdevs.
856	 */
857	for (i = 0; i < nl2cache; i++) {
858		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
859		    &guid) == 0);
860
861		newvdevs[i] = NULL;
862		for (j = 0; j < oldnvdevs; j++) {
863			vd = oldvdevs[j];
864			if (vd != NULL && guid == vd->vdev_guid) {
865				/*
866				 * Retain previous vdev for add/remove ops.
867				 */
868				newvdevs[i] = vd;
869				oldvdevs[j] = NULL;
870				break;
871			}
872		}
873
874		if (newvdevs[i] == NULL) {
875			/*
876			 * Create new vdev
877			 */
878			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
879			    VDEV_ALLOC_L2CACHE) == 0);
880			ASSERT(vd != NULL);
881			newvdevs[i] = vd;
882
883			/*
884			 * Commit this vdev as an l2cache device,
885			 * even if it fails to open.
886			 */
887			spa_l2cache_add(vd);
888
889			if (vdev_open(vd) != 0)
890				continue;
891
892			vd->vdev_top = vd;
893			(void) vdev_validate_aux(vd);
894
895			if (!vdev_is_dead(vd)) {
896				uint64_t size;
897				size = vdev_get_rsize(vd);
898				ASSERT3U(size, >, 0);
899				if (spa_mode & FWRITE) {
900					l2arc_add_vdev(spa, vd,
901					    VDEV_LABEL_START_SIZE,
902					    size - VDEV_LABEL_START_SIZE);
903				}
904				spa_l2cache_activate(vd);
905			}
906		}
907	}
908
909	/*
910	 * Purge vdevs that were dropped
911	 */
912	for (i = 0; i < oldnvdevs; i++) {
913		uint64_t pool;
914
915		vd = oldvdevs[i];
916		if (vd != NULL) {
917			if (spa_mode & FWRITE &&
918			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
919			    pool != 0ULL) {
920				l2arc_remove_vdev(vd);
921			}
922			(void) vdev_close(vd);
923			spa_l2cache_remove(vd);
924		}
925	}
926
927	if (oldvdevs)
928		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
929
930	if (sav->sav_config == NULL)
931		goto out;
932
933	sav->sav_vdevs = newvdevs;
934	sav->sav_count = (int)nl2cache;
935
936	/*
937	 * Recompute the stashed list of l2cache devices, with status
938	 * information this time.
939	 */
940	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
941	    DATA_TYPE_NVLIST_ARRAY) == 0);
942
943	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
944	for (i = 0; i < sav->sav_count; i++)
945		l2cache[i] = vdev_config_generate(spa,
946		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
947	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
948	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
949out:
950	for (i = 0; i < sav->sav_count; i++)
951		nvlist_free(l2cache[i]);
952	if (sav->sav_count)
953		kmem_free(l2cache, sav->sav_count * sizeof (void *));
954}
955
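/*
 * Read a packed nvlist from a DMU object: the bonus buffer holds the packed
 * size, and the object itself holds the packed bytes.
 */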
956static int
957load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
958{
959	dmu_buf_t *db;
960	char *packed = NULL;
961	size_t nvsize = 0;
962	int error;
963	*value = NULL;
964
965	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
966	nvsize = *(uint64_t *)db->db_data;
967	dmu_buf_rele(db, FTAG);
968
969	packed = kmem_alloc(nvsize, KM_SLEEP);
970	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
971	if (error == 0)
972		error = nvlist_unpack(packed, nvsize, value, 0);
973	kmem_free(packed, nvsize);
974
975	return (error);
976}
977
978/*
979 * Checks to see if the given vdev could not be opened, in which case we post a
980 * sysevent to notify the autoreplace code that the device has been removed.
981 */
982static void
983spa_check_removed(vdev_t *vd)
984{
985	int c;
986
987	for (c = 0; c < vd->vdev_children; c++)
988		spa_check_removed(vd->vdev_child[c]);
989
990	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
991		zfs_post_autoreplace(vd->vdev_spa, vd);
992		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
993	}
994}
995
996/*
997 * Load an existing storage pool, using the pool's builtin spa_config as a
998 * source of configuration information.
999 */
1000static int
1001spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
1002{
1003	int error = 0;
1004	nvlist_t *nvroot = NULL;
1005	vdev_t *rvd;
1006	uberblock_t *ub = &spa->spa_uberblock;
1007	uint64_t config_cache_txg = spa->spa_config_txg;
1008	uint64_t pool_guid;
1009	uint64_t version;
1010	zio_t *zio;
1011	uint64_t autoreplace = 0;
1012
1013	spa->spa_load_state = state;
1014
1015	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1016	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
1017		error = EINVAL;
1018		goto out;
1019	}
1020
1021	/*
1022	 * Versioning wasn't explicitly added to the label until later, so if
1023	 * it's not present treat it as the initial version.
1024	 */
1025	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
1026		version = SPA_VERSION_INITIAL;
1027
1028	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1029	    &spa->spa_config_txg);
1030
1031	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1032	    spa_guid_exists(pool_guid, 0)) {
1033		error = EEXIST;
1034		goto out;
1035	}
1036
1037	spa->spa_load_guid = pool_guid;
1038
1039	/*
1040	 * Parse the configuration into a vdev tree.  We explicitly set the
1041	 * value that will be returned by spa_version() since parsing the
1042	 * configuration requires knowing the version number.
1043	 */
1044	spa_config_enter(spa, RW_WRITER, FTAG);
1045	spa->spa_ubsync.ub_version = version;
1046	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
1047	spa_config_exit(spa, FTAG);
1048
1049	if (error != 0)
1050		goto out;
1051
1052	ASSERT(spa->spa_root_vdev == rvd);
1053	ASSERT(spa_guid(spa) == pool_guid);
1054
1055	/*
1056	 * Try to open all vdevs, loading each label in the process.
1057	 */
1058	error = vdev_open(rvd);
1059	if (error != 0)
1060		goto out;
1061
1062	/*
1063	 * Validate the labels for all leaf vdevs.  We need to grab the config
1064	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
1065	 * flag.
1066	 */
1067	spa_config_enter(spa, RW_READER, FTAG);
1068	error = vdev_validate(rvd);
1069	spa_config_exit(spa, FTAG);
1070
1071	if (error != 0)
1072		goto out;
1073
1074	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1075		error = ENXIO;
1076		goto out;
1077	}
1078
1079	/*
1080	 * Find the best uberblock.
1081	 */
1082	bzero(ub, sizeof (uberblock_t));
1083
1084	zio = zio_root(spa, NULL, NULL,
1085	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1086	vdev_uberblock_load(zio, rvd, ub);
1087	error = zio_wait(zio);
1088
1089	/*
1090	 * If we weren't able to find a single valid uberblock, return failure.
1091	 */
1092	if (ub->ub_txg == 0) {
1093		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1094		    VDEV_AUX_CORRUPT_DATA);
1095		error = ENXIO;
1096		goto out;
1097	}
1098
1099	/*
1100	 * If the pool is newer than the code, we can't open it.
1101	 */
1102	if (ub->ub_version > SPA_VERSION) {
1103		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1104		    VDEV_AUX_VERSION_NEWER);
1105		error = ENOTSUP;
1106		goto out;
1107	}
1108
1109	/*
1110	 * If the vdev guid sum doesn't match the uberblock, we have an
1111	 * incomplete configuration.
1112	 */
1113	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
1114		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1115		    VDEV_AUX_BAD_GUID_SUM);
1116		error = ENXIO;
1117		goto out;
1118	}
1119
1120	/*
1121	 * Initialize internal SPA structures.
1122	 */
1123	spa->spa_state = POOL_STATE_ACTIVE;
1124	spa->spa_ubsync = spa->spa_uberblock;
1125	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
1126	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1127	if (error) {
1128		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1129		    VDEV_AUX_CORRUPT_DATA);
1130		goto out;
1131	}
1132	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1133
1134	if (zap_lookup(spa->spa_meta_objset,
1135	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1136	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
1137		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1138		    VDEV_AUX_CORRUPT_DATA);
1139		error = EIO;
1140		goto out;
1141	}
1142
1143	if (!mosconfig) {
1144		nvlist_t *newconfig;
1145		uint64_t hostid;
1146
1147		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
1148			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1149			    VDEV_AUX_CORRUPT_DATA);
1150			error = EIO;
1151			goto out;
1152		}
1153
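		/*
		 * Compare the hostid recorded in the on-disk config with our
		 * own; refuse the load if the pool was last accessed by
		 * another system, presumably to guard against importing a
		 * pool that is still in use elsewhere.
		 */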
1154		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
1155		    &hostid) == 0) {
1156			char *hostname;
1157			unsigned long myhostid = 0;
1158
1159			VERIFY(nvlist_lookup_string(newconfig,
1160			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
1161
1162			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
1163			if (hostid != 0 && myhostid != 0 &&
1164			    (unsigned long)hostid != myhostid) {
1165				cmn_err(CE_WARN, "pool '%s' could not be "
1166				    "loaded as it was last accessed by "
1167				    "another system (host: %s hostid: 0x%lx).  "
1168				    "See: http://www.sun.com/msg/ZFS-8000-EY",
1169				    spa->spa_name, hostname,
1170				    (unsigned long)hostid);
1171				error = EBADF;
1172				goto out;
1173			}
1174		}
1175
1176		spa_config_set(spa, newconfig);
1177		spa_unload(spa);
1178		spa_deactivate(spa);
1179		spa_activate(spa);
1180
1181		return (spa_load(spa, newconfig, state, B_TRUE));
1182	}
1183
1184	if (zap_lookup(spa->spa_meta_objset,
1185	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1186	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
1187		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1188		    VDEV_AUX_CORRUPT_DATA);
1189		error = EIO;
1190		goto out;
1191	}
1192
1193	/*
1194	 * Load the bit that tells us to use the new accounting function
1195	 * (raid-z deflation).  If we have an older pool, this will not
1196	 * be present.
1197	 */
1198	error = zap_lookup(spa->spa_meta_objset,
1199	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1200	    sizeof (uint64_t), 1, &spa->spa_deflate);
1201	if (error != 0 && error != ENOENT) {
1202		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1203		    VDEV_AUX_CORRUPT_DATA);
1204		error = EIO;
1205		goto out;
1206	}
1207
1208	/*
1209	 * Load the persistent error log.  If we have an older pool, this will
1210	 * not be present.
1211	 */
1212	error = zap_lookup(spa->spa_meta_objset,
1213	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
1214	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
1215	if (error != 0 && error != ENOENT) {
1216		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1217		    VDEV_AUX_CORRUPT_DATA);
1218		error = EIO;
1219		goto out;
1220	}
1221
1222	error = zap_lookup(spa->spa_meta_objset,
1223	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
1224	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
1225	if (error != 0 && error != ENOENT) {
1226		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1227		    VDEV_AUX_CORRUPT_DATA);
1228		error = EIO;
1229		goto out;
1230	}
1231
1232	/*
1233	 * Load the history object.  If we have an older pool, this
1234	 * will not be present.
1235	 */
1236	error = zap_lookup(spa->spa_meta_objset,
1237	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
1238	    sizeof (uint64_t), 1, &spa->spa_history);
1239	if (error != 0 && error != ENOENT) {
1240		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1241		    VDEV_AUX_CORRUPT_DATA);
1242		error = EIO;
1243		goto out;
1244	}
1245
1246	/*
1247	 * Load any hot spares for this pool.
1248	 */
1249	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1250	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
1251	if (error != 0 && error != ENOENT) {
1252		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1253		    VDEV_AUX_CORRUPT_DATA);
1254		error = EIO;
1255		goto out;
1256	}
1257	if (error == 0) {
1258		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
1259		if (load_nvlist(spa, spa->spa_spares.sav_object,
1260		    &spa->spa_spares.sav_config) != 0) {
1261			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1262			    VDEV_AUX_CORRUPT_DATA);
1263			error = EIO;
1264			goto out;
1265		}
1266
1267		spa_config_enter(spa, RW_WRITER, FTAG);
1268		spa_load_spares(spa);
1269		spa_config_exit(spa, FTAG);
1270	}
1271
1272	/*
1273	 * Load any level 2 ARC devices for this pool.
1274	 */
1275	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1276	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
1277	    &spa->spa_l2cache.sav_object);
1278	if (error != 0 && error != ENOENT) {
1279		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1280		    VDEV_AUX_CORRUPT_DATA);
1281		error = EIO;
1282		goto out;
1283	}
1284	if (error == 0) {
1285		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
1286		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
1287		    &spa->spa_l2cache.sav_config) != 0) {
1288			vdev_set_state(rvd, B_TRUE,
1289			    VDEV_STATE_CANT_OPEN,
1290			    VDEV_AUX_CORRUPT_DATA);
1291			error = EIO;
1292			goto out;
1293		}
1294
1295		spa_config_enter(spa, RW_WRITER, FTAG);
1296		spa_load_l2cache(spa);
1297		spa_config_exit(spa, FTAG);
1298	}
1299
1300	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
1301
1302	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1303	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
1304
1305	if (error && error != ENOENT) {
1306		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1307		    VDEV_AUX_CORRUPT_DATA);
1308		error = EIO;
1309		goto out;
1310	}
1311
1312	if (error == 0) {
1313		(void) zap_lookup(spa->spa_meta_objset,
1314		    spa->spa_pool_props_object,
1315		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
1316		    sizeof (uint64_t), 1, &spa->spa_bootfs);
1317		(void) zap_lookup(spa->spa_meta_objset,
1318		    spa->spa_pool_props_object,
1319		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
1320		    sizeof (uint64_t), 1, &autoreplace);
1321		(void) zap_lookup(spa->spa_meta_objset,
1322		    spa->spa_pool_props_object,
1323		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
1324		    sizeof (uint64_t), 1, &spa->spa_delegation);
1325		(void) zap_lookup(spa->spa_meta_objset,
1326		    spa->spa_pool_props_object,
1327		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
1328		    sizeof (uint64_t), 1, &spa->spa_failmode);
1329	}
1330
1331	/*
1332	 * If the 'autoreplace' property is set, then post a resource notifying
1333	 * the ZFS DE that it should not issue any faults for unopenable
1334	 * devices.  We also iterate over the vdevs, and post a sysevent for any
1335	 * unopenable vdevs so that the normal autoreplace handler can take
1336	 * over.
1337	 */
1338	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
1339		spa_check_removed(spa->spa_root_vdev);
1340
1341	/*
1342	 * Load the vdev state for all toplevel vdevs.
1343	 */
1344	vdev_load(rvd);
1345
1346	/*
1347	 * Propagate the leaf DTLs we just loaded all the way up the tree.
1348	 */
1349	spa_config_enter(spa, RW_WRITER, FTAG);
1350	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
1351	spa_config_exit(spa, FTAG);
1352
1353	/*
1354	 * Check the state of the root vdev.  If it can't be opened, it
1355	 * indicates one or more toplevel vdevs are faulted.
1356	 */
1357	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1358		error = ENXIO;
1359		goto out;
1360	}
1361
1362	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
1363		dmu_tx_t *tx;
1364		int need_update = B_FALSE;
1365		int c;
1366
1367		/*
1368		 * Claim log blocks that haven't been committed yet.
1369		 * This must all happen in a single txg.
1370		 */
1371		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
1372		    spa_first_txg(spa));
1373		(void) dmu_objset_find(spa->spa_name,
1374		    zil_claim, tx, DS_FIND_CHILDREN);
1375		dmu_tx_commit(tx);
1376
1377		spa->spa_sync_on = B_TRUE;
1378		txg_sync_start(spa->spa_dsl_pool);
1379
1380		/*
1381		 * Wait for all claims to sync.
1382		 */
1383		txg_wait_synced(spa->spa_dsl_pool, 0);
1384
1385		/*
1386		 * If the config cache is stale, or we have uninitialized
1387		 * metaslabs (see spa_vdev_add()), then update the config.
1388		 */
1389		if (config_cache_txg != spa->spa_config_txg ||
1390		    state == SPA_LOAD_IMPORT)
1391			need_update = B_TRUE;
1392
1393		for (c = 0; c < rvd->vdev_children; c++)
1394			if (rvd->vdev_child[c]->vdev_ms_array == 0)
1395				need_update = B_TRUE;
1396
1397		/*
1398		 * Update the config cache asynchronously in case we're the
1399		 * root pool, in which case the config cache isn't writable yet.
1400		 */
1401		if (need_update)
1402			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
1403	}
1404
1405	error = 0;
1406out:
1407	if (error && error != EBADF)
1408		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
1409	spa->spa_load_state = SPA_LOAD_NONE;
1410	spa->spa_ena = 0;
1411
1412	return (error);
1413}
1414
1415/*
1416 * Pool Open/Import
1417 *
1418 * The import case is identical to an open except that the configuration is sent
1419 * down from userland, instead of grabbed from the configuration cache.  For the
1420 * case of an open, the pool configuration will exist in the
1421 * POOL_STATE_UNINITIALIZED state.
1422 *
1423 * The stats information (gen/count/ustats) is used to gather vdev statistics at
1424 * the same time we open the pool, without having to keep around the spa_t in
1425 * some ambiguous state.
1426 */
1427static int
1428spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
1429{
1430	spa_t *spa;
1431	int error;
1432	int loaded = B_FALSE;
1433	int locked = B_FALSE;
1434
1435	*spapp = NULL;
1436
1437	/*
1438	 * As disgusting as this is, we need to support recursive calls to this
1439	 * function because dsl_dir_open() is called during spa_load(), and ends
1440	 * up calling spa_open() again.  The real fix is to figure out how to
1441	 * avoid dsl_dir_open() calling this in the first place.
1442	 */
1443	if (mutex_owner(&spa_namespace_lock) != curthread) {
1444		mutex_enter(&spa_namespace_lock);
1445		locked = B_TRUE;
1446	}
1447
1448	if ((spa = spa_lookup(pool)) == NULL) {
1449		if (locked)
1450			mutex_exit(&spa_namespace_lock);
1451		return (ENOENT);
1452	}
1453	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
1454
1455		spa_activate(spa);
1456
1457		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
1458
1459		if (error == EBADF) {
1460			/*
1461			 * If vdev_validate() returns failure (indicated by
1462			 * EBADF), then one of the vdevs reports that the pool
1463			 * has been exported or destroyed.  If
1464			 * this is the case, the config cache is out of sync and
1465			 * we should remove the pool from the namespace.
1466			 */
1467			zfs_post_ok(spa, NULL);
1468			spa_unload(spa);
1469			spa_deactivate(spa);
1470			spa_remove(spa);
1471			spa_config_sync();
1472			if (locked)
1473				mutex_exit(&spa_namespace_lock);
1474			return (ENOENT);
1475		}
1476
1477		if (error) {
1478			/*
1479			 * We can't open the pool, but we still have useful
1480			 * information: the state of each vdev after the
1481			 * attempted vdev_open().  Return this to the user.
1482			 */
1483			if (config != NULL && spa->spa_root_vdev != NULL) {
1484				spa_config_enter(spa, RW_READER, FTAG);
1485				*config = spa_config_generate(spa, NULL, -1ULL,
1486				    B_TRUE);
1487				spa_config_exit(spa, FTAG);
1488			}
1489			spa_unload(spa);
1490			spa_deactivate(spa);
1491			spa->spa_last_open_failed = B_TRUE;
1492			if (locked)
1493				mutex_exit(&spa_namespace_lock);
1494			*spapp = NULL;
1495			return (error);
1496		} else {
1497			zfs_post_ok(spa, NULL);
1498			spa->spa_last_open_failed = B_FALSE;
1499		}
1500
1501		loaded = B_TRUE;
1502	}
1503
1504	spa_open_ref(spa, tag);
1505
1506	/*
1507	 * If we just loaded the pool, resilver anything that's out of date.
1508	 */
1509	if (loaded && (spa_mode & FWRITE))
1510		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1511
1512	if (locked)
1513		mutex_exit(&spa_namespace_lock);
1514
1515	*spapp = spa;
1516
1517	if (config != NULL) {
1518		spa_config_enter(spa, RW_READER, FTAG);
1519		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1520		spa_config_exit(spa, FTAG);
1521	}
1522
1523	return (0);
1524}
1525
1526int
1527spa_open(const char *name, spa_t **spapp, void *tag)
1528{
1529	return (spa_open_common(name, spapp, tag, NULL));
1530}
1531
1532/*
1533 * Lookup the given spa_t, incrementing the inject count in the process,
1534 * preventing it from being exported or destroyed.
1535 */
1536spa_t *
1537spa_inject_addref(char *name)
1538{
1539	spa_t *spa;
1540
1541	mutex_enter(&spa_namespace_lock);
1542	if ((spa = spa_lookup(name)) == NULL) {
1543		mutex_exit(&spa_namespace_lock);
1544		return (NULL);
1545	}
1546	spa->spa_inject_ref++;
1547	mutex_exit(&spa_namespace_lock);
1548
1549	return (spa);
1550}
1551
1552void
1553spa_inject_delref(spa_t *spa)
1554{
1555	mutex_enter(&spa_namespace_lock);
1556	spa->spa_inject_ref--;
1557	mutex_exit(&spa_namespace_lock);
1558}
1559
1560/*
1561 * Add spares device information to the nvlist.
1562 */
1563static void
1564spa_add_spares(spa_t *spa, nvlist_t *config)
1565{
1566	nvlist_t **spares;
1567	uint_t i, nspares;
1568	nvlist_t *nvroot;
1569	uint64_t guid;
1570	vdev_stat_t *vs;
1571	uint_t vsc;
1572	uint64_t pool;
1573
1574	if (spa->spa_spares.sav_count == 0)
1575		return;
1576
1577	VERIFY(nvlist_lookup_nvlist(config,
1578	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1579	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1580	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1581	if (nspares != 0) {
1582		VERIFY(nvlist_add_nvlist_array(nvroot,
1583		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1584		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1585		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1586
1587		/*
1588		 * Go through and find any spares which have since been
1589		 * repurposed as active spares.  If this is the case, update
1590		 * their status appropriately.
1591		 */
1592		for (i = 0; i < nspares; i++) {
1593			VERIFY(nvlist_lookup_uint64(spares[i],
1594			    ZPOOL_CONFIG_GUID, &guid) == 0);
1595			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
1596				VERIFY(nvlist_lookup_uint64_array(
1597				    spares[i], ZPOOL_CONFIG_STATS,
1598				    (uint64_t **)&vs, &vsc) == 0);
1599				vs->vs_state = VDEV_STATE_CANT_OPEN;
1600				vs->vs_aux = VDEV_AUX_SPARED;
1601			}
1602		}
1603	}
1604}
1605
1606/*
1607 * Add l2cache device information to the nvlist, including vdev stats.
1608 */
1609static void
1610spa_add_l2cache(spa_t *spa, nvlist_t *config)
1611{
1612	nvlist_t **l2cache;
1613	uint_t i, j, nl2cache;
1614	nvlist_t *nvroot;
1615	uint64_t guid;
1616	vdev_t *vd;
1617	vdev_stat_t *vs;
1618	uint_t vsc;
1619
1620	if (spa->spa_l2cache.sav_count == 0)
1621		return;
1622
1623	spa_config_enter(spa, RW_READER, FTAG);
1624
1625	VERIFY(nvlist_lookup_nvlist(config,
1626	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1627	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
1628	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1629	if (nl2cache != 0) {
1630		VERIFY(nvlist_add_nvlist_array(nvroot,
1631		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1632		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1633		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1634
1635		/*
1636		 * Update level 2 cache device stats.
1637		 */
1638
1639		for (i = 0; i < nl2cache; i++) {
1640			VERIFY(nvlist_lookup_uint64(l2cache[i],
1641			    ZPOOL_CONFIG_GUID, &guid) == 0);
1642
1643			vd = NULL;
1644			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
1645				if (guid ==
1646				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
1647					vd = spa->spa_l2cache.sav_vdevs[j];
1648					break;
1649				}
1650			}
1651			ASSERT(vd != NULL);
1652
1653			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
1654			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
1655			vdev_get_stats(vd, vs);
1656		}
1657	}
1658
1659	spa_config_exit(spa, FTAG);
1660}
1661
1662int
1663spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
1664{
1665	int error;
1666	spa_t *spa;
1667
1668	*config = NULL;
1669	error = spa_open_common(name, &spa, FTAG, config);
1670
1671	if (spa && *config != NULL) {
1672		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
1673		    spa_get_errlog_size(spa)) == 0);
1674
1675		spa_add_spares(spa, *config);
1676		spa_add_l2cache(spa, *config);
1677	}
1678
1679	/*
1680	 * We want to get the alternate root even for faulted pools, so we cheat
1681	 * and call spa_lookup() directly.
1682	 */
1683	if (altroot) {
1684		if (spa == NULL) {
1685			mutex_enter(&spa_namespace_lock);
1686			spa = spa_lookup(name);
1687			if (spa)
1688				spa_altroot(spa, altroot, buflen);
1689			else
1690				altroot[0] = '\0';
1691			spa = NULL;
1692			mutex_exit(&spa_namespace_lock);
1693		} else {
1694			spa_altroot(spa, altroot, buflen);
1695		}
1696	}
1697
1698	if (spa != NULL)
1699		spa_close(spa, FTAG);
1700
1701	return (error);
1702}
1703
1704/*
1705 * Validate that the auxiliary device array is well formed.  We must have an
1706 * array of nvlists, each which describes a valid leaf vdev.  If this is an
1707 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
1708 * specified, as long as they are well-formed.
1709 */
1710static int
1711spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
1712    spa_aux_vdev_t *sav, const char *config, uint64_t version,
1713    vdev_labeltype_t label)
1714{
1715	nvlist_t **dev;
1716	uint_t i, ndev;
1717	vdev_t *vd;
1718	int error;
1719
1720	/*
1721	 * It's acceptable to have no devs specified.
1722	 */
1723	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
1724		return (0);
1725
1726	if (ndev == 0)
1727		return (EINVAL);
1728
1729	/*
1730	 * Make sure the pool is formatted with a version that supports this
1731	 * device type.
1732	 */
1733	if (spa_version(spa) < version)
1734		return (ENOTSUP);
1735
1736	/*
1737	 * Set the pending device list so we correctly handle device in-use
1738	 * checking.
1739	 */
1740	sav->sav_pending = dev;
1741	sav->sav_npending = ndev;
1742
1743	for (i = 0; i < ndev; i++) {
1744		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
1745		    mode)) != 0)
1746			goto out;
1747
1748		if (!vd->vdev_ops->vdev_op_leaf) {
1749			vdev_free(vd);
1750			error = EINVAL;
1751			goto out;
1752		}
1753
1754		/*
1755		 * The L2ARC currently only supports disk devices.
1756		 */
1757		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
1758		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
1759			error = ENOTBLK;
1760			goto out;
1761		}
1762
1763		vd->vdev_top = vd;
1764
1765		if ((error = vdev_open(vd)) == 0 &&
1766		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
1767			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
1768			    vd->vdev_guid) == 0);
1769		}
1770
1771		vdev_free(vd);
1772
1773		if (error &&
1774		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
1775			goto out;
1776		else
1777			error = 0;
1778	}
1779
1780out:
1781	sav->sav_pending = NULL;
1782	sav->sav_npending = 0;
1783	return (error);
1784}
1785
1786static int
1787spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
1788{
1789	int error;
1790
1791	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1792	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
1793	    VDEV_LABEL_SPARE)) != 0) {
1794		return (error);
1795	}
1796
1797	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1798	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
1799	    VDEV_LABEL_L2CACHE));
1800}
1801
1802static void
1803spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
1804    const char *config)
1805{
1806	int i;
1807
1808	if (sav->sav_config != NULL) {
1809		nvlist_t **olddevs;
1810		uint_t oldndevs;
1811		nvlist_t **newdevs;
1812
1813		/*
1814		 * Generate a new dev list by concatenating with the
1815		 * current dev list.
1816		 */
1817		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
1818		    &olddevs, &oldndevs) == 0);
1819
1820		newdevs = kmem_alloc(sizeof (void *) *
1821		    (ndevs + oldndevs), KM_SLEEP);
1822		for (i = 0; i < oldndevs; i++)
1823			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
1824			    KM_SLEEP) == 0);
1825		for (i = 0; i < ndevs; i++)
1826			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
1827			    KM_SLEEP) == 0);
1828
1829		VERIFY(nvlist_remove(sav->sav_config, config,
1830		    DATA_TYPE_NVLIST_ARRAY) == 0);
1831
1832		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1833		    config, newdevs, ndevs + oldndevs) == 0);
1834		for (i = 0; i < oldndevs + ndevs; i++)
1835			nvlist_free(newdevs[i]);
1836		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
1837	} else {
1838		/*
1839		 * Generate a new dev list.
1840		 */
1841		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
1842		    KM_SLEEP) == 0);
1843		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
1844		    devs, ndevs) == 0);
1845	}
1846}
1847
1848/*
1849 * Stop and drop level 2 ARC devices
1850 */
1851void
1852spa_l2cache_drop(spa_t *spa)
1853{
1854	vdev_t *vd;
1855	int i;
1856	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1857
1858	for (i = 0; i < sav->sav_count; i++) {
1859		uint64_t pool;
1860
1861		vd = sav->sav_vdevs[i];
1862		ASSERT(vd != NULL);
1863
1864		if (spa_mode & FWRITE &&
1865		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL) {
1866			l2arc_remove_vdev(vd);
1867		}
1868		if (vd->vdev_isl2cache)
1869			spa_l2cache_remove(vd);
1870		vdev_clear_stats(vd);
1871		(void) vdev_close(vd);
1872	}
1873}
1874
1875/*
1876 * Pool Creation
1877 */
1878int
1879spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
1880    const char *history_str)
1881{
1882	spa_t *spa;
1883	char *altroot = NULL;
1884	vdev_t *rvd;
1885	dsl_pool_t *dp;
1886	dmu_tx_t *tx;
1887	int c, error = 0;
1888	uint64_t txg = TXG_INITIAL;
1889	nvlist_t **spares, **l2cache;
1890	uint_t nspares, nl2cache;
1891	uint64_t version;
1892
1893	/*
1894	 * If this pool already exists, return failure.
1895	 */
1896	mutex_enter(&spa_namespace_lock);
1897	if (spa_lookup(pool) != NULL) {
1898		mutex_exit(&spa_namespace_lock);
1899		return (EEXIST);
1900	}
1901
1902	/*
1903	 * Allocate a new spa_t structure.
1904	 */
1905	(void) nvlist_lookup_string(props,
1906	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
1907	spa = spa_add(pool, altroot);
1908	spa_activate(spa);
1909
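	/*
	 * Nothing has synced yet, so record the last synced txg as the one
	 * just before TXG_INITIAL.
	 */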
1910	spa->spa_uberblock.ub_txg = txg - 1;
1911
1912	if (props && (error = spa_prop_validate(spa, props))) {
1913		spa_unload(spa);
1914		spa_deactivate(spa);
1915		spa_remove(spa);
1916		return (error);
1917	}
1918
1919	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
1920	    &version) != 0)
1921		version = SPA_VERSION;
1922	ASSERT(version <= SPA_VERSION);
1923	spa->spa_uberblock.ub_version = version;
1924	spa->spa_ubsync = spa->spa_uberblock;
1925
1926	/*
1927	 * Create the root vdev.
1928	 */
1929	spa_config_enter(spa, RW_WRITER, FTAG);
1930
1931	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
1932
1933	ASSERT(error != 0 || rvd != NULL);
1934	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
1935
1936	if (error == 0 && !zfs_allocatable_devs(nvroot))
1937		error = EINVAL;
1938
1939	if (error == 0 &&
1940	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
1941	    (error = spa_validate_aux(spa, nvroot, txg,
1942	    VDEV_ALLOC_ADD)) == 0) {
1943		for (c = 0; c < rvd->vdev_children; c++)
1944			vdev_init(rvd->vdev_child[c], txg);
1945		vdev_config_dirty(rvd);
1946	}
1947
1948	spa_config_exit(spa, FTAG);
1949
1950	if (error != 0) {
1951		spa_unload(spa);
1952		spa_deactivate(spa);
1953		spa_remove(spa);
1954		mutex_exit(&spa_namespace_lock);
1955		return (error);
1956	}
1957
1958	/*
1959	 * Get the list of spares, if specified.
1960	 */
1961	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1962	    &spares, &nspares) == 0) {
1963		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
1964		    KM_SLEEP) == 0);
1965		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1966		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1967		spa_config_enter(spa, RW_WRITER, FTAG);
1968		spa_load_spares(spa);
1969		spa_config_exit(spa, FTAG);
1970		spa->spa_spares.sav_sync = B_TRUE;
1971	}
1972
1973	/*
1974	 * Get the list of level 2 cache devices, if specified.
1975	 */
1976	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1977	    &l2cache, &nl2cache) == 0) {
1978		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
1979		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1980		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
1981		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1982		spa_config_enter(spa, RW_WRITER, FTAG);
1983		spa_load_l2cache(spa);
1984		spa_config_exit(spa, FTAG);
1985		spa->spa_l2cache.sav_sync = B_TRUE;
1986	}
1987
1988	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
1989	spa->spa_meta_objset = dp->dp_meta_objset;
1990
1991	tx = dmu_tx_create_assigned(dp, txg);
1992
1993	/*
1994	 * Create the pool config object.
1995	 */
1996	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
1997	    DMU_OT_PACKED_NVLIST, 1 << 14,
1998	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
1999
2000	if (zap_add(spa->spa_meta_objset,
2001	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
2002	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
2003		cmn_err(CE_PANIC, "failed to add pool config");
2004	}
2005
2006	/* Newly created pools with the right version are always deflated. */
2007	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
2008		spa->spa_deflate = TRUE;
2009		if (zap_add(spa->spa_meta_objset,
2010		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2011		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
2012			cmn_err(CE_PANIC, "failed to add deflate");
2013		}
2014	}
2015
2016	/*
2017	 * Create the deferred-free bplist object.  Turn off compression
2018	 * because sync-to-convergence takes longer if the blocksize
2019	 * keeps changing.
2020	 */
2021	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
2022	    1 << 14, tx);
2023	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
2024	    ZIO_COMPRESS_OFF, tx);
2025
2026	if (zap_add(spa->spa_meta_objset,
2027	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
2028	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
2029		cmn_err(CE_PANIC, "failed to add bplist");
2030	}
2031
2032	/*
2033	 * Create the pool's history object.
2034	 */
2035	if (version >= SPA_VERSION_ZPOOL_HISTORY)
2036		spa_history_create_obj(spa, tx);
2037
2038	/*
2039	 * Set pool properties.
2040	 */
2041	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
2042	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2043	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
2044	if (props)
2045		spa_sync_props(spa, props, CRED(), tx);
2046
2047	dmu_tx_commit(tx);
2048
2049	spa->spa_sync_on = B_TRUE;
2050	txg_sync_start(spa->spa_dsl_pool);
2051
2052	/*
2053	 * We explicitly wait for the first transaction to complete so that our
2054	 * bean counters are appropriately updated.
2055	 */
2056	txg_wait_synced(spa->spa_dsl_pool, txg);
2057
2058	spa_config_sync();
2059
2060	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
2061		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
2062
2063	mutex_exit(&spa_namespace_lock);
2064
2065	return (0);
2066}
2067
2068/*
2069 * Import the given pool into the system.  We set up the necessary spa_t and
2070 * then call spa_load() to do the dirty work.
2071 */
2072int
2073spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2074{
2075	spa_t *spa;
2076	char *altroot = NULL;
2077	int error;
2078	nvlist_t *nvroot;
2079	nvlist_t **spares, **l2cache;
2080	uint_t nspares, nl2cache;
2081
2082	/*
2083	 * If a pool with this name exists, return failure.
2084	 */
2085	mutex_enter(&spa_namespace_lock);
2086	if (spa_lookup(pool) != NULL) {
2087		mutex_exit(&spa_namespace_lock);
2088		return (EEXIST);
2089	}
2090
2091	/*
2092	 * Create and initialize the spa structure.
2093	 */
2094	(void) nvlist_lookup_string(props,
2095	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2096	spa = spa_add(pool, altroot);
2097	spa_activate(spa);
2098
2099	/*
2100	 * Pass off the heavy lifting to spa_load().
2101	 * Pass TRUE for mosconfig because the user-supplied config
2102	 * is actually the one to trust when doing an import.
2103	 */
2104	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
2105
2106	spa_config_enter(spa, RW_WRITER, FTAG);
2107	/*
2108	 * Toss any existing sparelist, as it doesn't have any validity anymore,
2109	 * and conflicts with spa_has_spare().
2110	 */
2111	if (spa->spa_spares.sav_config) {
2112		nvlist_free(spa->spa_spares.sav_config);
2113		spa->spa_spares.sav_config = NULL;
2114		spa_load_spares(spa);
2115	}
2116	if (spa->spa_l2cache.sav_config) {
2117		nvlist_free(spa->spa_l2cache.sav_config);
2118		spa->spa_l2cache.sav_config = NULL;
2119		spa_load_l2cache(spa);
2120	}
2121
2122	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2123	    &nvroot) == 0);
2124	if (error == 0)
2125		error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
2126	if (error == 0)
2127		error = spa_validate_aux(spa, nvroot, -1ULL,
2128		    VDEV_ALLOC_L2CACHE);
2129	spa_config_exit(spa, FTAG);
2130
2131	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
2132		spa_unload(spa);
2133		spa_deactivate(spa);
2134		spa_remove(spa);
2135		mutex_exit(&spa_namespace_lock);
2136		return (error);
2137	}
2138
2139	/*
2140	 * Override any spares and level 2 cache devices as specified by
2141	 * the user, as these may have correct device names/devids, etc.
2142	 */
2143	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2144	    &spares, &nspares) == 0) {
2145		if (spa->spa_spares.sav_config)
2146			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
2147			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
2148		else
2149			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
2150			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2151		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2152		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2153		spa_config_enter(spa, RW_WRITER, FTAG);
2154		spa_load_spares(spa);
2155		spa_config_exit(spa, FTAG);
2156		spa->spa_spares.sav_sync = B_TRUE;
2157	}
2158	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2159	    &l2cache, &nl2cache) == 0) {
2160		if (spa->spa_l2cache.sav_config)
2161			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
2162			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
2163		else
2164			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2165			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2166		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2167		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2168		spa_config_enter(spa, RW_WRITER, FTAG);
2169		spa_load_l2cache(spa);
2170		spa_config_exit(spa, FTAG);
2171		spa->spa_l2cache.sav_sync = B_TRUE;
2172	}
2173
2174	/*
2175	 * Update the config cache to include the newly-imported pool.
2176	 */
2177	if (spa_mode & FWRITE)
2178		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2179
2180	/*
2181	 * Resilver anything that's out of date.
2182	 */
2183	if (spa_mode & FWRITE)
2184		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2185
2186	mutex_exit(&spa_namespace_lock);
2187
2188	return (0);
2189}
2190
2191/*
2192 * This (illegal) pool name is used when temporarily importing a spa_t in order
2193 * to get the vdev stats associated with the imported devices.
2194 */
2195#define	TRYIMPORT_NAME	"$import"
2196
2197nvlist_t *
2198spa_tryimport(nvlist_t *tryconfig)
2199{
2200	nvlist_t *config = NULL;
2201	char *poolname;
2202	spa_t *spa;
2203	uint64_t state;
2204
2205	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2206		return (NULL);
2207
2208	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2209		return (NULL);
2210
2211	/*
2212	 * Create and initialize the spa structure.
2213	 */
2214	mutex_enter(&spa_namespace_lock);
2215	spa = spa_add(TRYIMPORT_NAME, NULL);
2216	spa_activate(spa);
2217
2218	/*
2219	 * Pass off the heavy lifting to spa_load().
2220	 * Pass TRUE for mosconfig because the user-supplied config
2221	 * is actually the one to trust when doing an import.
2222	 */
2223	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2224
2225	/*
2226	 * If 'tryconfig' was at least parsable, return the current config.
2227	 */
2228	if (spa->spa_root_vdev != NULL) {
2229		spa_config_enter(spa, RW_READER, FTAG);
2230		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2231		spa_config_exit(spa, FTAG);
2232		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
2233		    poolname) == 0);
2234		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2235		    state) == 0);
2236		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2237		    spa->spa_uberblock.ub_timestamp) == 0);
2238
2239		/*
2240		 * Add the list of hot spares and level 2 cache devices.
2241		 */
2242		spa_add_spares(spa, config);
2243		spa_add_l2cache(spa, config);
2244	}
2245
2246	spa_unload(spa);
2247	spa_deactivate(spa);
2248	spa_remove(spa);
2249	mutex_exit(&spa_namespace_lock);
2250
2251	return (config);
2252}
2253
2254/*
2255 * Pool export/destroy
2256 *
2257 * The act of destroying or exporting a pool is very simple.  We make sure there
2258 * is no more pending I/O and any references to the pool are gone.  Then, we
2259 * update the pool state and sync all the labels to disk, removing the
2260 * configuration from the cache afterwards.
2261 */
2262static int
2263spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
2264{
2265	spa_t *spa;
2266
2267	if (oldconfig)
2268		*oldconfig = NULL;
2269
2270	if (!(spa_mode & FWRITE))
2271		return (EROFS);
2272
2273	mutex_enter(&spa_namespace_lock);
2274	if ((spa = spa_lookup(pool)) == NULL) {
2275		mutex_exit(&spa_namespace_lock);
2276		return (ENOENT);
2277	}
2278
2279	/*
2280	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
2281	 * reacquire the namespace lock, and see if we can export.
2282	 */
2283	spa_open_ref(spa, FTAG);
2284	mutex_exit(&spa_namespace_lock);
2285	spa_async_suspend(spa);
2286	mutex_enter(&spa_namespace_lock);
2287	spa_close(spa, FTAG);
2288
2289	/*
2290	 * The pool will be in core if it's openable,
2291	 * in which case we can modify its state.
2292	 */
2293	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
2294		/*
 2295		 * Objsets may be open only because they're dirty, so we
 2296		 * have to force the pool to sync before checking spa_refcnt.
2297		 */
2298		spa_scrub_suspend(spa);
2299		txg_wait_synced(spa->spa_dsl_pool, 0);
2300
2301		/*
2302		 * A pool cannot be exported or destroyed if there are active
2303		 * references.  If we are resetting a pool, allow references by
2304		 * fault injection handlers.
2305		 */
2306		if (!spa_refcount_zero(spa) ||
2307		    (spa->spa_inject_ref != 0 &&
2308		    new_state != POOL_STATE_UNINITIALIZED)) {
2309			spa_scrub_resume(spa);
2310			spa_async_resume(spa);
2311			mutex_exit(&spa_namespace_lock);
2312			return (EBUSY);
2313		}
2314
2315		spa_scrub_resume(spa);
2316		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
2317
2318		/*
2319		 * We want this to be reflected on every label,
2320		 * so mark them all dirty.  spa_unload() will do the
2321		 * final sync that pushes these changes out.
2322		 */
2323		if (new_state != POOL_STATE_UNINITIALIZED) {
2324			spa_config_enter(spa, RW_WRITER, FTAG);
2325			spa->spa_state = new_state;
2326			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2327			vdev_config_dirty(spa->spa_root_vdev);
2328			spa_config_exit(spa, FTAG);
2329		}
2330	}
2331
2332	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2333
2334	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2335		spa_unload(spa);
2336		spa_deactivate(spa);
2337	}
2338
2339	if (oldconfig && spa->spa_config)
2340		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2341
2342	if (new_state != POOL_STATE_UNINITIALIZED) {
2343		spa_config_check(spa->spa_config_dir,
2344		    spa->spa_config_file);
2345		spa_remove(spa);
2346		spa_config_sync();
2347	}
2348	mutex_exit(&spa_namespace_lock);
2349
2350	return (0);
2351}
2352
2353/*
2354 * Destroy a storage pool.
2355 */
2356int
2357spa_destroy(char *pool)
2358{
2359	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
2360}
2361
2362/*
2363 * Export a storage pool.
2364 */
2365int
2366spa_export(char *pool, nvlist_t **oldconfig)
2367{
2368	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
2369}
2370
2371/*
2372 * Similar to spa_export(), this unloads the spa_t without actually removing it
2373 * from the namespace in any way.
2374 */
2375int
2376spa_reset(char *pool)
2377{
2378	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
2379}
2380
2381
2382/*
2383 * ==========================================================================
2384 * Device manipulation
2385 * ==========================================================================
2386 */
2387
2388/*
2389 * Add a device to a storage pool.
2390 */
2391int
2392spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
2393{
2394	uint64_t txg;
2395	int c, error;
2396	vdev_t *rvd = spa->spa_root_vdev;
2397	vdev_t *vd, *tvd;
2398	nvlist_t **spares, **l2cache;
2399	uint_t nspares, nl2cache;
2400
2401	txg = spa_vdev_enter(spa);
2402
2403	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
2404	    VDEV_ALLOC_ADD)) != 0)
2405		return (spa_vdev_exit(spa, NULL, txg, error));
2406
2407	spa->spa_pending_vdev = vd;
2408
2409	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
2410	    &nspares) != 0)
2411		nspares = 0;
2412
2413	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
2414	    &nl2cache) != 0)
2415		nl2cache = 0;
2416
2417	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) {
2418		spa->spa_pending_vdev = NULL;
2419		return (spa_vdev_exit(spa, vd, txg, EINVAL));
2420	}
2421
2422	if (vd->vdev_children != 0) {
2423		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
2424			spa->spa_pending_vdev = NULL;
2425			return (spa_vdev_exit(spa, vd, txg, error));
2426		}
2427	}
2428
2429	/*
2430	 * We must validate the spares and l2cache devices after checking the
2431	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
2432	 */
2433	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) {
2434		spa->spa_pending_vdev = NULL;
2435		return (spa_vdev_exit(spa, vd, txg, error));
2436	}
2437
2438	spa->spa_pending_vdev = NULL;
2439
2440	/*
2441	 * Transfer each new top-level vdev from vd to rvd.
2442	 */
2443	for (c = 0; c < vd->vdev_children; c++) {
2444		tvd = vd->vdev_child[c];
2445		vdev_remove_child(vd, tvd);
2446		tvd->vdev_id = rvd->vdev_children;
2447		vdev_add_child(rvd, tvd);
2448		vdev_config_dirty(tvd);
2449	}
2450
2451	if (nspares != 0) {
2452		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
2453		    ZPOOL_CONFIG_SPARES);
2454		spa_load_spares(spa);
2455		spa->spa_spares.sav_sync = B_TRUE;
2456	}
2457
2458	if (nl2cache != 0) {
2459		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
2460		    ZPOOL_CONFIG_L2CACHE);
2461		spa_load_l2cache(spa);
2462		spa->spa_l2cache.sav_sync = B_TRUE;
2463	}
2464
2465	/*
2466	 * We have to be careful when adding new vdevs to an existing pool.
2467	 * If other threads start allocating from these vdevs before we
2468	 * sync the config cache, and we lose power, then upon reboot we may
2469	 * fail to open the pool because there are DVAs that the config cache
2470	 * can't translate.  Therefore, we first add the vdevs without
2471	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
2472	 * and then let spa_config_update() initialize the new metaslabs.
2473	 *
2474	 * spa_load() checks for added-but-not-initialized vdevs, so that
2475	 * if we lose power at any point in this sequence, the remaining
2476	 * steps will be completed the next time we load the pool.
2477	 */
2478	(void) spa_vdev_exit(spa, vd, txg, 0);
2479
2480	mutex_enter(&spa_namespace_lock);
2481	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2482	mutex_exit(&spa_namespace_lock);
2483
2484	return (0);
2485}
2486
2487/*
2488 * Attach a device to a mirror.  The arguments are the path to any device
2489 * in the mirror, and the nvroot for the new device.  If the path specifies
2490 * a device that is not mirrored, we automatically insert the mirror vdev.
2491 *
2492 * If 'replacing' is specified, the new device is intended to replace the
2493 * existing device; in this case the two devices are made into their own
2494 * mirror using the 'replacing' vdev, which is functionally identical to
2495 * the mirror vdev (it actually reuses all the same ops) but has a few
2496 * extra rules: you can't attach to it after it's been created, and upon
2497 * completion of resilvering, the first disk (the one being replaced)
2498 * is automatically detached.
2499 */
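/*
 * For example (device names are illustrative only), replacing c0t0d0 with
 * c0t1d0 in a two-way mirror temporarily produces:
 *
 *	mirror
 *	    c0t2d0
 *	    replacing
 *	        c0t0d0		(original; detached when resilvering completes)
 *	        c0t1d0		(new device)
 */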
2500int
2501spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
2502{
2503	uint64_t txg, open_txg;
2504	int error;
2505	vdev_t *rvd = spa->spa_root_vdev;
2506	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
2507	vdev_ops_t *pvops;
2508	int is_log;
2509
2510	txg = spa_vdev_enter(spa);
2511
2512	oldvd = vdev_lookup_by_guid(rvd, guid);
2513
2514	if (oldvd == NULL)
2515		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2516
2517	if (!oldvd->vdev_ops->vdev_op_leaf)
2518		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2519
2520	pvd = oldvd->vdev_parent;
2521
2522	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
2523	    VDEV_ALLOC_ADD)) != 0)
2524		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
2525
2526	if (newrootvd->vdev_children != 1)
2527		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2528
2529	newvd = newrootvd->vdev_child[0];
2530
2531	if (!newvd->vdev_ops->vdev_op_leaf)
2532		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2533
2534	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
2535		return (spa_vdev_exit(spa, newrootvd, txg, error));
2536
2537	/*
2538	 * Spares can't replace logs
2539	 */
2540	is_log = oldvd->vdev_islog;
2541	if (is_log && newvd->vdev_isspare)
2542		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2543
2544	if (!replacing) {
2545		/*
2546		 * For attach, the only allowable parent is a mirror or the root
2547		 * vdev.
2548		 */
2549		if (pvd->vdev_ops != &vdev_mirror_ops &&
2550		    pvd->vdev_ops != &vdev_root_ops)
2551			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2552
2553		pvops = &vdev_mirror_ops;
2554	} else {
2555		/*
2556		 * Active hot spares can only be replaced by inactive hot
2557		 * spares.
2558		 */
2559		if (pvd->vdev_ops == &vdev_spare_ops &&
2560		    pvd->vdev_child[1] == oldvd &&
2561		    !spa_has_spare(spa, newvd->vdev_guid))
2562			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2563
2564		/*
2565		 * If the source is a hot spare, and the parent isn't already a
2566		 * spare, then we want to create a new hot spare.  Otherwise, we
2567		 * want to create a replacing vdev.  The user is not allowed to
2568		 * attach to a spared vdev child unless the 'isspare' state is
2569		 * the same (spare replaces spare, non-spare replaces
2570		 * non-spare).
2571		 */
2572		if (pvd->vdev_ops == &vdev_replacing_ops)
2573			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2574		else if (pvd->vdev_ops == &vdev_spare_ops &&
2575		    newvd->vdev_isspare != oldvd->vdev_isspare)
2576			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2577		else if (pvd->vdev_ops != &vdev_spare_ops &&
2578		    newvd->vdev_isspare)
2579			pvops = &vdev_spare_ops;
2580		else
2581			pvops = &vdev_replacing_ops;
2582	}
2583
2584	/*
2585	 * Compare the new device size with the replaceable/attachable
2586	 * device size.
2587	 */
2588	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
2589		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
2590
2591	/*
2592	 * The new device cannot have a higher alignment requirement
2593	 * than the top-level vdev.
2594	 */
2595	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
2596		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
2597
2598	/*
2599	 * If this is an in-place replacement, update oldvd's path and devid
2600	 * to make it distinguishable from newvd, and unopenable from now on.
2601	 */
2602	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
2603		spa_strfree(oldvd->vdev_path);
2604		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
2605		    KM_SLEEP);
2606		(void) sprintf(oldvd->vdev_path, "%s/%s",
2607		    newvd->vdev_path, "old");
2608		if (oldvd->vdev_devid != NULL) {
2609			spa_strfree(oldvd->vdev_devid);
2610			oldvd->vdev_devid = NULL;
2611		}
2612	}
2613
2614	/*
2615	 * If the parent is not a mirror, or if we're replacing, insert the new
2616	 * mirror/replacing/spare vdev above oldvd.
2617	 */
2618	if (pvd->vdev_ops != pvops)
2619		pvd = vdev_add_parent(oldvd, pvops);
2620
2621	ASSERT(pvd->vdev_top->vdev_parent == rvd);
2622	ASSERT(pvd->vdev_ops == pvops);
2623	ASSERT(oldvd->vdev_parent == pvd);
2624
2625	/*
2626	 * Extract the new device from its root and add it to pvd.
2627	 */
2628	vdev_remove_child(newrootvd, newvd);
2629	newvd->vdev_id = pvd->vdev_children;
2630	vdev_add_child(pvd, newvd);
2631
2632	/*
2633	 * If newvd is smaller than oldvd, but larger than its rsize,
2634	 * the addition of newvd may have decreased our parent's asize.
2635	 */
2636	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
2637
2638	tvd = newvd->vdev_top;
2639	ASSERT(pvd->vdev_top == tvd);
2640	ASSERT(tvd->vdev_parent == rvd);
2641
2642	vdev_config_dirty(tvd);
2643
2644	/*
2645	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
2646	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
2647	 */
2648	open_txg = txg + TXG_CONCURRENT_STATES - 1;
2649
2650	mutex_enter(&newvd->vdev_dtl_lock);
2651	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
2652	    open_txg - TXG_INITIAL + 1);
2653	mutex_exit(&newvd->vdev_dtl_lock);
2654
2655	if (newvd->vdev_isspare)
2656		spa_spare_activate(newvd);
2657
2658	/*
2659	 * Mark newvd's DTL dirty in this txg.
2660	 */
2661	vdev_dirty(tvd, VDD_DTL, newvd, txg);
2662
2663	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
2664
2665	/*
2666	 * Kick off a resilver to update newvd.  We need to grab the namespace
2667	 * lock because spa_scrub() needs to post a sysevent with the pool name.
2668	 */
2669	mutex_enter(&spa_namespace_lock);
2670	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2671	mutex_exit(&spa_namespace_lock);
2672
2673	return (0);
2674}
2675
2676/*
2677 * Detach a device from a mirror or replacing vdev.
2678 * If 'replace_done' is specified, only detach if the parent
2679 * is a replacing vdev.
2680 */
2681int
2682spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
2683{
2684	uint64_t txg;
2685	int c, t, error;
2686	vdev_t *rvd = spa->spa_root_vdev;
2687	vdev_t *vd, *pvd, *cvd, *tvd;
2688	boolean_t unspare = B_FALSE;
2689	uint64_t unspare_guid;
2690
2691	txg = spa_vdev_enter(spa);
2692
2693	vd = vdev_lookup_by_guid(rvd, guid);
2694
2695	if (vd == NULL)
2696		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2697
2698	if (!vd->vdev_ops->vdev_op_leaf)
2699		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2700
2701	pvd = vd->vdev_parent;
2702
2703	/*
2704	 * If replace_done is specified, only remove this device if it's
2705	 * the first child of a replacing vdev.  For the 'spare' vdev, either
2706	 * disk can be removed.
2707	 */
2708	if (replace_done) {
2709		if (pvd->vdev_ops == &vdev_replacing_ops) {
2710			if (vd->vdev_id != 0)
2711				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2712		} else if (pvd->vdev_ops != &vdev_spare_ops) {
2713			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2714		}
2715	}
2716
2717	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
2718	    spa_version(spa) >= SPA_VERSION_SPARES);
2719
2720	/*
2721	 * Only mirror, replacing, and spare vdevs support detach.
2722	 */
2723	if (pvd->vdev_ops != &vdev_replacing_ops &&
2724	    pvd->vdev_ops != &vdev_mirror_ops &&
2725	    pvd->vdev_ops != &vdev_spare_ops)
2726		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2727
2728	/*
2729	 * If there's only one replica, you can't detach it.
2730	 */
2731	if (pvd->vdev_children <= 1)
2732		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
2733
2734	/*
2735	 * If all siblings have non-empty DTLs, this device may have the only
2736	 * valid copy of the data, which means we cannot safely detach it.
2737	 *
2738	 * XXX -- as in the vdev_offline() case, we really want a more
2739	 * precise DTL check.
2740	 */
2741	for (c = 0; c < pvd->vdev_children; c++) {
2742		uint64_t dirty;
2743
2744		cvd = pvd->vdev_child[c];
2745		if (cvd == vd)
2746			continue;
2747		if (vdev_is_dead(cvd))
2748			continue;
2749		mutex_enter(&cvd->vdev_dtl_lock);
2750		dirty = cvd->vdev_dtl_map.sm_space |
2751		    cvd->vdev_dtl_scrub.sm_space;
2752		mutex_exit(&cvd->vdev_dtl_lock);
2753		if (!dirty)
2754			break;
2755	}
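	/*
	 * If the loop above ran to completion (c == pvd->vdev_children),
	 * every other child has a non-empty DTL, so vd may hold the only
	 * valid copy of some data and cannot simply be detached.
	 */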
2756
2757	/*
2758	 * If we are a replacing or spare vdev, then we can always detach the
2759	 * latter child, as that is how one cancels the operation.
2760	 */
2761	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
2762	    c == pvd->vdev_children)
2763		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
2764
2765	/*
2766	 * If we are detaching the original disk from a spare, then it implies
2767	 * that the spare should become a real disk, and be removed from the
2768	 * active spare list for the pool.
2769	 */
2770	if (pvd->vdev_ops == &vdev_spare_ops &&
2771	    vd->vdev_id == 0)
2772		unspare = B_TRUE;
2773
2774	/*
2775	 * Erase the disk labels so the disk can be used for other things.
2776	 * This must be done after all other error cases are handled,
2777	 * but before we disembowel vd (so we can still do I/O to it).
2778	 * But if we can't do it, don't treat the error as fatal --
2779	 * it may be that the unwritability of the disk is the reason
2780	 * it's being detached!
2781	 */
2782	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
2783
2784	/*
2785	 * Remove vd from its parent and compact the parent's children.
2786	 */
2787	vdev_remove_child(pvd, vd);
2788	vdev_compact_children(pvd);
2789
2790	/*
2791	 * Remember one of the remaining children so we can get tvd below.
2792	 */
2793	cvd = pvd->vdev_child[0];
2794
2795	/*
2796	 * If we need to remove the remaining child from the list of hot spares,
2797	 * do it now, marking the vdev as no longer a spare in the process.  We
2798	 * must do this before vdev_remove_parent(), because that can change the
2799	 * GUID if it creates a new toplevel GUID.
2800	 */
2801	if (unspare) {
2802		ASSERT(cvd->vdev_isspare);
2803		spa_spare_remove(cvd);
2804		unspare_guid = cvd->vdev_guid;
2805	}
2806
2807	/*
2808	 * If the parent mirror/replacing vdev only has one child,
2809	 * the parent is no longer needed.  Remove it from the tree.
2810	 */
2811	if (pvd->vdev_children == 1)
2812		vdev_remove_parent(cvd);
2813
2814	/*
2815	 * We don't set tvd until now because the parent we just removed
2816	 * may have been the previous top-level vdev.
2817	 */
2818	tvd = cvd->vdev_top;
2819	ASSERT(tvd->vdev_parent == rvd);
2820
2821	/*
2822	 * Reevaluate the parent vdev state.
2823	 */
2824	vdev_propagate_state(cvd);
2825
2826	/*
2827	 * If the device we just detached was smaller than the others, it may be
2828	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
2829	 * can't fail because the existing metaslabs are already in core, so
2830	 * there's nothing to read from disk.
2831	 */
2832	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
2833
2834	vdev_config_dirty(tvd);
2835
2836	/*
2837	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
2838	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
2839	 * But first make sure we're not on any *other* txg's DTL list, to
2840	 * prevent vd from being accessed after it's freed.
2841	 */
2842	for (t = 0; t < TXG_SIZE; t++)
2843		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
2844	vd->vdev_detached = B_TRUE;
2845	vdev_dirty(tvd, VDD_DTL, vd, txg);
2846
2847	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
2848
2849	error = spa_vdev_exit(spa, vd, txg, 0);
2850
2851	/*
2852	 * If this was the removal of the original device in a hot spare vdev,
2853	 * then we want to go through and remove the device from the hot spare
2854	 * list of every other pool.
2855	 */
2856	if (unspare) {
2857		spa = NULL;
2858		mutex_enter(&spa_namespace_lock);
2859		while ((spa = spa_next(spa)) != NULL) {
2860			if (spa->spa_state != POOL_STATE_ACTIVE)
2861				continue;
2862
2863			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
2864		}
2865		mutex_exit(&spa_namespace_lock);
2866	}
2867
2868	return (error);
2869}
2870
2871/*
 2872 * Remove a hot spare vdev from the nvlist config.
2873 */
2874static int
2875spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare,
2876    nvlist_t **spares, int nspares, vdev_t *vd)
2877{
2878	nvlist_t *nv, **newspares;
2879	int i, j;
2880
2881	nv = NULL;
2882	for (i = 0; i < nspares; i++) {
2883		uint64_t theguid;
2884
2885		VERIFY(nvlist_lookup_uint64(spares[i],
2886		    ZPOOL_CONFIG_GUID, &theguid) == 0);
2887		if (theguid == guid) {
2888			nv = spares[i];
2889			break;
2890		}
2891	}
2892
2893	/*
2894	 * Only remove the hot spare if it's not currently in use in this pool.
2895	 */
2896	if (nv == NULL && vd == NULL)
2897		return (ENOENT);
2898
2899	if (nv == NULL && vd != NULL)
2900		return (ENOTSUP);
2901
2902	if (!unspare && nv != NULL && vd != NULL)
2903		return (EBUSY);
2904
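	/*
	 * Rebuild the spares array without the entry being removed; taking
	 * out the last spare leaves an empty array.
	 */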
2905	if (nspares == 1) {
2906		newspares = NULL;
2907	} else {
2908		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
2909		    KM_SLEEP);
2910		for (i = 0, j = 0; i < nspares; i++) {
2911			if (spares[i] != nv)
2912				VERIFY(nvlist_dup(spares[i],
2913				    &newspares[j++], KM_SLEEP) == 0);
2914		}
2915	}
2916
2917	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES,
2918	    DATA_TYPE_NVLIST_ARRAY) == 0);
2919	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2920	    ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0);
2921	for (i = 0; i < nspares - 1; i++)
2922		nvlist_free(newspares[i]);
2923	kmem_free(newspares, (nspares - 1) * sizeof (void *));
2924
2925	return (0);
2926}
2927
2928/*
2929 * Remove an l2cache vdev from the nvlist config.
2930 */
2931static int
2932spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache,
2933    int nl2cache, vdev_t *vd)
2934{
2935	nvlist_t *nv, **newl2cache;
2936	int i, j;
2937
2938	nv = NULL;
2939	for (i = 0; i < nl2cache; i++) {
2940		uint64_t theguid;
2941
2942		VERIFY(nvlist_lookup_uint64(l2cache[i],
2943		    ZPOOL_CONFIG_GUID, &theguid) == 0);
2944		if (theguid == guid) {
2945			nv = l2cache[i];
2946			break;
2947		}
2948	}
2949
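	/*
	 * If the caller didn't supply the vdev, look it up by guid in the
	 * active l2cache list.
	 */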
2950	if (vd == NULL) {
2951		for (i = 0; i < nl2cache; i++) {
2952			if (sav->sav_vdevs[i]->vdev_guid == guid) {
2953				vd = sav->sav_vdevs[i];
2954				break;
2955			}
2956		}
2957	}
2958
2959	if (nv == NULL && vd == NULL)
2960		return (ENOENT);
2961
2962	if (nv == NULL && vd != NULL)
2963		return (ENOTSUP);
2964
2965	if (nl2cache == 1) {
2966		newl2cache = NULL;
2967	} else {
2968		newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *),
2969		    KM_SLEEP);
2970		for (i = 0, j = 0; i < nl2cache; i++) {
2971			if (l2cache[i] != nv)
2972				VERIFY(nvlist_dup(l2cache[i],
2973				    &newl2cache[j++], KM_SLEEP) == 0);
2974		}
2975	}
2976
2977	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
2978	    DATA_TYPE_NVLIST_ARRAY) == 0);
2979	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2980	    ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0);
2981	for (i = 0; i < nl2cache - 1; i++)
2982		nvlist_free(newl2cache[i]);
2983	kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *));
2984
2985	return (0);
2986}
2987
2988/*
2989 * Remove a device from the pool.  Currently, this supports removing only hot
2990 * spares and level 2 ARC devices.
2991 */
2992int
2993spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
2994{
2995	vdev_t *vd;
2996	nvlist_t **spares, **l2cache;
2997	uint_t nspares, nl2cache;
2998	int error = 0;
2999
3000	spa_config_enter(spa, RW_WRITER, FTAG);
3001
3002	vd = spa_lookup_by_guid(spa, guid);
3003
3004	if (spa->spa_spares.sav_vdevs != NULL &&
3005	    spa_spare_exists(guid, NULL) &&
3006	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3007	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
3008		if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare,
3009		    spares, nspares, vd)) != 0)
3010			goto out;
3011		spa_load_spares(spa);
3012		spa->spa_spares.sav_sync = B_TRUE;
3013		goto out;
3014	}
3015
3016	if (spa->spa_l2cache.sav_vdevs != NULL &&
3017	    spa_l2cache_exists(guid, NULL) &&
3018	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3019	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) {
3020		if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid,
3021		    l2cache, nl2cache, vd)) != 0)
3022			goto out;
3023		spa_load_l2cache(spa);
3024		spa->spa_l2cache.sav_sync = B_TRUE;
3025	}
3026
3027out:
3028	spa_config_exit(spa, FTAG);
3029	return (error);
3030}
3031
3032/*
3033 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 3034 * currently spared, so we can detach it.
3035 */
3036static vdev_t *
3037spa_vdev_resilver_done_hunt(vdev_t *vd)
3038{
3039	vdev_t *newvd, *oldvd;
3040	int c;
3041
3042	for (c = 0; c < vd->vdev_children; c++) {
3043		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3044		if (oldvd != NULL)
3045			return (oldvd);
3046	}
3047
3048	/*
3049	 * Check for a completed replacement.
3050	 */
3051	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3052		oldvd = vd->vdev_child[0];
3053		newvd = vd->vdev_child[1];
3054
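		/*
		 * An empty DTL and scrub DTL on the new device means the
		 * resilver has fully caught up, so the old device (child 0)
		 * can be detached.
		 */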
3055		mutex_enter(&newvd->vdev_dtl_lock);
3056		if (newvd->vdev_dtl_map.sm_space == 0 &&
3057		    newvd->vdev_dtl_scrub.sm_space == 0) {
3058			mutex_exit(&newvd->vdev_dtl_lock);
3059			return (oldvd);
3060		}
3061		mutex_exit(&newvd->vdev_dtl_lock);
3062	}
3063
3064	/*
3065	 * Check for a completed resilver with the 'unspare' flag set.
3066	 */
3067	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3068		newvd = vd->vdev_child[0];
3069		oldvd = vd->vdev_child[1];
3070
3071		mutex_enter(&newvd->vdev_dtl_lock);
3072		if (newvd->vdev_unspare &&
3073		    newvd->vdev_dtl_map.sm_space == 0 &&
3074		    newvd->vdev_dtl_scrub.sm_space == 0) {
3075			newvd->vdev_unspare = 0;
3076			mutex_exit(&newvd->vdev_dtl_lock);
3077			return (oldvd);
3078		}
3079		mutex_exit(&newvd->vdev_dtl_lock);
3080	}
3081
3082	return (NULL);
3083}
3084
3085static void
3086spa_vdev_resilver_done(spa_t *spa)
3087{
3088	vdev_t *vd;
3089	vdev_t *pvd;
3090	uint64_t guid;
3091	uint64_t pguid = 0;
3092
3093	spa_config_enter(spa, RW_READER, FTAG);
3094
3095	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3096		guid = vd->vdev_guid;
3097		/*
3098		 * If we have just finished replacing a hot spared device, then
3099		 * we need to detach the parent's first child (the original hot
3100		 * spare) as well.
3101		 */
3102		pvd = vd->vdev_parent;
3103		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3104		    pvd->vdev_id == 0) {
3105			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3106			ASSERT(pvd->vdev_parent->vdev_children == 2);
3107			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
3108		}
3109		spa_config_exit(spa, FTAG);
3110		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
3111			return;
3112		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
3113			return;
3114		spa_config_enter(spa, RW_READER, FTAG);
3115	}
3116
3117	spa_config_exit(spa, FTAG);
3118}
3119
3120/*
3121 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
3122 * on spa_vdev_enter/exit() to synchronize the labels and cache.
3123 */
3124int
3125spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
3126{
3127	vdev_t *rvd, *vd;
3128	uint64_t txg;
3129
3130	rvd = spa->spa_root_vdev;
3131
3132	txg = spa_vdev_enter(spa);
3133
3134	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3135		/*
3136		 * Determine if this is a reference to a hot spare or l2cache
3137		 * device.  If it is, update the path as stored in their
3138		 * device list.
3139		 */
3140		nvlist_t **spares, **l2cache;
3141		uint_t i, nspares, nl2cache;
3142
3143		if (spa->spa_spares.sav_config != NULL) {
3144			VERIFY(nvlist_lookup_nvlist_array(
3145			    spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
3146			    &spares, &nspares) == 0);
3147			for (i = 0; i < nspares; i++) {
3148				uint64_t theguid;
3149				VERIFY(nvlist_lookup_uint64(spares[i],
3150				    ZPOOL_CONFIG_GUID, &theguid) == 0);
3151				if (theguid == guid) {
3152					VERIFY(nvlist_add_string(spares[i],
3153					    ZPOOL_CONFIG_PATH, newpath) == 0);
3154					spa_load_spares(spa);
3155					spa->spa_spares.sav_sync = B_TRUE;
3156					return (spa_vdev_exit(spa, NULL, txg,
3157					    0));
3158				}
3159			}
3160		}
3161
3162		if (spa->spa_l2cache.sav_config != NULL) {
3163			VERIFY(nvlist_lookup_nvlist_array(
3164			    spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE,
3165			    &l2cache, &nl2cache) == 0);
3166			for (i = 0; i < nl2cache; i++) {
3167				uint64_t theguid;
3168				VERIFY(nvlist_lookup_uint64(l2cache[i],
3169				    ZPOOL_CONFIG_GUID, &theguid) == 0);
3170				if (theguid == guid) {
3171					VERIFY(nvlist_add_string(l2cache[i],
3172					    ZPOOL_CONFIG_PATH, newpath) == 0);
3173					spa_load_l2cache(spa);
3174					spa->spa_l2cache.sav_sync = B_TRUE;
3175					return (spa_vdev_exit(spa, NULL, txg,
3176					    0));
3177				}
3178			}
3179		}
3180
3181		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
3182	}
3183
3184	if (!vd->vdev_ops->vdev_op_leaf)
3185		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3186
3187	spa_strfree(vd->vdev_path);
3188	vd->vdev_path = spa_strdup(newpath);
3189
3190	vdev_config_dirty(vd->vdev_top);
3191
3192	return (spa_vdev_exit(spa, NULL, txg, 0));
3193}
3194
3195/*
3196 * ==========================================================================
3197 * SPA Scrubbing
3198 * ==========================================================================
3199 */
3200
3201static void
3202spa_scrub_io_done(zio_t *zio)
3203{
3204	spa_t *spa = zio->io_spa;
3205
3206	arc_data_buf_free(zio->io_data, zio->io_size);
3207
3208	mutex_enter(&spa->spa_scrub_lock);
3209	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3210		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
3211		spa->spa_scrub_errors++;
3212		mutex_enter(&vd->vdev_stat_lock);
3213		vd->vdev_stat.vs_scrub_errors++;
3214		mutex_exit(&vd->vdev_stat_lock);
3215	}
3216
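	/*
	 * If we've dropped below the in-flight limit, wake up anyone
	 * throttled in spa_scrub_io_start() or draining in-flight I/Os.
	 */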
3217	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
3218		cv_broadcast(&spa->spa_scrub_io_cv);
3219
3220	ASSERT(spa->spa_scrub_inflight >= 0);
3221
3222	mutex_exit(&spa->spa_scrub_lock);
3223}
3224
3225static void
3226spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
3227    zbookmark_t *zb)
3228{
3229	size_t size = BP_GET_LSIZE(bp);
3230	void *data;
3231
3232	mutex_enter(&spa->spa_scrub_lock);
3233	/*
3234	 * Do not give too much work to vdev(s).
3235	 */
3236	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
3237		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3238	}
3239	spa->spa_scrub_inflight++;
3240	mutex_exit(&spa->spa_scrub_lock);
3241
3242	data = arc_data_buf_alloc(size);
3243
3244	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
3245		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
3246
3247	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
3248
3249	zio_nowait(zio_read(NULL, spa, bp, data, size,
3250	    spa_scrub_io_done, NULL, priority, flags, zb));
3251}
3252
3253/* ARGSUSED */
3254static int
3255spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
3256{
3257	blkptr_t *bp = &bc->bc_blkptr;
3258	vdev_t *vd = spa->spa_root_vdev;
3259	dva_t *dva = bp->blk_dva;
3260	int needs_resilver = B_FALSE;
3261	int d;
3262
3263	if (bc->bc_errno) {
3264		/*
3265		 * We can't scrub this block, but we can continue to scrub
3266		 * the rest of the pool.  Note the error and move along.
3267		 */
3268		mutex_enter(&spa->spa_scrub_lock);
3269		spa->spa_scrub_errors++;
3270		mutex_exit(&spa->spa_scrub_lock);
3271
3272		mutex_enter(&vd->vdev_stat_lock);
3273		vd->vdev_stat.vs_scrub_errors++;
3274		mutex_exit(&vd->vdev_stat_lock);
3275
3276		return (ERESTART);
3277	}
3278
3279	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
3280
3281	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
3282		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
3283
3284		ASSERT(vd != NULL);
3285
3286		/*
3287		 * Keep track of how much data we've examined so that
3288		 * zpool(1M) status can make useful progress reports.
3289		 */
3290		mutex_enter(&vd->vdev_stat_lock);
3291		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
3292		mutex_exit(&vd->vdev_stat_lock);
3293
3294		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
3295			if (DVA_GET_GANG(&dva[d])) {
3296				/*
3297				 * Gang members may be spread across multiple
3298				 * vdevs, so the best we can do is look at the
3299				 * pool-wide DTL.
3300				 * XXX -- it would be better to change our
3301				 * allocation policy to ensure that this can't
3302				 * happen.
3303				 */
3304				vd = spa->spa_root_vdev;
3305			}
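			/*
			 * The block needs resilvering if its birth txg
			 * falls within this vdev's DTL.
			 */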
3306			if (vdev_dtl_contains(&vd->vdev_dtl_map,
3307			    bp->blk_birth, 1))
3308				needs_resilver = B_TRUE;
3309		}
3310	}
3311
3312	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
3313		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
3314		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
3315	else if (needs_resilver)
3316		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
3317		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
3318
3319	return (0);
3320}
3321
3322static void
3323spa_scrub_thread(spa_t *spa)
3324{
3325	callb_cpr_t cprinfo;
3326	traverse_handle_t *th = spa->spa_scrub_th;
3327	vdev_t *rvd = spa->spa_root_vdev;
3328	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
3329	int error = 0;
3330	boolean_t complete;
3331
3332	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
3333
3334	/*
3335	 * If we're restarting due to a snapshot create/delete,
3336	 * wait for that to complete.
3337	 */
3338	txg_wait_synced(spa_get_dsl(spa), 0);
3339
3340	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
3341	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
3342	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
3343
3344	spa_config_enter(spa, RW_WRITER, FTAG);
3345	vdev_reopen(rvd);		/* purge all vdev caches */
3346	vdev_config_dirty(rvd);		/* rewrite all disk labels */
3347	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
3348	spa_config_exit(spa, FTAG);
3349
3350	mutex_enter(&spa->spa_scrub_lock);
3351	spa->spa_scrub_errors = 0;
3352	spa->spa_scrub_active = 1;
3353	ASSERT(spa->spa_scrub_inflight == 0);
3354
3355	while (!spa->spa_scrub_stop) {
3356		CALLB_CPR_SAFE_BEGIN(&cprinfo);
3357		while (spa->spa_scrub_suspended) {
3358			spa->spa_scrub_active = 0;
3359			cv_broadcast(&spa->spa_scrub_cv);
3360			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3361			spa->spa_scrub_active = 1;
3362		}
3363		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
3364
3365		if (spa->spa_scrub_restart_txg != 0)
3366			break;
3367
3368		mutex_exit(&spa->spa_scrub_lock);
3369		error = traverse_more(th);
3370		mutex_enter(&spa->spa_scrub_lock);
3371		if (error != EAGAIN)
3372			break;
3373	}
3374
3375	while (spa->spa_scrub_inflight)
3376		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3377
3378	spa->spa_scrub_active = 0;
3379	cv_broadcast(&spa->spa_scrub_cv);
3380
3381	mutex_exit(&spa->spa_scrub_lock);
3382
3383	spa_config_enter(spa, RW_WRITER, FTAG);
3384
3385	mutex_enter(&spa->spa_scrub_lock);
3386
3387	/*
3388	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
3389	 * AND the spa config lock to synchronize with any config changes
3390	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
3391	 */
3392	if (spa->spa_scrub_restart_txg != 0)
3393		error = ERESTART;
3394
3395	if (spa->spa_scrub_stop)
3396		error = EINTR;
3397
3398	/*
3399	 * Even if there were uncorrectable errors, we consider the scrub
3400	 * completed.  The downside is that if there is a transient error during
3401	 * a resilver, we won't resilver the data properly to the target.  But
3402	 * if the damage is permanent (more likely) we will resilver forever,
3403	 * which isn't really acceptable.  Since there is enough information for
3404	 * the user to know what has failed and why, this seems like a more
3405	 * tractable approach.
3406	 */
3407	complete = (error == 0);
3408
3409	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
3410	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
3411	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
3412	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
3413
3414	mutex_exit(&spa->spa_scrub_lock);
3415
3416	/*
3417	 * If the scrub/resilver completed, update all DTLs to reflect this.
3418	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
3419	 */
3420	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
3421	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
3422	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
3423	spa_errlog_rotate(spa);
3424
3425	if (scrub_type == POOL_SCRUB_RESILVER && complete)
3426		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
3427
3428	spa_config_exit(spa, FTAG);
3429
3430	mutex_enter(&spa->spa_scrub_lock);
3431
3432	/*
3433	 * We may have finished replacing a device.
3434	 * Let the async thread assess this and handle the detach.
3435	 */
3436	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3437
3438	/*
3439	 * If we were told to restart, our final act is to start a new scrub.
3440	 */
3441	if (error == ERESTART)
3442		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
3443		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
3444
3445	spa->spa_scrub_type = POOL_SCRUB_NONE;
3446	spa->spa_scrub_active = 0;
3447	spa->spa_scrub_thread = NULL;
3448	cv_broadcast(&spa->spa_scrub_cv);
3449	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
3450	thread_exit();
3451}
3452
3453void
3454spa_scrub_suspend(spa_t *spa)
3455{
3456	mutex_enter(&spa->spa_scrub_lock);
3457	spa->spa_scrub_suspended++;
3458	while (spa->spa_scrub_active) {
3459		cv_broadcast(&spa->spa_scrub_cv);
3460		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3461	}
3462	while (spa->spa_scrub_inflight)
3463		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3464	mutex_exit(&spa->spa_scrub_lock);
3465}
3466
3467void
3468spa_scrub_resume(spa_t *spa)
3469{
3470	mutex_enter(&spa->spa_scrub_lock);
3471	ASSERT(spa->spa_scrub_suspended != 0);
3472	if (--spa->spa_scrub_suspended == 0)
3473		cv_broadcast(&spa->spa_scrub_cv);
3474	mutex_exit(&spa->spa_scrub_lock);
3475}
3476
3477void
3478spa_scrub_restart(spa_t *spa, uint64_t txg)
3479{
3480	/*
3481	 * Something happened (e.g. snapshot create/delete) that means
3482	 * we must restart any in-progress scrubs.  The itinerary will
3483	 * fix this properly.
3484	 */
3485	mutex_enter(&spa->spa_scrub_lock);
3486	spa->spa_scrub_restart_txg = txg;
3487	mutex_exit(&spa->spa_scrub_lock);
3488}
3489
3490int
3491spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
3492{
3493	space_seg_t *ss;
3494	uint64_t mintxg, maxtxg;
3495	vdev_t *rvd = spa->spa_root_vdev;
3496
3497	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3498	ASSERT(!spa_config_held(spa, RW_WRITER));
3499
3500	if ((uint_t)type >= POOL_SCRUB_TYPES)
3501		return (ENOTSUP);
3502
3503	mutex_enter(&spa->spa_scrub_lock);
3504
3505	/*
3506	 * If there's a scrub or resilver already in progress, stop it.
3507	 */
3508	while (spa->spa_scrub_thread != NULL) {
3509		/*
3510		 * Don't stop a resilver unless forced.
3511		 */
3512		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
3513			mutex_exit(&spa->spa_scrub_lock);
3514			return (EBUSY);
3515		}
3516		spa->spa_scrub_stop = 1;
3517		cv_broadcast(&spa->spa_scrub_cv);
3518		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3519	}
3520
3521	/*
3522	 * Terminate the previous traverse.
3523	 */
3524	if (spa->spa_scrub_th != NULL) {
3525		traverse_fini(spa->spa_scrub_th);
3526		spa->spa_scrub_th = NULL;
3527	}
3528
3529	if (rvd == NULL) {
3530		ASSERT(spa->spa_scrub_stop == 0);
3531		ASSERT(spa->spa_scrub_type == type);
3532		ASSERT(spa->spa_scrub_restart_txg == 0);
3533		mutex_exit(&spa->spa_scrub_lock);
3534		return (0);
3535	}
3536
3537	mintxg = TXG_INITIAL - 1;
3538	maxtxg = spa_last_synced_txg(spa) + 1;
3539
3540	mutex_enter(&rvd->vdev_dtl_lock);
3541
3542	if (rvd->vdev_dtl_map.sm_space == 0) {
3543		/*
3544		 * The pool-wide DTL is empty.
3545		 * If this is a resilver, there's nothing to do except
3546		 * check whether any in-progress replacements have completed.
3547		 */
3548		if (type == POOL_SCRUB_RESILVER) {
3549			type = POOL_SCRUB_NONE;
3550			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3551		}
3552	} else {
3553		/*
3554		 * The pool-wide DTL is non-empty.
3555		 * If this is a normal scrub, upgrade to a resilver instead.
3556		 */
3557		if (type == POOL_SCRUB_EVERYTHING)
3558			type = POOL_SCRUB_RESILVER;
3559	}
3560
3561	if (type == POOL_SCRUB_RESILVER) {
3562		/*
3563		 * Determine the resilvering boundaries.
3564		 *
3565		 * Note: (mintxg, maxtxg) is an open interval,
3566		 * i.e. mintxg and maxtxg themselves are not included.
3567		 *
3568		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
3569		 * so we don't claim to resilver a txg that's still changing.
3570		 */
3571		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
3572		mintxg = ss->ss_start - 1;
3573		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
3574		maxtxg = MIN(ss->ss_end, maxtxg);
3575
3576		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
3577	}
3578
3579	mutex_exit(&rvd->vdev_dtl_lock);
3580
3581	spa->spa_scrub_stop = 0;
3582	spa->spa_scrub_type = type;
3583	spa->spa_scrub_restart_txg = 0;
3584
3585	if (type != POOL_SCRUB_NONE) {
3586		spa->spa_scrub_mintxg = mintxg;
3587		spa->spa_scrub_maxtxg = maxtxg;
3588		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
3589		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
3590		    ZIO_FLAG_CANFAIL);
3591		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
3592		spa->spa_scrub_thread = thread_create(NULL, 0,
3593		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
3594	}
3595
3596	mutex_exit(&spa->spa_scrub_lock);
3597
3598	return (0);
3599}
3600
3601/*
3602 * ==========================================================================
3603 * SPA async task processing
3604 * ==========================================================================
3605 */
3606
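/*
 * Recursively walk the vdev tree, transitioning any vdev that has
 * vdev_remove_wanted set to the REMOVED state, clearing its error counts,
 * and dirtying its config so the change gets written out.
 */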
3607static void
3608spa_async_remove(spa_t *spa, vdev_t *vd)
3609{
3610	vdev_t *tvd;
3611	int c;
3612
3613	for (c = 0; c < vd->vdev_children; c++) {
3614		tvd = vd->vdev_child[c];
3615		if (tvd->vdev_remove_wanted) {
3616			tvd->vdev_remove_wanted = 0;
3617			vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
3618			    VDEV_AUX_NONE);
3619			vdev_clear(spa, tvd, B_TRUE);
3620			vdev_config_dirty(tvd->vdev_top);
3621		}
3622		spa_async_remove(spa, tvd);
3623	}
3624}
3625
3626static void
3627spa_async_thread(spa_t *spa)
3628{
3629	int tasks;
3630	uint64_t txg;
3631
3632	ASSERT(spa->spa_sync_on);
3633
3634	mutex_enter(&spa->spa_async_lock);
3635	tasks = spa->spa_async_tasks;
3636	spa->spa_async_tasks = 0;
3637	mutex_exit(&spa->spa_async_lock);
3638
3639	/*
3640	 * See if the config needs to be updated.
3641	 */
3642	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
3643		mutex_enter(&spa_namespace_lock);
3644		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3645		mutex_exit(&spa_namespace_lock);
3646	}
3647
3648	/*
3649	 * See if any devices need to be marked REMOVED.
3650	 *
3651	 * XXX - We avoid doing this when we are in
3652	 * I/O failure state since spa_vdev_enter() grabs
3653	 * the namespace lock and would not be able to obtain
3654	 * the writer config lock.
3655	 */
3656	if (tasks & SPA_ASYNC_REMOVE &&
3657	    spa_state(spa) != POOL_STATE_IO_FAILURE) {
3658		txg = spa_vdev_enter(spa);
3659		spa_async_remove(spa, spa->spa_root_vdev);
3660		(void) spa_vdev_exit(spa, NULL, txg, 0);
3661	}
3662
3663	/*
3664	 * If any devices are done replacing, detach them.
3665	 */
3666	if (tasks & SPA_ASYNC_RESILVER_DONE)
3667		spa_vdev_resilver_done(spa);
3668
3669	/*
3670	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
3671	 * scrub which can become a resilver), we need to hold
 3672	 * spa_namespace_lock because the sysevent we post via
3673	 * spa_event_notify() needs to get the name of the pool.
3674	 */
3675	if (tasks & SPA_ASYNC_SCRUB) {
3676		mutex_enter(&spa_namespace_lock);
3677		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
3678		mutex_exit(&spa_namespace_lock);
3679	}
3680
3681	/*
3682	 * Kick off a resilver.
3683	 */
3684	if (tasks & SPA_ASYNC_RESILVER) {
3685		mutex_enter(&spa_namespace_lock);
3686		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
3687		mutex_exit(&spa_namespace_lock);
3688	}
3689
3690	/*
3691	 * Let the world know that we're done.
3692	 */
3693	mutex_enter(&spa->spa_async_lock);
3694	spa->spa_async_thread = NULL;
3695	cv_broadcast(&spa->spa_async_cv);
3696	mutex_exit(&spa->spa_async_lock);
3697	thread_exit();
3698}
3699
3700void
3701spa_async_suspend(spa_t *spa)
3702{
3703	mutex_enter(&spa->spa_async_lock);
3704	spa->spa_async_suspended++;
3705	while (spa->spa_async_thread != NULL)
3706		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
3707	mutex_exit(&spa->spa_async_lock);
3708}
3709
3710void
3711spa_async_resume(spa_t *spa)
3712{
3713	mutex_enter(&spa->spa_async_lock);
3714	ASSERT(spa->spa_async_suspended != 0);
3715	spa->spa_async_suspended--;
3716	mutex_exit(&spa->spa_async_lock);
3717}
3718
3719static void
3720spa_async_dispatch(spa_t *spa)
3721{
3722	mutex_enter(&spa->spa_async_lock);
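	/*
	 * Only spawn a worker when there is work queued, we aren't
	 * suspended, no worker is already running, and the root filesystem
	 * is writable (presumably so async work isn't started during
	 * early boot).
	 */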
3723	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
3724	    spa->spa_async_thread == NULL &&
3725	    rootdir != NULL && !vn_is_readonly(rootdir))
3726		spa->spa_async_thread = thread_create(NULL, 0,
3727		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
3728	mutex_exit(&spa->spa_async_lock);
3729}
3730
3731void
3732spa_async_request(spa_t *spa, int task)
3733{
3734	mutex_enter(&spa->spa_async_lock);
3735	spa->spa_async_tasks |= task;
3736	mutex_exit(&spa->spa_async_lock);
3737}
3738
3739/*
3740 * ==========================================================================
3741 * SPA syncing routines
3742 * ==========================================================================
3743 */
3744
3745static void
3746spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
3747{
3748	bplist_t *bpl = &spa->spa_sync_bplist;
3749	dmu_tx_t *tx;
3750	blkptr_t blk;
3751	uint64_t itor = 0;
3752	zio_t *zio;
3753	int error;
3754	uint8_t c = 1;
3755
3756	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
3757
3758	while (bplist_iterate(bpl, &itor, &blk) == 0)
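	/*
	 * Issue a free for every block on the deferred-free list under a
	 * single root zio, then wait for all of them to complete.
	 */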
3759		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
3760
3761	error = zio_wait(zio);
3762	ASSERT3U(error, ==, 0);
3763
3764	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3765	bplist_vacate(bpl, tx);
3766
3767	/*
3768	 * Pre-dirty the first block so we sync to convergence faster.
3769	 * (Usually only the first block is needed.)
3770	 */
3771	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
3772	dmu_tx_commit(tx);
3773}
3774
3775static void
3776spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
3777{
3778	char *packed = NULL;
3779	size_t nvsize = 0;
3780	dmu_buf_t *db;
3781
3782	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
3783
3784	packed = kmem_alloc(nvsize, KM_SLEEP);
3785
3786	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
3787	    KM_SLEEP) == 0);
3788
3789	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
3790
3791	kmem_free(packed, nvsize);
3792
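	/*
	 * Record the packed size in the object's bonus buffer so the
	 * nvlist can be sized and unpacked when the pool is loaded.
	 */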
3793	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
3794	dmu_buf_will_dirty(db, tx);
3795	*(uint64_t *)db->db_data = nvsize;
3796	dmu_buf_rele(db, FTAG);
3797}
3798
3799static void
3800spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
3801    const char *config, const char *entry)
3802{
3803	nvlist_t *nvroot;
3804	nvlist_t **list;
3805	int i;
3806
3807	if (!sav->sav_sync)
3808		return;
3809
3810	/*
3811	 * Update the MOS nvlist describing the list of available devices.
3812	 * spa_validate_aux() will have already made sure this nvlist is
3813	 * valid and the vdevs are labeled appropriately.
3814	 */
3815	if (sav->sav_object == 0) {
3816		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
3817		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
3818		    sizeof (uint64_t), tx);
3819		VERIFY(zap_update(spa->spa_meta_objset,
3820		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
3821		    &sav->sav_object, tx) == 0);
3822	}
3823
3824	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3825	if (sav->sav_count == 0) {
3826		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
3827	} else {
3828		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
3829		for (i = 0; i < sav->sav_count; i++)
3830			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
3831			    B_FALSE, B_FALSE, B_TRUE);
3832		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
3833		    sav->sav_count) == 0);
3834		for (i = 0; i < sav->sav_count; i++)
3835			nvlist_free(list[i]);
3836		kmem_free(list, sav->sav_count * sizeof (void *));
3837	}
3838
3839	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
3840	nvlist_free(nvroot);
3841
3842	sav->sav_sync = B_FALSE;
3843}
3844
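/*
 * If any vdev configuration is dirty, write the pool config for this txg
 * to the MOS config object and remember it in spa_config_syncing until
 * the txg commits.
 */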
3845static void
3846spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
3847{
3848	nvlist_t *config;
3849
3850	if (list_is_empty(&spa->spa_dirty_list))
3851		return;
3852
3853	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
3854
3855	if (spa->spa_config_syncing)
3856		nvlist_free(spa->spa_config_syncing);
3857	spa->spa_config_syncing = config;
3858
3859	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
3860}
3861
3862/*
3863 * Set zpool properties.
3864 */
3865static void
3866spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3867{
3868	spa_t *spa = arg1;
3869	objset_t *mos = spa->spa_meta_objset;
3870	nvlist_t *nvp = arg2;
3871	nvpair_t *elem;
3872	uint64_t intval = 0;	/* %lld-logged below even for string props */
3873	char *strval, *slash;
3874	zpool_prop_t prop;
3875	const char *propname;
3876	zprop_type_t proptype;
3877
3878	elem = NULL;
3879	while ((elem = nvlist_next_nvpair(nvp, elem))) {
3880		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
3881		case ZPOOL_PROP_VERSION:
3882			/*
3883			 * Only set version for non-zpool-creation cases
3884			 * (set/import). spa_create() needs special care
3885			 * for version setting.
3886			 */
3887			if (tx->tx_txg != TXG_INITIAL) {
3888				VERIFY(nvpair_value_uint64(elem,
3889				    &intval) == 0);
3890				ASSERT(intval <= SPA_VERSION);
3891				ASSERT(intval >= spa_version(spa));
3892				spa->spa_uberblock.ub_version = intval;
3893				vdev_config_dirty(spa->spa_root_vdev);
3894			}
3895			break;
3896
3897		case ZPOOL_PROP_ALTROOT:
3898			/*
3899			 * 'altroot' is a non-persistent property. It should
3900			 * have been set temporarily at creation or import time.
3901			 */
3902			ASSERT(spa->spa_root != NULL);
3903			break;
3904
3905		case ZPOOL_PROP_CACHEFILE:
3906			/*
3907			 * 'cachefile' is a non-persistent property, but we
3908			 * post an async request so that the config cache
3909			 * gets updated.
3910			 */
3911			VERIFY(nvpair_value_string(elem, &strval) == 0);
3912			if (spa->spa_config_dir)
3913				spa_strfree(spa->spa_config_dir);
3914			if (spa->spa_config_file)
3915				spa_strfree(spa->spa_config_file);
3916
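			/*
			 * An empty string reverts to the default cache file,
			 * "none" disables use of a cache file, and any other
			 * value names an alternate path that is split into
			 * directory and file components below.
			 */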
3917			if (strval[0] == '\0') {
3918				spa->spa_config_dir = NULL;
3919				spa->spa_config_file = NULL;
3920			} else if (strcmp(strval, "none") == 0) {
3921				spa->spa_config_dir = spa_strdup(strval);
3922				spa->spa_config_file = NULL;
3923			} else {
3924				/*
3925				 * If the cachefile is in the root directory,
3926				 * we will end up with an empty string for
3927				 * spa_config_dir.  This value is only ever
3928				 * used when concatenated with '/', so an empty
3929				 * string still behaves correctly and keeps the
3930				 * rest of the code simple.
3931				 */
3932				slash = strrchr(strval, '/');
3933				ASSERT(slash != NULL);
3934				*slash = '\0';
3935				if (strcmp(strval, spa_config_dir) == 0 &&
3936				    strcmp(slash + 1, ZPOOL_CACHE_FILE) == 0) {
3937					spa->spa_config_dir = NULL;
3938					spa->spa_config_file = NULL;
3939				} else {
3940					spa->spa_config_dir =
3941					    spa_strdup(strval);
3942					spa->spa_config_file =
3943					    spa_strdup(slash + 1);
3944				}
3945			}
3946			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3947			break;
3948		default:
3949			/*
3950			 * Set pool property values in the poolprops mos object.
3951			 */
3952			mutex_enter(&spa->spa_props_lock);
3953			if (spa->spa_pool_props_object == 0) {
3954				objset_t *mos = spa->spa_meta_objset;
3955
3956				VERIFY((spa->spa_pool_props_object =
3957				    zap_create(mos, DMU_OT_POOL_PROPS,
3958				    DMU_OT_NONE, 0, tx)) > 0);
3959
3960				VERIFY(zap_update(mos,
3961				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
3962				    8, 1, &spa->spa_pool_props_object, tx)
3963				    == 0);
3964			}
3965			mutex_exit(&spa->spa_props_lock);
3966
3967			/* normalize the property name */
3968			propname = zpool_prop_to_name(prop);
3969			proptype = zpool_prop_get_type(prop);
3970
3971			if (nvpair_type(elem) == DATA_TYPE_STRING) {
3972				ASSERT(proptype == PROP_TYPE_STRING);
3973				VERIFY(nvpair_value_string(elem, &strval) == 0);
3974				VERIFY(zap_update(mos,
3975				    spa->spa_pool_props_object, propname,
3976				    1, strlen(strval) + 1, strval, tx) == 0);
3977
3978			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
3979				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
3980
3981				if (proptype == PROP_TYPE_INDEX) {
3982					const char *unused;
3983					VERIFY(zpool_prop_index_to_string(
3984					    prop, intval, &unused) == 0);
3985				}
3986				VERIFY(zap_update(mos,
3987				    spa->spa_pool_props_object, propname,
3988				    8, 1, &intval, tx) == 0);
3989			} else {
3990				ASSERT(0); /* not allowed */
3991			}
3992
3993			switch (prop) {
3994			case ZPOOL_PROP_DELEGATION:
3995				spa->spa_delegation = intval;
3996				break;
3997			case ZPOOL_PROP_BOOTFS:
3998				spa->spa_bootfs = intval;
3999				break;
4000			case ZPOOL_PROP_FAILUREMODE:
4001				spa->spa_failmode = intval;
4002				break;
4003			default:
4004				break;
4005			}
4006		}
4007
4008		/* log internal history if this is not a zpool create */
4009		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
4010		    tx->tx_txg != TXG_INITIAL) {
4011			spa_history_internal_log(LOG_POOL_PROPSET,
4012			    spa, tx, cr, "%s %lld %s",
4013			    nvpair_name(elem), intval, spa->spa_name);
4014		}
4015	}
4016}
4017
4018/*
4019 * Sync the specified transaction group.  New blocks may be dirtied as
4020 * part of the process, so we iterate until it converges.
4021 */
4022void
4023spa_sync(spa_t *spa, uint64_t txg)
4024{
4025	dsl_pool_t *dp = spa->spa_dsl_pool;
4026	objset_t *mos = spa->spa_meta_objset;
4027	bplist_t *bpl = &spa->spa_sync_bplist;
4028	vdev_t *rvd = spa->spa_root_vdev;
4029	vdev_t *vd;
4030	vdev_t *svd[SPA_DVAS_PER_BP];
4031	int svdcount = 0;
4032	dmu_tx_t *tx;
4033	int dirty_vdevs;
4034
4035	/*
4036	 * Lock out configuration changes.
4037	 */
4038	spa_config_enter(spa, RW_READER, FTAG);
4039
4040	spa->spa_syncing_txg = txg;
4041	spa->spa_sync_pass = 0;
4042
4043	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
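	/*
	 * Open the persistent deferred-free list so it can be updated and
	 * synced (via bplist_sync()) during this txg.
	 */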
4044
4045	tx = dmu_tx_create_assigned(dp, txg);
4046
4047	/*
4048	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
4049	 * set spa_deflate if we have no raid-z vdevs.
4050	 */
4051	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
4052	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
4053		int i;
4054
4055		for (i = 0; i < rvd->vdev_children; i++) {
4056			vd = rvd->vdev_child[i];
4057			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
4058				break;
4059		}
4060		if (i == rvd->vdev_children) {
4061			spa->spa_deflate = TRUE;
4062			VERIFY(0 == zap_add(spa->spa_meta_objset,
4063			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4064			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
4065		}
4066	}
4067
4068	/*
4069	 * If anything has changed in this txg, push the deferred frees
4070	 * from the previous txg.  If not, leave them alone so that we
4071	 * don't generate work on an otherwise idle system.
4072	 */
4073	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
4074	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
4075	    !txg_list_empty(&dp->dp_sync_tasks, txg))
4076		spa_sync_deferred_frees(spa, txg);
4077
4078	/*
4079	 * Iterate to convergence.
4080	 */
4081	do {
4082		spa->spa_sync_pass++;
4083
4084		spa_sync_config_object(spa, tx);
4085		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
4086		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
4087		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
4088		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
4089		spa_errlog_sync(spa, txg);
4090		dsl_pool_sync(dp, txg);
4091
4092		dirty_vdevs = 0;
4093		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
4094			vdev_sync(vd, txg);
4095			dirty_vdevs++;
4096		}
4097
4098		bplist_sync(bpl, tx);
4099	} while (dirty_vdevs);
4100
4101	bplist_close(bpl);
4102
4103	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
4104
4105	/*
4106	 * Rewrite the vdev configuration (which includes the uberblock)
4107	 * to commit the transaction group.
4108	 *
4109	 * If there are no dirty vdevs, we sync the uberblock to a few
4110	 * random top-level vdevs that are known to be visible in the
4111	 * config cache (see spa_vdev_add() for details).  If there *are*
4112	 * dirty vdevs -- or if the sync to our random subset fails --
4113	 * then sync the uberblock to all vdevs.
4114	 */
4115	if (list_is_empty(&spa->spa_dirty_list)) {
4116		int children = rvd->vdev_children;
4117		int c0 = spa_get_random(children);
4118		int c;
4119
4120		for (c = 0; c < children; c++) {
4121			vd = rvd->vdev_child[(c0 + c) % children];
4122			if (vd->vdev_ms_array == 0 || vd->vdev_islog)
4123				continue;
4124			svd[svdcount++] = vd;
4125			if (svdcount == SPA_DVAS_PER_BP)
4126				break;
4127		}
4128	}
4129	if (svdcount == 0 || vdev_config_sync(svd, svdcount, txg) != 0)
4130		VERIFY3U(vdev_config_sync(rvd->vdev_child,
4131		    rvd->vdev_children, txg), ==, 0);
4132
4133	dmu_tx_commit(tx);
4134
4135	/*
4136	 * Clear the dirty config list.
4137	 */
4138	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
4139		vdev_config_clean(vd);
4140
4141	/*
4142	 * Now that the new config has synced transactionally,
4143	 * let it become visible to the config cache.
4144	 */
4145	if (spa->spa_config_syncing != NULL) {
4146		spa_config_set(spa, spa->spa_config_syncing);
4147		spa->spa_config_txg = txg;
4148		spa->spa_config_syncing = NULL;
4149	}
4150
4151	/*
4152	 * Make a stable copy of the fully synced uberblock.
4153	 * We use this as the root for pool traversals.
4154	 */
4155	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
4156
4157	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
4158
4159	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
4160	spa->spa_traverse_wanted = 0;
4161	spa->spa_ubsync = spa->spa_uberblock;
4162	rw_exit(&spa->spa_traverse_lock);
4163
4164	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
4165
4166	/*
4167	 * Clean up the ZIL records for the synced txg.
4168	 */
4169	dsl_pool_zil_clean(dp);
4170
4171	/*
4172	 * Update usable space statistics.
4173	 */
4174	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
4175		vdev_sync_done(vd, txg);
4176
4177	/*
4178	 * It had better be the case that we didn't dirty anything
4179	 * since vdev_config_sync().
4180	 */
4181	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
4182	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
4183	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
4184	ASSERT(bpl->bpl_queue == NULL);
4185
4186	spa_config_exit(spa, FTAG);
4187
4188	/*
4189	 * If any async tasks have been requested, kick them off.
4190	 */
4191	spa_async_dispatch(spa);
4192}
4193
4194/*
4195 * Sync all pools.  We don't want to hold the namespace lock across these
4196 * operations, so we take a reference on the spa_t and drop the lock during the
4197 * sync.
4198 */
4199void
4200spa_sync_allpools(void)
4201{
4202	spa_t *spa = NULL;
4203	mutex_enter(&spa_namespace_lock);
4204	while ((spa = spa_next(spa)) != NULL) {
4205		if (spa_state(spa) != POOL_STATE_ACTIVE)
4206			continue;
4207		spa_open_ref(spa, FTAG);
4208		mutex_exit(&spa_namespace_lock);
4209		txg_wait_synced(spa_get_dsl(spa), 0);
4210		mutex_enter(&spa_namespace_lock);
4211		spa_close(spa, FTAG);
4212	}
4213	mutex_exit(&spa_namespace_lock);
4214}
4215
4216/*
4217 * ==========================================================================
4218 * Miscellaneous routines
4219 * ==========================================================================
4220 */
4221
4222/*
4223 * Remove all pools in the system.
4224 */
4225void
4226spa_evict_all(void)
4227{
4228	spa_t *spa;
4229
4230	/*
4231	 * Remove all cached state.  All pools should be closed now,
4232	 * so every spa in the AVL tree should be unreferenced.
4233	 */
4234	mutex_enter(&spa_namespace_lock);
4235	while ((spa = spa_next(NULL)) != NULL) {
4236		/*
4237		 * Stop async tasks.  The async thread may need to detach
4238		 * a device that's been replaced, which requires grabbing
4239		 * spa_namespace_lock, so we must drop it here.
4240		 */
4241		spa_open_ref(spa, FTAG);
4242		mutex_exit(&spa_namespace_lock);
4243		spa_async_suspend(spa);
4244		mutex_enter(&spa_namespace_lock);
4245		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
4246		spa_close(spa, FTAG);
4247
4248		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4249			spa_unload(spa);
4250			spa_deactivate(spa);
4251		}
4252		spa_remove(spa);
4253	}
4254	mutex_exit(&spa_namespace_lock);
4255}
4256
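/*
 * Look up a vdev anywhere in this pool's vdev tree by guid.
 */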
4257vdev_t *
4258spa_lookup_by_guid(spa_t *spa, uint64_t guid)
4259{
4260	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
4261}
4262
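/*
 * Upgrade the pool's on-disk version.  The new version takes effect once
 * the dirtied vdev configuration has synced to disk.
 */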
4263void
4264spa_upgrade(spa_t *spa, uint64_t version)
4265{
4266	spa_config_enter(spa, RW_WRITER, FTAG);
4267
4268	/*
4269	 * This should only be called for a non-faulted pool, and since a pool
4270	 * with a version beyond SPA_VERSION could never have been opened, the
4271	 * first assertion below should never fail.
4272	 */
4273	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
4274	ASSERT(version >= spa->spa_uberblock.ub_version);
4275
4276	spa->spa_uberblock.ub_version = version;
4277	vdev_config_dirty(spa->spa_root_vdev);
4278
4279	spa_config_exit(spa, FTAG);
4280
4281	txg_wait_synced(spa_get_dsl(spa), 0);
4282}
4283
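/*
 * Determine whether the given guid refers to one of this pool's hot spares,
 * either already configured or still pending addition.
 */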
4284boolean_t
4285spa_has_spare(spa_t *spa, uint64_t guid)
4286{
4287	int i;
4288	uint64_t spareguid;
4289	spa_aux_vdev_t *sav = &spa->spa_spares;
4290
4291	for (i = 0; i < sav->sav_count; i++)
4292		if (sav->sav_vdevs[i]->vdev_guid == guid)
4293			return (B_TRUE);
4294
4295	for (i = 0; i < sav->sav_npending; i++) {
4296		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
4297		    &spareguid) == 0 && spareguid == guid)
4298			return (B_TRUE);
4299	}
4300
4301	return (B_FALSE);
4302}
4303
4304/*
4305 * Post a sysevent corresponding to the given event.  The 'name' must be one of
4306 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
4307 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
4308 * in the userland libzpool, as we don't want consumers to misinterpret ztest
4309 * or zdb as real changes.
4310 */
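/*
 * A typical call looks like the following sketch (the event name is an
 * illustrative example; the actual names are defined in
 * sys/sysevent/eventdefs.h):
 *
 *	spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
 */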
4311void
4312spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
4313{
4314#ifdef _KERNEL
4315	sysevent_t		*ev;
4316	sysevent_attr_list_t	*attr = NULL;
4317	sysevent_value_t	value;
4318	sysevent_id_t		eid;
4319
4320	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
4321	    SE_SLEEP);
4322
4323	value.value_type = SE_DATA_TYPE_STRING;
4324	value.value.sv_string = spa_name(spa);
4325	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
4326		goto done;
4327
4328	value.value_type = SE_DATA_TYPE_UINT64;
4329	value.value.sv_uint64 = spa_guid(spa);
4330	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
4331		goto done;
4332
4333	if (vd) {
4334		value.value_type = SE_DATA_TYPE_UINT64;
4335		value.value.sv_uint64 = vd->vdev_guid;
4336		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
4337		    SE_SLEEP) != 0)
4338			goto done;
4339
4340		if (vd->vdev_path) {
4341			value.value_type = SE_DATA_TYPE_STRING;
4342			value.value.sv_string = vd->vdev_path;
4343			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
4344			    &value, SE_SLEEP) != 0)
4345				goto done;
4346		}
4347	}
4348
4349	if (sysevent_attach_attributes(ev, attr) != 0)
4350		goto done;
4351	attr = NULL;
4352
4353	(void) log_sysevent(ev, SE_SLEEP, &eid);
4354
4355done:
4356	if (attr)
4357		sysevent_free_attr(attr);
4358	sysevent_free(ev);
4359#endif
4360}
4361