xref: /illumos-gate/usr/src/uts/common/fs/zfs/spa.c (revision 39c23413b8df94a95f67b34cfd4a4dfc3fd0b48d)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

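/*
 * Number of threads in each of the per-pool zio taskqs created by
 * spa_activate() below.
 */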
int zio_taskq_threads = 8;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

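/*
 * AVL comparator for spa_error_entry_t nodes; entries are ordered by the
 * contents of their bookmarks.
 */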
static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

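	/*
	 * Each zio type gets a pair of taskqs: "spa_zio_issue" threads
	 * issue I/Os of that type, and "spa_zio_intr" threads service
	 * their interrupt-side completions.
	 */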
	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the calls to spa_spare_add()/spa_spare_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

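		/*
		 * A spare acts as its own top-level vdev, so set vdev_top
		 * before validating its label.
		 */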
		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

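/*
 * Read a packed nvlist back out of the MOS: the object's bonus buffer
 * holds the packed size, and the object contents hold the packed nvlist
 * itself.
 */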
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = ZFS_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		error = EBADF;
		goto out;
	}

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

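	/*
	 * If this pass loaded the cached config rather than the copy stored
	 * in the MOS, fetch the trusted copy from the MOS and restart the
	 * load with it.
	 */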
	if (!mosconfig) {
		nvlist_t *newconfig;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync
			 * and we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

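/*
 * Open a pool by name.  A sketch of the typical consumer pattern (error
 * handling elided; spa_get_stats() below pairs the open with spa_close()
 * in exactly this way):
 *
 *	spa_t *spa;
 *	if ((error = spa_open(name, &spa, FTAG)) != 0)
 *		return (error);
 *	...
 *	spa_close(spa, FTAG);
 */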
int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

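/*
 * Drop a reference obtained via spa_inject_addref().
 */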
void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

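/*
 * Add the list of hot spares to the given pool config nvlist, marking
 * any spare that is currently in active use as VDEV_AUX_SPARED.
 */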
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as active spares.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each of which describes a valid leaf vdev.  If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < ZFS_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools are always deflated. */
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add deflate");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	spa_history_create_obj(spa, tx);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
1326fa9e4066Sahrens 	 */
1327ecc2d604Sbonwick 	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
1328fa9e4066Sahrens 
1329fa9e4066Sahrens 	/*
1330fa9e4066Sahrens 	 * If 'tryconfig' was at least parsable, return the current config.
1331fa9e4066Sahrens 	 */
1332fa9e4066Sahrens 	if (spa->spa_root_vdev != NULL) {
13330373e76bSbonwick 		spa_config_enter(spa, RW_READER, FTAG);
1334fa9e4066Sahrens 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
13350373e76bSbonwick 		spa_config_exit(spa, FTAG);
1336fa9e4066Sahrens 		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
1337fa9e4066Sahrens 		    poolname) == 0);
1338fa9e4066Sahrens 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
1339fa9e4066Sahrens 		    state) == 0);
134099653d4eSeschrock 
134199653d4eSeschrock 		/*
134299653d4eSeschrock 		 * Add the list of hot spares.
134399653d4eSeschrock 		 */
134499653d4eSeschrock 		spa_add_spares(spa, config);
1345fa9e4066Sahrens 	}
1346fa9e4066Sahrens 
1347fa9e4066Sahrens 	spa_unload(spa);
1348fa9e4066Sahrens 	spa_deactivate(spa);
1349fa9e4066Sahrens 	spa_remove(spa);
1350fa9e4066Sahrens 	mutex_exit(&spa_namespace_lock);
1351fa9e4066Sahrens 
1352fa9e4066Sahrens 	return (config);
1353fa9e4066Sahrens }
1354fa9e4066Sahrens 
1355fa9e4066Sahrens /*
1356fa9e4066Sahrens  * Pool export/destroy
1357fa9e4066Sahrens  *
1358fa9e4066Sahrens  * The act of destroying or exporting a pool is very simple.  We make sure there
1359fa9e4066Sahrens  * is no more pending I/O and any references to the pool are gone.  Then, we
1360fa9e4066Sahrens  * update the pool state and sync all the labels to disk, removing the
1361fa9e4066Sahrens  * configuration from the cache afterwards.
1362fa9e4066Sahrens  */
1363fa9e4066Sahrens static int
136444cd46caSbillm spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
1365fa9e4066Sahrens {
1366fa9e4066Sahrens 	spa_t *spa;
1367fa9e4066Sahrens 
136844cd46caSbillm 	if (oldconfig)
136944cd46caSbillm 		*oldconfig = NULL;
137044cd46caSbillm 
1371fa9e4066Sahrens 	if (!(spa_mode & FWRITE))
1372fa9e4066Sahrens 		return (EROFS);
1373fa9e4066Sahrens 
1374fa9e4066Sahrens 	mutex_enter(&spa_namespace_lock);
1375fa9e4066Sahrens 	if ((spa = spa_lookup(pool)) == NULL) {
1376fa9e4066Sahrens 		mutex_exit(&spa_namespace_lock);
1377fa9e4066Sahrens 		return (ENOENT);
1378fa9e4066Sahrens 	}
1379fa9e4066Sahrens 
1380ea8dc4b6Seschrock 	/*
1381ea8dc4b6Seschrock 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
1382ea8dc4b6Seschrock 	 * reacquire the namespace lock, and see if we can export.
1383ea8dc4b6Seschrock 	 */
1384ea8dc4b6Seschrock 	spa_open_ref(spa, FTAG);
1385ea8dc4b6Seschrock 	mutex_exit(&spa_namespace_lock);
1386ea8dc4b6Seschrock 	spa_async_suspend(spa);
1387ea8dc4b6Seschrock 	mutex_enter(&spa_namespace_lock);
1388ea8dc4b6Seschrock 	spa_close(spa, FTAG);
1389ea8dc4b6Seschrock 
1390fa9e4066Sahrens 	/*
1391fa9e4066Sahrens 	 * The pool will be in core if it's openable,
1392fa9e4066Sahrens 	 * in which case we can modify its state.
1393fa9e4066Sahrens 	 */
1394fa9e4066Sahrens 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
1395fa9e4066Sahrens 		/*
1396fa9e4066Sahrens 		 * Objsets may be open only because they're dirty, so we
1397fa9e4066Sahrens 		 * have to force the pool to sync before checking spa_refcnt.
1398fa9e4066Sahrens 		 */
1399fa9e4066Sahrens 		spa_scrub_suspend(spa);
1400fa9e4066Sahrens 		txg_wait_synced(spa->spa_dsl_pool, 0);
1401fa9e4066Sahrens 
1402ea8dc4b6Seschrock 		/*
1403ea8dc4b6Seschrock 		 * A pool cannot be exported or destroyed if there are active
1404ea8dc4b6Seschrock 		 * references.  If we are resetting a pool, allow references by
1405ea8dc4b6Seschrock 		 * fault injection handlers.
1406ea8dc4b6Seschrock 		 */
1407ea8dc4b6Seschrock 		if (!spa_refcount_zero(spa) ||
1408ea8dc4b6Seschrock 		    (spa->spa_inject_ref != 0 &&
1409ea8dc4b6Seschrock 		    new_state != POOL_STATE_UNINITIALIZED)) {
1410fa9e4066Sahrens 			spa_scrub_resume(spa);
1411ea8dc4b6Seschrock 			spa_async_resume(spa);
1412fa9e4066Sahrens 			mutex_exit(&spa_namespace_lock);
1413fa9e4066Sahrens 			return (EBUSY);
1414fa9e4066Sahrens 		}
1415fa9e4066Sahrens 
1416fa9e4066Sahrens 		spa_scrub_resume(spa);
1417fa9e4066Sahrens 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
1418fa9e4066Sahrens 
1419fa9e4066Sahrens 		/*
1420fa9e4066Sahrens 		 * We want this to be reflected on every label,
1421fa9e4066Sahrens 		 * so mark them all dirty.  spa_unload() will do the
1422fa9e4066Sahrens 		 * final sync that pushes these changes out.
1423fa9e4066Sahrens 		 */
1424ea8dc4b6Seschrock 		if (new_state != POOL_STATE_UNINITIALIZED) {
14255dabedeeSbonwick 			spa_config_enter(spa, RW_WRITER, FTAG);
1426ea8dc4b6Seschrock 			spa->spa_state = new_state;
14270373e76bSbonwick 			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
1428ea8dc4b6Seschrock 			vdev_config_dirty(spa->spa_root_vdev);
14295dabedeeSbonwick 			spa_config_exit(spa, FTAG);
1430ea8dc4b6Seschrock 		}
1431fa9e4066Sahrens 	}
1432fa9e4066Sahrens 
1433fa9e4066Sahrens 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
1434fa9e4066Sahrens 		spa_unload(spa);
1435fa9e4066Sahrens 		spa_deactivate(spa);
1436fa9e4066Sahrens 	}
1437fa9e4066Sahrens 
143844cd46caSbillm 	if (oldconfig && spa->spa_config)
143944cd46caSbillm 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
144044cd46caSbillm 
1441ea8dc4b6Seschrock 	if (new_state != POOL_STATE_UNINITIALIZED) {
1442ea8dc4b6Seschrock 		spa_remove(spa);
1443ea8dc4b6Seschrock 		spa_config_sync();
1444ea8dc4b6Seschrock 	}
1445fa9e4066Sahrens 	mutex_exit(&spa_namespace_lock);
1446fa9e4066Sahrens 
1447fa9e4066Sahrens 	return (0);
1448fa9e4066Sahrens }
1449fa9e4066Sahrens 
1450fa9e4066Sahrens /*
1451fa9e4066Sahrens  * Destroy a storage pool.
1452fa9e4066Sahrens  */
1453fa9e4066Sahrens int
1454fa9e4066Sahrens spa_destroy(char *pool)
1455fa9e4066Sahrens {
145644cd46caSbillm 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
1457fa9e4066Sahrens }
1458fa9e4066Sahrens 
1459fa9e4066Sahrens /*
1460fa9e4066Sahrens  * Export a storage pool.
1461fa9e4066Sahrens  */
1462fa9e4066Sahrens int
146344cd46caSbillm spa_export(char *pool, nvlist_t **oldconfig)
1464fa9e4066Sahrens {
146544cd46caSbillm 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
1466fa9e4066Sahrens }
1467fa9e4066Sahrens 
1468ea8dc4b6Seschrock /*
1469ea8dc4b6Seschrock  * Similar to spa_export(), this unloads the spa_t without actually removing it
1470ea8dc4b6Seschrock  * from the namespace in any way.
1471ea8dc4b6Seschrock  */
1472ea8dc4b6Seschrock int
1473ea8dc4b6Seschrock spa_reset(char *pool)
1474ea8dc4b6Seschrock {
147544cd46caSbillm 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
1476ea8dc4b6Seschrock }
1477ea8dc4b6Seschrock 
1478ea8dc4b6Seschrock 
1479fa9e4066Sahrens /*
1480fa9e4066Sahrens  * ==========================================================================
1481fa9e4066Sahrens  * Device manipulation
1482fa9e4066Sahrens  * ==========================================================================
1483fa9e4066Sahrens  */
1484fa9e4066Sahrens 
1485fa9e4066Sahrens /*
1486fa9e4066Sahrens  * Add capacity to a storage pool.
1487fa9e4066Sahrens  */
1488fa9e4066Sahrens int
1489fa9e4066Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
1490fa9e4066Sahrens {
1491fa9e4066Sahrens 	uint64_t txg;
14920373e76bSbonwick 	int c, error;
1493fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
14940e34b6a7Sbonwick 	vdev_t *vd, *tvd;
149599653d4eSeschrock 	nvlist_t **spares;
149699653d4eSeschrock 	uint_t i, nspares;
1497fa9e4066Sahrens 
1498fa9e4066Sahrens 	txg = spa_vdev_enter(spa);
1499fa9e4066Sahrens 
150099653d4eSeschrock 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
150199653d4eSeschrock 	    VDEV_ALLOC_ADD)) != 0)
150299653d4eSeschrock 		return (spa_vdev_exit(spa, NULL, txg, error));
1503fa9e4066Sahrens 
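	/*
	 * Publish the in-flight vdev in spa_pending_vdev for the duration
	 * of the add, so that spare validation can account for devices
	 * that are not yet part of the vdev tree.  It is cleared again on
	 * every exit path below.
	 */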
1504*39c23413Seschrock 	spa->spa_pending_vdev = vd;
150599653d4eSeschrock 
150699653d4eSeschrock 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
150799653d4eSeschrock 	    &spares, &nspares) != 0)
150899653d4eSeschrock 		nspares = 0;
150999653d4eSeschrock 
1510*39c23413Seschrock 	if (vd->vdev_children == 0 && nspares == 0) {
1511*39c23413Seschrock 		spa->spa_pending_vdev = NULL;
1512fa9e4066Sahrens 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
1513*39c23413Seschrock 	}
1514fa9e4066Sahrens 
151599653d4eSeschrock 	if (vd->vdev_children != 0) {
1516*39c23413Seschrock 		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
1517*39c23413Seschrock 			spa->spa_pending_vdev = NULL;
151899653d4eSeschrock 			return (spa_vdev_exit(spa, vd, txg, error));
151999653d4eSeschrock 		}
152099653d4eSeschrock 	}
152199653d4eSeschrock 
1522*39c23413Seschrock 	/*
1523*39c23413Seschrock 	 * We must validate the spares after checking the children.  Otherwise,
1524*39c23413Seschrock 	 * vdev_inuse() will blindly overwrite the spare.
1525*39c23413Seschrock 	 */
1526*39c23413Seschrock 	if ((error = spa_validate_spares(spa, nvroot, txg,
1527*39c23413Seschrock 	    VDEV_ALLOC_ADD)) != 0) {
1528*39c23413Seschrock 		spa->spa_pending_vdev = NULL;
1529*39c23413Seschrock 		return (spa_vdev_exit(spa, vd, txg, error));
1530*39c23413Seschrock 	}
1531*39c23413Seschrock 
1532*39c23413Seschrock 	spa->spa_pending_vdev = NULL;
1533*39c23413Seschrock 
1534*39c23413Seschrock 	/*
1535*39c23413Seschrock 	 * Transfer each new top-level vdev from vd to rvd.
1536*39c23413Seschrock 	 */
1537*39c23413Seschrock 	for (c = 0; c < vd->vdev_children; c++) {
1538*39c23413Seschrock 		tvd = vd->vdev_child[c];
1539*39c23413Seschrock 		vdev_remove_child(vd, tvd);
1540*39c23413Seschrock 		tvd->vdev_id = rvd->vdev_children;
1541*39c23413Seschrock 		vdev_add_child(rvd, tvd);
1542*39c23413Seschrock 		vdev_config_dirty(tvd);
1543*39c23413Seschrock 	}
1544*39c23413Seschrock 
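	/*
	 * Merge the newly added spares into any existing sparelist by
	 * duplicating both the old and new entries into a single array
	 * and reinstalling it under ZPOOL_CONFIG_SPARES.
	 */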
154599653d4eSeschrock 	if (nspares != 0) {
154699653d4eSeschrock 		if (spa->spa_sparelist != NULL) {
154799653d4eSeschrock 			nvlist_t **oldspares;
154899653d4eSeschrock 			uint_t oldnspares;
154999653d4eSeschrock 			nvlist_t **newspares;
155099653d4eSeschrock 
155199653d4eSeschrock 			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
155299653d4eSeschrock 			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
155399653d4eSeschrock 
155499653d4eSeschrock 			newspares = kmem_alloc(sizeof (void *) *
155599653d4eSeschrock 			    (nspares + oldnspares), KM_SLEEP);
155699653d4eSeschrock 			for (i = 0; i < oldnspares; i++)
155799653d4eSeschrock 				VERIFY(nvlist_dup(oldspares[i],
155899653d4eSeschrock 				    &newspares[i], KM_SLEEP) == 0);
155999653d4eSeschrock 			for (i = 0; i < nspares; i++)
156099653d4eSeschrock 				VERIFY(nvlist_dup(spares[i],
156199653d4eSeschrock 				    &newspares[i + oldnspares],
156299653d4eSeschrock 				    KM_SLEEP) == 0);
156399653d4eSeschrock 
156499653d4eSeschrock 			VERIFY(nvlist_remove(spa->spa_sparelist,
156599653d4eSeschrock 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
156699653d4eSeschrock 
156799653d4eSeschrock 			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
156899653d4eSeschrock 			    ZPOOL_CONFIG_SPARES, newspares,
156999653d4eSeschrock 			    nspares + oldnspares) == 0);
157099653d4eSeschrock 			for (i = 0; i < oldnspares + nspares; i++)
157199653d4eSeschrock 				nvlist_free(newspares[i]);
157299653d4eSeschrock 			kmem_free(newspares, (oldnspares + nspares) *
157399653d4eSeschrock 			    sizeof (void *));
157499653d4eSeschrock 		} else {
157599653d4eSeschrock 			VERIFY(nvlist_alloc(&spa->spa_sparelist,
157699653d4eSeschrock 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
157799653d4eSeschrock 			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
157899653d4eSeschrock 			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
157999653d4eSeschrock 		}
158099653d4eSeschrock 
158199653d4eSeschrock 		spa_load_spares(spa);
158299653d4eSeschrock 		spa->spa_sync_spares = B_TRUE;
1583fa9e4066Sahrens 	}
1584fa9e4066Sahrens 
1585fa9e4066Sahrens 	/*
15860e34b6a7Sbonwick 	 * We have to be careful when adding new vdevs to an existing pool.
15870e34b6a7Sbonwick 	 * If other threads start allocating from these vdevs before we
15880e34b6a7Sbonwick 	 * sync the config cache, and we lose power, then upon reboot we may
15890e34b6a7Sbonwick 	 * fail to open the pool because there are DVAs that the config cache
15900e34b6a7Sbonwick 	 * can't translate.  Therefore, we first add the vdevs without
15910e34b6a7Sbonwick 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
15920373e76bSbonwick 	 * and then let spa_config_update() initialize the new metaslabs.
15930e34b6a7Sbonwick 	 *
15940e34b6a7Sbonwick 	 * spa_load() checks for added-but-not-initialized vdevs, so that
15950e34b6a7Sbonwick 	 * if we lose power at any point in this sequence, the remaining
15960e34b6a7Sbonwick 	 * steps will be completed the next time we load the pool.
15970e34b6a7Sbonwick 	 */
15980373e76bSbonwick 	(void) spa_vdev_exit(spa, vd, txg, 0);
15990e34b6a7Sbonwick 
16000373e76bSbonwick 	mutex_enter(&spa_namespace_lock);
16010373e76bSbonwick 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
16020373e76bSbonwick 	mutex_exit(&spa_namespace_lock);
1603fa9e4066Sahrens 
16040373e76bSbonwick 	return (0);
1605fa9e4066Sahrens }
1606fa9e4066Sahrens 
1607fa9e4066Sahrens /*
1608fa9e4066Sahrens  * Attach a device to a mirror.  The arguments are the path to any device
1609fa9e4066Sahrens  * in the mirror, and the nvroot for the new device.  If the path specifies
1610fa9e4066Sahrens  * a device that is not mirrored, we automatically insert the mirror vdev.
1611fa9e4066Sahrens  *
1612fa9e4066Sahrens  * If 'replacing' is specified, the new device is intended to replace the
1613fa9e4066Sahrens  * existing device; in this case the two devices are made into their own
1614fa9e4066Sahrens  * mirror using the 'replacing' vdev, which is functionally identical to
1615fa9e4066Sahrens  * the mirror vdev (it actually reuses all the same ops) but has a few
1616fa9e4066Sahrens  * extra rules: you can't attach to it after it's been created, and upon
1617fa9e4066Sahrens  * completion of resilvering, the first disk (the one being replaced)
1618fa9e4066Sahrens  * is automatically detached.
1619fa9e4066Sahrens  */
1620fa9e4066Sahrens int
1621ea8dc4b6Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
1622fa9e4066Sahrens {
1623fa9e4066Sahrens 	uint64_t txg, open_txg;
1624fa9e4066Sahrens 	int error;
1625fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
1626fa9e4066Sahrens 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
162799653d4eSeschrock 	vdev_ops_t *pvops;
1628fa9e4066Sahrens 
1629fa9e4066Sahrens 	txg = spa_vdev_enter(spa);
1630fa9e4066Sahrens 
1631ea8dc4b6Seschrock 	oldvd = vdev_lookup_by_guid(rvd, guid);
1632fa9e4066Sahrens 
1633fa9e4066Sahrens 	if (oldvd == NULL)
1634fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1635fa9e4066Sahrens 
16360e34b6a7Sbonwick 	if (!oldvd->vdev_ops->vdev_op_leaf)
16370e34b6a7Sbonwick 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
16380e34b6a7Sbonwick 
1639fa9e4066Sahrens 	pvd = oldvd->vdev_parent;
1640fa9e4066Sahrens 
164199653d4eSeschrock 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
164299653d4eSeschrock 	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
1643fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1644fa9e4066Sahrens 
1645fa9e4066Sahrens 	newvd = newrootvd->vdev_child[0];
1646fa9e4066Sahrens 
1647fa9e4066Sahrens 	if (!newvd->vdev_ops->vdev_op_leaf)
1648fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1649fa9e4066Sahrens 
165099653d4eSeschrock 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
1651fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, error));
1652fa9e4066Sahrens 
165399653d4eSeschrock 	if (!replacing) {
165499653d4eSeschrock 		/*
165599653d4eSeschrock 		 * For attach, the only allowable parent is a mirror or the root
165699653d4eSeschrock 		 * vdev.
165799653d4eSeschrock 		 */
165899653d4eSeschrock 		if (pvd->vdev_ops != &vdev_mirror_ops &&
165999653d4eSeschrock 		    pvd->vdev_ops != &vdev_root_ops)
166099653d4eSeschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
166199653d4eSeschrock 
166299653d4eSeschrock 		pvops = &vdev_mirror_ops;
166399653d4eSeschrock 	} else {
166499653d4eSeschrock 		/*
166599653d4eSeschrock 		 * Active hot spares can only be replaced by inactive hot
166699653d4eSeschrock 		 * spares.
166799653d4eSeschrock 		 */
166899653d4eSeschrock 		if (pvd->vdev_ops == &vdev_spare_ops &&
166999653d4eSeschrock 		    pvd->vdev_child[1] == oldvd &&
167099653d4eSeschrock 		    !spa_has_spare(spa, newvd->vdev_guid))
167199653d4eSeschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
167299653d4eSeschrock 
167399653d4eSeschrock 		/*
167499653d4eSeschrock 		 * If the source is a hot spare, and the parent isn't already a
167599653d4eSeschrock 		 * spare, then we want to create a new hot spare.  Otherwise, we
1676*39c23413Seschrock 		 * want to create a replacing vdev.  The user is not allowed to
1677*39c23413Seschrock 		 * attach to a spared vdev child unless the 'isspare' state is
1678*39c23413Seschrock 		 * the same (spare replaces spare, non-spare replaces
1679*39c23413Seschrock 		 * non-spare).
168099653d4eSeschrock 		 */
168199653d4eSeschrock 		if (pvd->vdev_ops == &vdev_replacing_ops)
168299653d4eSeschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1683*39c23413Seschrock 		else if (pvd->vdev_ops == &vdev_spare_ops &&
1684*39c23413Seschrock 		    newvd->vdev_isspare != oldvd->vdev_isspare)
1685*39c23413Seschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
168699653d4eSeschrock 		else if (pvd->vdev_ops != &vdev_spare_ops &&
168799653d4eSeschrock 		    newvd->vdev_isspare)
168899653d4eSeschrock 			pvops = &vdev_spare_ops;
168999653d4eSeschrock 		else
169099653d4eSeschrock 			pvops = &vdev_replacing_ops;
169199653d4eSeschrock 	}
169299653d4eSeschrock 
16932a79c5feSlling 	/*
16942a79c5feSlling 	 * Compare the new device size with the replaceable/attachable
16952a79c5feSlling 	 * device size.
16962a79c5feSlling 	 */
16972a79c5feSlling 	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
1698fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
1699fa9e4066Sahrens 
1700ecc2d604Sbonwick 	/*
1701ecc2d604Sbonwick 	 * The new device cannot have a higher alignment requirement
1702ecc2d604Sbonwick 	 * than the top-level vdev.
1703ecc2d604Sbonwick 	 */
1704ecc2d604Sbonwick 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
1705fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
1706fa9e4066Sahrens 
1707fa9e4066Sahrens 	/*
1708fa9e4066Sahrens 	 * If this is an in-place replacement, update oldvd's path and devid
1709fa9e4066Sahrens 	 * to make it distinguishable from newvd, and unopenable from now on.
1710fa9e4066Sahrens 	 */
1711fa9e4066Sahrens 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
1712fa9e4066Sahrens 		spa_strfree(oldvd->vdev_path);
1713fa9e4066Sahrens 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
1714fa9e4066Sahrens 		    KM_SLEEP);
1715fa9e4066Sahrens 		(void) sprintf(oldvd->vdev_path, "%s/%s",
1716fa9e4066Sahrens 		    newvd->vdev_path, "old");
1717fa9e4066Sahrens 		if (oldvd->vdev_devid != NULL) {
1718fa9e4066Sahrens 			spa_strfree(oldvd->vdev_devid);
1719fa9e4066Sahrens 			oldvd->vdev_devid = NULL;
1720fa9e4066Sahrens 		}
1721fa9e4066Sahrens 	}
1722fa9e4066Sahrens 
1723fa9e4066Sahrens 	/*
172499653d4eSeschrock 	 * If the parent is not a mirror, or if we're replacing, insert the new
172599653d4eSeschrock 	 * mirror/replacing/spare vdev above oldvd.
1726fa9e4066Sahrens 	 */
1727fa9e4066Sahrens 	if (pvd->vdev_ops != pvops)
1728fa9e4066Sahrens 		pvd = vdev_add_parent(oldvd, pvops);
1729fa9e4066Sahrens 
1730fa9e4066Sahrens 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
1731fa9e4066Sahrens 	ASSERT(pvd->vdev_ops == pvops);
1732fa9e4066Sahrens 	ASSERT(oldvd->vdev_parent == pvd);
1733fa9e4066Sahrens 
1734fa9e4066Sahrens 	/*
1735fa9e4066Sahrens 	 * Extract the new device from its root and add it to pvd.
1736fa9e4066Sahrens 	 */
1737fa9e4066Sahrens 	vdev_remove_child(newrootvd, newvd);
1738fa9e4066Sahrens 	newvd->vdev_id = pvd->vdev_children;
1739fa9e4066Sahrens 	vdev_add_child(pvd, newvd);
1740fa9e4066Sahrens 
1741ea8dc4b6Seschrock 	/*
1742ea8dc4b6Seschrock 	 * If newvd is smaller than oldvd, but larger than its rsize,
1743ea8dc4b6Seschrock 	 * the addition of newvd may have decreased our parent's asize.
1744ea8dc4b6Seschrock 	 */
1745ea8dc4b6Seschrock 	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
1746ea8dc4b6Seschrock 
1747fa9e4066Sahrens 	tvd = newvd->vdev_top;
1748fa9e4066Sahrens 	ASSERT(pvd->vdev_top == tvd);
1749fa9e4066Sahrens 	ASSERT(tvd->vdev_parent == rvd);
1750fa9e4066Sahrens 
1751fa9e4066Sahrens 	vdev_config_dirty(tvd);
1752fa9e4066Sahrens 
1753fa9e4066Sahrens 	/*
1754fa9e4066Sahrens 	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
1755fa9e4066Sahrens 	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
1756fa9e4066Sahrens 	 */
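	/*
	 * At most TXG_CONCURRENT_STATES txgs can be in flight at once,
	 * so open_txg bounds the last txg that may still have been open
	 * while this attach was pending.
	 */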
1757fa9e4066Sahrens 	open_txg = txg + TXG_CONCURRENT_STATES - 1;
1758fa9e4066Sahrens 
1759fa9e4066Sahrens 	mutex_enter(&newvd->vdev_dtl_lock);
1760fa9e4066Sahrens 	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
1761fa9e4066Sahrens 	    open_txg - TXG_INITIAL + 1);
1762fa9e4066Sahrens 	mutex_exit(&newvd->vdev_dtl_lock);
1763fa9e4066Sahrens 
1764*39c23413Seschrock 	if (newvd->vdev_isspare)
1765*39c23413Seschrock 		spa_spare_activate(newvd);
1766ea8dc4b6Seschrock 
1767fa9e4066Sahrens 	/*
1768fa9e4066Sahrens 	 * Mark newvd's DTL dirty in this txg.
1769fa9e4066Sahrens 	 */
1770ecc2d604Sbonwick 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
1771fa9e4066Sahrens 
1772fa9e4066Sahrens 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
1773fa9e4066Sahrens 
1774fa9e4066Sahrens 	/*
1775fa9e4066Sahrens 	 * Kick off a resilver to update newvd.
1776fa9e4066Sahrens 	 */
1777fa9e4066Sahrens 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1778fa9e4066Sahrens 
1779fa9e4066Sahrens 	return (0);
1780fa9e4066Sahrens }
1781fa9e4066Sahrens 
1782fa9e4066Sahrens /*
1783fa9e4066Sahrens  * Detach a device from a mirror or replacing vdev.
1784fa9e4066Sahrens  * If 'replace_done' is specified, only detach if the parent
1785fa9e4066Sahrens  * is a replacing vdev.
1786fa9e4066Sahrens  */
1787fa9e4066Sahrens int
1788ea8dc4b6Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
1789fa9e4066Sahrens {
1790fa9e4066Sahrens 	uint64_t txg;
1791fa9e4066Sahrens 	int c, t, error;
1792fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
1793fa9e4066Sahrens 	vdev_t *vd, *pvd, *cvd, *tvd;
179499653d4eSeschrock 	boolean_t unspare = B_FALSE;
179599653d4eSeschrock 	uint64_t unspare_guid;
1796fa9e4066Sahrens 
1797fa9e4066Sahrens 	txg = spa_vdev_enter(spa);
1798fa9e4066Sahrens 
1799ea8dc4b6Seschrock 	vd = vdev_lookup_by_guid(rvd, guid);
1800fa9e4066Sahrens 
1801fa9e4066Sahrens 	if (vd == NULL)
1802fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1803fa9e4066Sahrens 
18040e34b6a7Sbonwick 	if (!vd->vdev_ops->vdev_op_leaf)
18050e34b6a7Sbonwick 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
18060e34b6a7Sbonwick 
1807fa9e4066Sahrens 	pvd = vd->vdev_parent;
1808fa9e4066Sahrens 
1809fa9e4066Sahrens 	/*
1810fa9e4066Sahrens 	 * If replace_done is specified, only remove this device if it's
181199653d4eSeschrock 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
181299653d4eSeschrock 	 * disk can be removed.
181399653d4eSeschrock 	 */
181499653d4eSeschrock 	if (replace_done) {
181599653d4eSeschrock 		if (pvd->vdev_ops == &vdev_replacing_ops) {
181699653d4eSeschrock 			if (vd->vdev_id != 0)
181799653d4eSeschrock 				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
181899653d4eSeschrock 		} else if (pvd->vdev_ops != &vdev_spare_ops) {
181999653d4eSeschrock 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
182099653d4eSeschrock 		}
182199653d4eSeschrock 	}
182299653d4eSeschrock 
182399653d4eSeschrock 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
182499653d4eSeschrock 	    spa_version(spa) >= ZFS_VERSION_SPARES);
1825fa9e4066Sahrens 
1826fa9e4066Sahrens 	/*
182799653d4eSeschrock 	 * Only mirror, replacing, and spare vdevs support detach.
1828fa9e4066Sahrens 	 */
1829fa9e4066Sahrens 	if (pvd->vdev_ops != &vdev_replacing_ops &&
183099653d4eSeschrock 	    pvd->vdev_ops != &vdev_mirror_ops &&
183199653d4eSeschrock 	    pvd->vdev_ops != &vdev_spare_ops)
1832fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1833fa9e4066Sahrens 
1834fa9e4066Sahrens 	/*
1835fa9e4066Sahrens 	 * If there's only one replica, you can't detach it.
1836fa9e4066Sahrens 	 */
1837fa9e4066Sahrens 	if (pvd->vdev_children <= 1)
1838fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1839fa9e4066Sahrens 
1840fa9e4066Sahrens 	/*
1841fa9e4066Sahrens 	 * If all siblings have non-empty DTLs, this device may have the only
1842fa9e4066Sahrens 	 * valid copy of the data, which means we cannot safely detach it.
1843fa9e4066Sahrens 	 *
1844fa9e4066Sahrens 	 * XXX -- as in the vdev_offline() case, we really want a more
1845fa9e4066Sahrens 	 * precise DTL check.
1846fa9e4066Sahrens 	 */
1847fa9e4066Sahrens 	for (c = 0; c < pvd->vdev_children; c++) {
1848fa9e4066Sahrens 		uint64_t dirty;
1849fa9e4066Sahrens 
1850fa9e4066Sahrens 		cvd = pvd->vdev_child[c];
1851fa9e4066Sahrens 		if (cvd == vd)
1852fa9e4066Sahrens 			continue;
1853fa9e4066Sahrens 		if (vdev_is_dead(cvd))
1854fa9e4066Sahrens 			continue;
1855fa9e4066Sahrens 		mutex_enter(&cvd->vdev_dtl_lock);
1856fa9e4066Sahrens 		dirty = cvd->vdev_dtl_map.sm_space |
1857fa9e4066Sahrens 		    cvd->vdev_dtl_scrub.sm_space;
1858fa9e4066Sahrens 		mutex_exit(&cvd->vdev_dtl_lock);
1859fa9e4066Sahrens 		if (!dirty)
1860fa9e4066Sahrens 			break;
1861fa9e4066Sahrens 	}
186299653d4eSeschrock 
186399653d4eSeschrock 	/*
186499653d4eSeschrock 	 * If we are a replacing or spare vdev, then we can always detach the
186599653d4eSeschrock 	 * latter child, as that is how one cancels the operation.
186699653d4eSeschrock 	 */
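	/*
	 * Otherwise, fail if the loop above found no sibling with a
	 * complete copy of the data (c == pvd->vdev_children).
	 */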
186799653d4eSeschrock 	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
186899653d4eSeschrock 	    c == pvd->vdev_children)
1869fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1870fa9e4066Sahrens 
187199653d4eSeschrock 	/*
187299653d4eSeschrock 	 * If we are detaching the original disk from a spare, then it implies
187399653d4eSeschrock 	 * that the spare should become a real disk, and be removed from the
187499653d4eSeschrock 	 * active spare list for the pool.
187599653d4eSeschrock 	 */
187699653d4eSeschrock 	if (pvd->vdev_ops == &vdev_spare_ops &&
187799653d4eSeschrock 	    vd->vdev_id == 0)
187899653d4eSeschrock 		unspare = B_TRUE;
187999653d4eSeschrock 
1880fa9e4066Sahrens 	/*
1881fa9e4066Sahrens 	 * Erase the disk labels so the disk can be used for other things.
1882fa9e4066Sahrens 	 * This must be done after all other error cases are handled,
1883fa9e4066Sahrens 	 * but before we disembowel vd (so we can still do I/O to it).
1884fa9e4066Sahrens 	 * But if we can't do it, don't treat the error as fatal --
1885fa9e4066Sahrens 	 * it may be that the unwritability of the disk is the reason
1886fa9e4066Sahrens 	 * it's being detached!
1887fa9e4066Sahrens 	 */
1888*39c23413Seschrock 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
1889fa9e4066Sahrens 
1890fa9e4066Sahrens 	/*
1891fa9e4066Sahrens 	 * Remove vd from its parent and compact the parent's children.
1892fa9e4066Sahrens 	 */
1893fa9e4066Sahrens 	vdev_remove_child(pvd, vd);
1894fa9e4066Sahrens 	vdev_compact_children(pvd);
1895fa9e4066Sahrens 
1896fa9e4066Sahrens 	/*
1897fa9e4066Sahrens 	 * Remember one of the remaining children so we can get tvd below.
1898fa9e4066Sahrens 	 */
1899fa9e4066Sahrens 	cvd = pvd->vdev_child[0];
1900fa9e4066Sahrens 
190199653d4eSeschrock 	/*
190299653d4eSeschrock 	 * If we need to remove the remaining child from the list of hot spares,
190399653d4eSeschrock 	 * do it now, marking the vdev as no longer a spare in the process.  We
190499653d4eSeschrock 	 * must do this before vdev_remove_parent(), because that can change the
190599653d4eSeschrock 	 * GUID if it creates a new toplevel GUID.
190699653d4eSeschrock 	 */
190799653d4eSeschrock 	if (unspare) {
190899653d4eSeschrock 		ASSERT(cvd->vdev_isspare);
1909*39c23413Seschrock 		spa_spare_remove(cvd);
191099653d4eSeschrock 		unspare_guid = cvd->vdev_guid;
191199653d4eSeschrock 	}
191299653d4eSeschrock 
1913fa9e4066Sahrens 	/*
1914fa9e4066Sahrens 	 * If the parent mirror/replacing vdev only has one child,
1915fa9e4066Sahrens 	 * the parent is no longer needed.  Remove it from the tree.
1916fa9e4066Sahrens 	 */
1917fa9e4066Sahrens 	if (pvd->vdev_children == 1)
1918fa9e4066Sahrens 		vdev_remove_parent(cvd);
1919fa9e4066Sahrens 
1920fa9e4066Sahrens 	/*
1921fa9e4066Sahrens 	 * We don't set tvd until now because the parent we just removed
1922fa9e4066Sahrens 	 * may have been the previous top-level vdev.
1923fa9e4066Sahrens 	 */
1924fa9e4066Sahrens 	tvd = cvd->vdev_top;
1925fa9e4066Sahrens 	ASSERT(tvd->vdev_parent == rvd);
1926fa9e4066Sahrens 
1927fa9e4066Sahrens 	/*
1928*39c23413Seschrock 	 * Reevaluate the parent vdev state.
1929fa9e4066Sahrens 	 */
1930*39c23413Seschrock 	vdev_propagate_state(cvd->vdev_parent);
1931fa9e4066Sahrens 
1932fa9e4066Sahrens 	/*
1933*39c23413Seschrock 	 * If the device we just detached was smaller than the others, it may be
1934*39c23413Seschrock 	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
1935*39c23413Seschrock 	 * can't fail because the existing metaslabs are already in core, so
1936*39c23413Seschrock 	 * there's nothing to read from disk.
1937fa9e4066Sahrens 	 */
1938ecc2d604Sbonwick 	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
1939fa9e4066Sahrens 
1940fa9e4066Sahrens 	vdev_config_dirty(tvd);
1941fa9e4066Sahrens 
1942fa9e4066Sahrens 	/*
1943*39c23413Seschrock 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
1944*39c23413Seschrock 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
1945*39c23413Seschrock 	 * But first make sure we're not on any *other* txg's DTL list, to
1946*39c23413Seschrock 	 * prevent vd from being accessed after it's freed.
1947fa9e4066Sahrens 	 */
1948fa9e4066Sahrens 	for (t = 0; t < TXG_SIZE; t++)
1949fa9e4066Sahrens 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
1950ecc2d604Sbonwick 	vd->vdev_detached = B_TRUE;
1951ecc2d604Sbonwick 	vdev_dirty(tvd, VDD_DTL, vd, txg);
1952fa9e4066Sahrens 
195399653d4eSeschrock 	error = spa_vdev_exit(spa, vd, txg, 0);
195499653d4eSeschrock 
195599653d4eSeschrock 	/*
1956*39c23413Seschrock 	 * If this was the removal of the original device in a hot spare vdev,
1957*39c23413Seschrock 	 * then we want to go through and remove the device from the hot spare
1958*39c23413Seschrock 	 * list of every other pool.
195999653d4eSeschrock 	 */
196099653d4eSeschrock 	if (unspare) {
196199653d4eSeschrock 		spa = NULL;
196299653d4eSeschrock 		mutex_enter(&spa_namespace_lock);
196399653d4eSeschrock 		while ((spa = spa_next(spa)) != NULL) {
196499653d4eSeschrock 			if (spa->spa_state != POOL_STATE_ACTIVE)
196599653d4eSeschrock 				continue;
196699653d4eSeschrock 
196799653d4eSeschrock 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
196899653d4eSeschrock 		}
196999653d4eSeschrock 		mutex_exit(&spa_namespace_lock);
197099653d4eSeschrock 	}
197199653d4eSeschrock 
197299653d4eSeschrock 	return (error);
197399653d4eSeschrock }
197499653d4eSeschrock 
197599653d4eSeschrock /*
197699653d4eSeschrock  * Remove a device from the pool.  Currently, this supports removing only hot
197799653d4eSeschrock  * spares.
197899653d4eSeschrock  */
197999653d4eSeschrock int
198099653d4eSeschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
198199653d4eSeschrock {
198299653d4eSeschrock 	vdev_t *vd;
198399653d4eSeschrock 	nvlist_t **spares, *nv, **newspares;
198499653d4eSeschrock 	uint_t i, j, nspares;
198599653d4eSeschrock 	int ret = 0;
198699653d4eSeschrock 
198799653d4eSeschrock 	spa_config_enter(spa, RW_WRITER, FTAG);
198899653d4eSeschrock 
198999653d4eSeschrock 	vd = spa_lookup_by_guid(spa, guid);
199099653d4eSeschrock 
199199653d4eSeschrock 	nv = NULL;
199299653d4eSeschrock 	if (spa->spa_spares != NULL &&
199399653d4eSeschrock 	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
199499653d4eSeschrock 	    &spares, &nspares) == 0) {
199599653d4eSeschrock 		for (i = 0; i < nspares; i++) {
199699653d4eSeschrock 			uint64_t theguid;
199799653d4eSeschrock 
199899653d4eSeschrock 			VERIFY(nvlist_lookup_uint64(spares[i],
199999653d4eSeschrock 			    ZPOOL_CONFIG_GUID, &theguid) == 0);
200099653d4eSeschrock 			if (theguid == guid) {
200199653d4eSeschrock 				nv = spares[i];
200299653d4eSeschrock 				break;
200399653d4eSeschrock 			}
200499653d4eSeschrock 		}
200599653d4eSeschrock 	}
200699653d4eSeschrock 
200799653d4eSeschrock 	/*
200899653d4eSeschrock 	 * We only support removing a hot spare, and only if it's not currently
200999653d4eSeschrock 	 * in use in this pool.
201099653d4eSeschrock 	 */
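	/* The device is neither in the spare list nor in the pool config. */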
201199653d4eSeschrock 	if (nv == NULL && vd == NULL) {
201299653d4eSeschrock 		ret = ENOENT;
201399653d4eSeschrock 		goto out;
201499653d4eSeschrock 	}
201599653d4eSeschrock 
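	/* The device is part of the pool but is not a hot spare. */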
201699653d4eSeschrock 	if (nv == NULL && vd != NULL) {
201799653d4eSeschrock 		ret = ENOTSUP;
201899653d4eSeschrock 		goto out;
201999653d4eSeschrock 	}
202099653d4eSeschrock 
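	/* The spare is currently in use; only an unspare may remove it. */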
202199653d4eSeschrock 	if (!unspare && nv != NULL && vd != NULL) {
202299653d4eSeschrock 		ret = EBUSY;
202399653d4eSeschrock 		goto out;
202499653d4eSeschrock 	}
202599653d4eSeschrock 
202699653d4eSeschrock 	if (nspares == 1) {
202799653d4eSeschrock 		newspares = NULL;
202899653d4eSeschrock 	} else {
202999653d4eSeschrock 		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
203099653d4eSeschrock 		    KM_SLEEP);
203199653d4eSeschrock 		for (i = 0, j = 0; i < nspares; i++) {
203299653d4eSeschrock 			if (spares[i] != nv)
203399653d4eSeschrock 				VERIFY(nvlist_dup(spares[i],
203499653d4eSeschrock 				    &newspares[j++], KM_SLEEP) == 0);
203599653d4eSeschrock 		}
203699653d4eSeschrock 	}
203799653d4eSeschrock 
203899653d4eSeschrock 	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
203999653d4eSeschrock 	    DATA_TYPE_NVLIST_ARRAY) == 0);
204099653d4eSeschrock 	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
204199653d4eSeschrock 	    newspares, nspares - 1) == 0);
204299653d4eSeschrock 	for (i = 0; i < nspares - 1; i++)
204399653d4eSeschrock 		nvlist_free(newspares[i]);
204499653d4eSeschrock 	kmem_free(newspares, (nspares - 1) * sizeof (void *));
204599653d4eSeschrock 	spa_load_spares(spa);
204699653d4eSeschrock 	spa->spa_sync_spares = B_TRUE;
204799653d4eSeschrock 
204899653d4eSeschrock out:
204999653d4eSeschrock 	spa_config_exit(spa, FTAG);
205099653d4eSeschrock 
205199653d4eSeschrock 	return (ret);
2052fa9e4066Sahrens }
2053fa9e4066Sahrens 
2054fa9e4066Sahrens /*
2055ea8dc4b6Seschrock  * Find any device that's done replacing, so we can detach it.
2056fa9e4066Sahrens  */
2057ea8dc4b6Seschrock static vdev_t *
2058ea8dc4b6Seschrock spa_vdev_replace_done_hunt(vdev_t *vd)
2059fa9e4066Sahrens {
2060ea8dc4b6Seschrock 	vdev_t *newvd, *oldvd;
2061fa9e4066Sahrens 	int c;
2062fa9e4066Sahrens 
2063ea8dc4b6Seschrock 	for (c = 0; c < vd->vdev_children; c++) {
2064ea8dc4b6Seschrock 		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
2065ea8dc4b6Seschrock 		if (oldvd != NULL)
2066ea8dc4b6Seschrock 			return (oldvd);
2067ea8dc4b6Seschrock 	}
2068fa9e4066Sahrens 
2069fa9e4066Sahrens 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
2070ea8dc4b6Seschrock 		oldvd = vd->vdev_child[0];
2071ea8dc4b6Seschrock 		newvd = vd->vdev_child[1];
2072ea8dc4b6Seschrock 
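		/*
		 * The replacement is finished once the new device has
		 * nothing left to resilver: both its DTL map and its
		 * scrub DTL are empty.
		 */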
2073ea8dc4b6Seschrock 		mutex_enter(&newvd->vdev_dtl_lock);
2074ea8dc4b6Seschrock 		if (newvd->vdev_dtl_map.sm_space == 0 &&
2075ea8dc4b6Seschrock 		    newvd->vdev_dtl_scrub.sm_space == 0) {
2076ea8dc4b6Seschrock 			mutex_exit(&newvd->vdev_dtl_lock);
2077ea8dc4b6Seschrock 			return (oldvd);
2078fa9e4066Sahrens 		}
2079ea8dc4b6Seschrock 		mutex_exit(&newvd->vdev_dtl_lock);
2080fa9e4066Sahrens 	}
2081ea8dc4b6Seschrock 
2082ea8dc4b6Seschrock 	return (NULL);
2083fa9e4066Sahrens }
2084fa9e4066Sahrens 
2085ea8dc4b6Seschrock static void
2086fa9e4066Sahrens spa_vdev_replace_done(spa_t *spa)
2087fa9e4066Sahrens {
2088ea8dc4b6Seschrock 	vdev_t *vd;
208999653d4eSeschrock 	vdev_t *pvd;
2090ea8dc4b6Seschrock 	uint64_t guid;
209199653d4eSeschrock 	uint64_t pguid = 0;
2092ea8dc4b6Seschrock 
2093ea8dc4b6Seschrock 	spa_config_enter(spa, RW_READER, FTAG);
2094ea8dc4b6Seschrock 
2095ea8dc4b6Seschrock 	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
2096ea8dc4b6Seschrock 		guid = vd->vdev_guid;
209799653d4eSeschrock 		/*
209899653d4eSeschrock 		 * If we have just finished replacing a hot spared device, then
209999653d4eSeschrock 		 * we need to detach the hot spare (the second child of the
210099653d4eSeschrock 		 * grandparent spare vdev) as well.
210199653d4eSeschrock 		 */
210299653d4eSeschrock 		pvd = vd->vdev_parent;
210399653d4eSeschrock 		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
210499653d4eSeschrock 		    pvd->vdev_id == 0) {
210599653d4eSeschrock 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
210699653d4eSeschrock 			ASSERT(pvd->vdev_parent->vdev_children == 2);
210799653d4eSeschrock 			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
210899653d4eSeschrock 		}
2109ea8dc4b6Seschrock 		spa_config_exit(spa, FTAG);
2110ea8dc4b6Seschrock 		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
2111ea8dc4b6Seschrock 			return;
211299653d4eSeschrock 		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
211399653d4eSeschrock 			return;
2114ea8dc4b6Seschrock 		spa_config_enter(spa, RW_READER, FTAG);
2115fa9e4066Sahrens 	}
2116fa9e4066Sahrens 
2117ea8dc4b6Seschrock 	spa_config_exit(spa, FTAG);
2118fa9e4066Sahrens }
2119fa9e4066Sahrens 
2120c67d9675Seschrock /*
2121c67d9675Seschrock  * Update the stored path for this vdev.  Dirty the vdev configuration, relying
2122c67d9675Seschrock  * on spa_vdev_enter/exit() to synchronize the labels and cache.
2123c67d9675Seschrock  */
2124c67d9675Seschrock int
2125c67d9675Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
2126c67d9675Seschrock {
2127c67d9675Seschrock 	vdev_t *rvd, *vd;
2128c67d9675Seschrock 	uint64_t txg;
2129c67d9675Seschrock 
2130c67d9675Seschrock 	rvd = spa->spa_root_vdev;
2131c67d9675Seschrock 
2132c67d9675Seschrock 	txg = spa_vdev_enter(spa);
2133c67d9675Seschrock 
213499653d4eSeschrock 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
213599653d4eSeschrock 		/*
213699653d4eSeschrock 		 * Determine if this is a reference to a hot spare.  In that
213799653d4eSeschrock 		 * case, update the path as stored in the spare list.
213899653d4eSeschrock 		 */
213999653d4eSeschrock 		nvlist_t **spares;
214099653d4eSeschrock 		uint_t i, nspares;
214199653d4eSeschrock 		if (spa->spa_sparelist != NULL) {
214299653d4eSeschrock 			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
214399653d4eSeschrock 			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
214499653d4eSeschrock 			for (i = 0; i < nspares; i++) {
214599653d4eSeschrock 				uint64_t theguid;
214699653d4eSeschrock 				VERIFY(nvlist_lookup_uint64(spares[i],
214799653d4eSeschrock 				    ZPOOL_CONFIG_GUID, &theguid) == 0);
214899653d4eSeschrock 				if (theguid == guid)
214999653d4eSeschrock 					break;
215099653d4eSeschrock 			}
215199653d4eSeschrock 
215299653d4eSeschrock 			if (i == nspares)
215399653d4eSeschrock 				return (spa_vdev_exit(spa, NULL, txg, ENOENT));
215499653d4eSeschrock 
215599653d4eSeschrock 			VERIFY(nvlist_add_string(spares[i],
215699653d4eSeschrock 			    ZPOOL_CONFIG_PATH, newpath) == 0);
215799653d4eSeschrock 			spa_load_spares(spa);
215899653d4eSeschrock 			spa->spa_sync_spares = B_TRUE;
215999653d4eSeschrock 			return (spa_vdev_exit(spa, NULL, txg, 0));
216099653d4eSeschrock 		} else {
216199653d4eSeschrock 			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
216299653d4eSeschrock 		}
216399653d4eSeschrock 	}
2164c67d9675Seschrock 
21650e34b6a7Sbonwick 	if (!vd->vdev_ops->vdev_op_leaf)
21660e34b6a7Sbonwick 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
21670e34b6a7Sbonwick 
2168c67d9675Seschrock 	spa_strfree(vd->vdev_path);
2169c67d9675Seschrock 	vd->vdev_path = spa_strdup(newpath);
2170c67d9675Seschrock 
2171c67d9675Seschrock 	vdev_config_dirty(vd->vdev_top);
2172c67d9675Seschrock 
2173c67d9675Seschrock 	return (spa_vdev_exit(spa, NULL, txg, 0));
2174c67d9675Seschrock }
2175c67d9675Seschrock 
2176fa9e4066Sahrens /*
2177fa9e4066Sahrens  * ==========================================================================
2178fa9e4066Sahrens  * SPA Scrubbing
2179fa9e4066Sahrens  * ==========================================================================
2180fa9e4066Sahrens  */
2181fa9e4066Sahrens 
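/*
 * Adjust the count of throttled scrub I/Os by 'direction'; once the count
 * drains to zero, wake anyone waiting on spa_scrub_io_cv.
 */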
2182ea8dc4b6Seschrock void
2183ea8dc4b6Seschrock spa_scrub_throttle(spa_t *spa, int direction)
2184ea8dc4b6Seschrock {
2185ea8dc4b6Seschrock 	mutex_enter(&spa->spa_scrub_lock);
2186ea8dc4b6Seschrock 	spa->spa_scrub_throttled += direction;
2187ea8dc4b6Seschrock 	ASSERT(spa->spa_scrub_throttled >= 0);
2188ea8dc4b6Seschrock 	if (spa->spa_scrub_throttled == 0)
2189ea8dc4b6Seschrock 		cv_broadcast(&spa->spa_scrub_io_cv);
2190ea8dc4b6Seschrock 	mutex_exit(&spa->spa_scrub_lock);
2191ea8dc4b6Seschrock }
2192fa9e4066Sahrens 
2193fa9e4066Sahrens static void
2194fa9e4066Sahrens spa_scrub_io_done(zio_t *zio)
2195fa9e4066Sahrens {
2196fa9e4066Sahrens 	spa_t *spa = zio->io_spa;
2197fa9e4066Sahrens 
2198ad23a2dbSjohansen 	zio_data_buf_free(zio->io_data, zio->io_size);
2199fa9e4066Sahrens 
2200fa9e4066Sahrens 	mutex_enter(&spa->spa_scrub_lock);
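	/*
	 * Count the error against the faulting vdev when known, otherwise
	 * against the pool as a whole (the root vdev).  Errors on
	 * speculative I/Os, such as intent log blocks, are not counted,
	 * since those blocks may legitimately be unreadable.
	 */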
2201ea8dc4b6Seschrock 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
220244cd46caSbillm 		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
2203ea8dc4b6Seschrock 		spa->spa_scrub_errors++;
2204fa9e4066Sahrens 		mutex_enter(&vd->vdev_stat_lock);
2205fa9e4066Sahrens 		vd->vdev_stat.vs_scrub_errors++;
2206fa9e4066Sahrens 		mutex_exit(&vd->vdev_stat_lock);
2207fa9e4066Sahrens 	}
2208ea8dc4b6Seschrock 	if (--spa->spa_scrub_inflight == 0) {
2209ea8dc4b6Seschrock 		cv_broadcast(&spa->spa_scrub_io_cv);
2210ea8dc4b6Seschrock 		ASSERT(spa->spa_scrub_throttled == 0);
2211ea8dc4b6Seschrock 	}
2212ea8dc4b6Seschrock 	mutex_exit(&spa->spa_scrub_lock);
2213fa9e4066Sahrens }
2214fa9e4066Sahrens 
2215fa9e4066Sahrens static void
2216ea8dc4b6Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
2217ea8dc4b6Seschrock     zbookmark_t *zb)
2218fa9e4066Sahrens {
2219fa9e4066Sahrens 	size_t size = BP_GET_LSIZE(bp);
2220ad23a2dbSjohansen 	void *data = zio_data_buf_alloc(size);
2221fa9e4066Sahrens 
2222fa9e4066Sahrens 	mutex_enter(&spa->spa_scrub_lock);
2223fa9e4066Sahrens 	spa->spa_scrub_inflight++;
2224fa9e4066Sahrens 	mutex_exit(&spa->spa_scrub_lock);
2225fa9e4066Sahrens 
2226ea8dc4b6Seschrock 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
2227ea8dc4b6Seschrock 		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
2228ea8dc4b6Seschrock 
2229d80c45e0Sbonwick 	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
2230ea8dc4b6Seschrock 
2231fa9e4066Sahrens 	zio_nowait(zio_read(NULL, spa, bp, data, size,
2232ea8dc4b6Seschrock 	    spa_scrub_io_done, NULL, priority, flags, zb));
2233fa9e4066Sahrens }
2234fa9e4066Sahrens 
2235fa9e4066Sahrens /* ARGSUSED */
2236fa9e4066Sahrens static int
2237fa9e4066Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
2238fa9e4066Sahrens {
2239fa9e4066Sahrens 	blkptr_t *bp = &bc->bc_blkptr;
224044cd46caSbillm 	vdev_t *vd = spa->spa_root_vdev;
224144cd46caSbillm 	dva_t *dva = bp->blk_dva;
224244cd46caSbillm 	int needs_resilver = B_FALSE;
224344cd46caSbillm 	int d;
2244fa9e4066Sahrens 
224544cd46caSbillm 	if (bc->bc_errno) {
2246fa9e4066Sahrens 		/*
2247fa9e4066Sahrens 		 * We can't scrub this block, but we can continue to scrub
2248fa9e4066Sahrens 		 * the rest of the pool.  Note the error and move along.
2249fa9e4066Sahrens 		 */
2250fa9e4066Sahrens 		mutex_enter(&spa->spa_scrub_lock);
2251fa9e4066Sahrens 		spa->spa_scrub_errors++;
2252fa9e4066Sahrens 		mutex_exit(&spa->spa_scrub_lock);
2253fa9e4066Sahrens 
225444cd46caSbillm 		mutex_enter(&vd->vdev_stat_lock);
225544cd46caSbillm 		vd->vdev_stat.vs_scrub_errors++;
225644cd46caSbillm 		mutex_exit(&vd->vdev_stat_lock);
2257fa9e4066Sahrens 
2258fa9e4066Sahrens 		return (ERESTART);
2259fa9e4066Sahrens 	}
2260fa9e4066Sahrens 
2261fa9e4066Sahrens 	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
2262fa9e4066Sahrens 
226344cd46caSbillm 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
226444cd46caSbillm 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
2265fa9e4066Sahrens 
226644cd46caSbillm 		ASSERT(vd != NULL);
226744cd46caSbillm 
226844cd46caSbillm 		/*
226944cd46caSbillm 		 * Keep track of how much data we've examined so that
227044cd46caSbillm 		 * zpool(1M) status can make useful progress reports.
227144cd46caSbillm 		 */
227244cd46caSbillm 		mutex_enter(&vd->vdev_stat_lock);
227344cd46caSbillm 		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
227444cd46caSbillm 		mutex_exit(&vd->vdev_stat_lock);
227544cd46caSbillm 
227644cd46caSbillm 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
227744cd46caSbillm 			if (DVA_GET_GANG(&dva[d])) {
227844cd46caSbillm 				/*
227944cd46caSbillm 				 * Gang members may be spread across multiple
228044cd46caSbillm 				 * vdevs, so the best we can do is look at the
228144cd46caSbillm 				 * pool-wide DTL.
228244cd46caSbillm 				 * XXX -- it would be better to change our
228344cd46caSbillm 				 * allocation policy to ensure that this can't
228444cd46caSbillm 				 * happen.
228544cd46caSbillm 				 */
228644cd46caSbillm 				vd = spa->spa_root_vdev;
228744cd46caSbillm 			}
228844cd46caSbillm 			if (vdev_dtl_contains(&vd->vdev_dtl_map,
228944cd46caSbillm 			    bp->blk_birth, 1))
229044cd46caSbillm 				needs_resilver = B_TRUE;
2291fa9e4066Sahrens 		}
229244cd46caSbillm 	}
229344cd46caSbillm 
229444cd46caSbillm 	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
2295fa9e4066Sahrens 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
2296ea8dc4b6Seschrock 		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
229744cd46caSbillm 	else if (needs_resilver)
229844cd46caSbillm 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
229944cd46caSbillm 		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
2300fa9e4066Sahrens 
2301fa9e4066Sahrens 	return (0);
2302fa9e4066Sahrens }
2303fa9e4066Sahrens 
2304fa9e4066Sahrens static void
2305fa9e4066Sahrens spa_scrub_thread(spa_t *spa)
2306fa9e4066Sahrens {
2307fa9e4066Sahrens 	callb_cpr_t cprinfo;
2308fa9e4066Sahrens 	traverse_handle_t *th = spa->spa_scrub_th;
2309fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
2310fa9e4066Sahrens 	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
2311fa9e4066Sahrens 	int error = 0;
2312fa9e4066Sahrens 	boolean_t complete;
2313fa9e4066Sahrens 
2314fa9e4066Sahrens 	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
2315fa9e4066Sahrens 
2316f0aa80d4Sbonwick 	/*
2317f0aa80d4Sbonwick 	 * If we're restarting due to a snapshot create/delete,
2318f0aa80d4Sbonwick 	 * wait for that to complete.
2319f0aa80d4Sbonwick 	 */
2320f0aa80d4Sbonwick 	txg_wait_synced(spa_get_dsl(spa), 0);
2321f0aa80d4Sbonwick 
2322ea8dc4b6Seschrock 	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
2323ea8dc4b6Seschrock 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
2324ea8dc4b6Seschrock 	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
2325ea8dc4b6Seschrock 
2326ea8dc4b6Seschrock 	spa_config_enter(spa, RW_WRITER, FTAG);
2327ea8dc4b6Seschrock 	vdev_reopen(rvd);		/* purge all vdev caches */
2328fa9e4066Sahrens 	vdev_config_dirty(rvd);		/* rewrite all disk labels */
2329fa9e4066Sahrens 	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
2330ea8dc4b6Seschrock 	spa_config_exit(spa, FTAG);
2331fa9e4066Sahrens 
2332fa9e4066Sahrens 	mutex_enter(&spa->spa_scrub_lock);
2333fa9e4066Sahrens 	spa->spa_scrub_errors = 0;
2334fa9e4066Sahrens 	spa->spa_scrub_active = 1;
2335ea8dc4b6Seschrock 	ASSERT(spa->spa_scrub_inflight == 0);
2336ea8dc4b6Seschrock 	ASSERT(spa->spa_scrub_throttled == 0);
2337fa9e4066Sahrens 
2338fa9e4066Sahrens 	while (!spa->spa_scrub_stop) {
2339fa9e4066Sahrens 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2340ea8dc4b6Seschrock 		while (spa->spa_scrub_suspended) {
2341fa9e4066Sahrens 			spa->spa_scrub_active = 0;
2342fa9e4066Sahrens 			cv_broadcast(&spa->spa_scrub_cv);
2343fa9e4066Sahrens 			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2344fa9e4066Sahrens 			spa->spa_scrub_active = 1;
2345fa9e4066Sahrens 		}
2346fa9e4066Sahrens 		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
2347fa9e4066Sahrens 
2348fa9e4066Sahrens 		if (spa->spa_scrub_restart_txg != 0)
2349fa9e4066Sahrens 			break;
2350fa9e4066Sahrens 
2351fa9e4066Sahrens 		mutex_exit(&spa->spa_scrub_lock);
2352fa9e4066Sahrens 		error = traverse_more(th);
2353fa9e4066Sahrens 		mutex_enter(&spa->spa_scrub_lock);
2354fa9e4066Sahrens 		if (error != EAGAIN)
2355fa9e4066Sahrens 			break;
2356ea8dc4b6Seschrock 
2357ea8dc4b6Seschrock 		while (spa->spa_scrub_throttled > 0)
2358ea8dc4b6Seschrock 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2359fa9e4066Sahrens 	}
2360fa9e4066Sahrens 
2361fa9e4066Sahrens 	while (spa->spa_scrub_inflight)
2362fa9e4066Sahrens 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2363fa9e4066Sahrens 
23645dabedeeSbonwick 	spa->spa_scrub_active = 0;
23655dabedeeSbonwick 	cv_broadcast(&spa->spa_scrub_cv);
23665dabedeeSbonwick 
23675dabedeeSbonwick 	mutex_exit(&spa->spa_scrub_lock);
23685dabedeeSbonwick 
23695dabedeeSbonwick 	spa_config_enter(spa, RW_WRITER, FTAG);
23705dabedeeSbonwick 
23715dabedeeSbonwick 	mutex_enter(&spa->spa_scrub_lock);
23725dabedeeSbonwick 
23735dabedeeSbonwick 	/*
23745dabedeeSbonwick 	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
23755dabedeeSbonwick 	 * AND the spa config lock to synchronize with any config changes
23765dabedeeSbonwick 	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
23775dabedeeSbonwick 	 */
2378fa9e4066Sahrens 	if (spa->spa_scrub_restart_txg != 0)
2379fa9e4066Sahrens 		error = ERESTART;
2380fa9e4066Sahrens 
2381ea8dc4b6Seschrock 	if (spa->spa_scrub_stop)
2382ea8dc4b6Seschrock 		error = EINTR;
2383ea8dc4b6Seschrock 
2384fa9e4066Sahrens 	/*
2385ea8dc4b6Seschrock 	 * Even if there were uncorrectable errors, we consider the scrub
2386ea8dc4b6Seschrock 	 * completed.  The downside is that if there is a transient error during
2387ea8dc4b6Seschrock 	 * a resilver, we won't resilver the data properly to the target.  But
2388ea8dc4b6Seschrock 	 * if we marked such scrubs incomplete, then permanent damage (the more
2389ea8dc4b6Seschrock 	 * likely case) would make us resilver forever, which isn't acceptable.
2390ea8dc4b6Seschrock 	 * Since there is enough information for the user to know what has
2391ea8dc4b6Seschrock 	 * failed and why, this seems like a more tractable approach.
2392fa9e4066Sahrens 	 */
2393ea8dc4b6Seschrock 	complete = (error == 0);
2394fa9e4066Sahrens 
2395ea8dc4b6Seschrock 	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
2396ea8dc4b6Seschrock 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
2397fa9e4066Sahrens 	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
2398fa9e4066Sahrens 	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
2399fa9e4066Sahrens 
2400fa9e4066Sahrens 	mutex_exit(&spa->spa_scrub_lock);
2401fa9e4066Sahrens 
2402fa9e4066Sahrens 	/*
2403fa9e4066Sahrens 	 * If the scrub/resilver completed, update all DTLs to reflect this.
2404fa9e4066Sahrens 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
2405fa9e4066Sahrens 	 */
2406fa9e4066Sahrens 	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
2407fa9e4066Sahrens 	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
2408fa9e4066Sahrens 	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
2409ea8dc4b6Seschrock 	spa_errlog_rotate(spa);
24105dabedeeSbonwick 
2411ea8dc4b6Seschrock 	spa_config_exit(spa, FTAG);
2412fa9e4066Sahrens 
2413fa9e4066Sahrens 	mutex_enter(&spa->spa_scrub_lock);
2414fa9e4066Sahrens 
2415ea8dc4b6Seschrock 	/*
2416ea8dc4b6Seschrock 	 * We may have finished replacing a device.
2417ea8dc4b6Seschrock 	 * Let the async thread assess this and handle the detach.
2418ea8dc4b6Seschrock 	 */
2419ea8dc4b6Seschrock 	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
2420fa9e4066Sahrens 
2421fa9e4066Sahrens 	/*
2422fa9e4066Sahrens 	 * If we were told to restart, our final act is to start a new scrub.
2423fa9e4066Sahrens 	 */
2424fa9e4066Sahrens 	if (error == ERESTART)
2425ea8dc4b6Seschrock 		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
2426ea8dc4b6Seschrock 		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
2427fa9e4066Sahrens 
2428ea8dc4b6Seschrock 	spa->spa_scrub_type = POOL_SCRUB_NONE;
2429ea8dc4b6Seschrock 	spa->spa_scrub_active = 0;
2430ea8dc4b6Seschrock 	spa->spa_scrub_thread = NULL;
2431ea8dc4b6Seschrock 	cv_broadcast(&spa->spa_scrub_cv);
2432fa9e4066Sahrens 	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
2433fa9e4066Sahrens 	thread_exit();
2434fa9e4066Sahrens }
2435fa9e4066Sahrens 
2436fa9e4066Sahrens void
2437fa9e4066Sahrens spa_scrub_suspend(spa_t *spa)
2438fa9e4066Sahrens {
2439fa9e4066Sahrens 	mutex_enter(&spa->spa_scrub_lock);
2440ea8dc4b6Seschrock 	spa->spa_scrub_suspended++;
2441fa9e4066Sahrens 	while (spa->spa_scrub_active) {
2442fa9e4066Sahrens 		cv_broadcast(&spa->spa_scrub_cv);
2443fa9e4066Sahrens 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2444fa9e4066Sahrens 	}
2445fa9e4066Sahrens 	while (spa->spa_scrub_inflight)
2446fa9e4066Sahrens 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2447fa9e4066Sahrens 	mutex_exit(&spa->spa_scrub_lock);
2448fa9e4066Sahrens }
2449fa9e4066Sahrens 
2450fa9e4066Sahrens void
2451fa9e4066Sahrens spa_scrub_resume(spa_t *spa)
2452fa9e4066Sahrens {
2453fa9e4066Sahrens 	mutex_enter(&spa->spa_scrub_lock);
2454ea8dc4b6Seschrock 	ASSERT(spa->spa_scrub_suspended != 0);
2455ea8dc4b6Seschrock 	if (--spa->spa_scrub_suspended == 0)
2456fa9e4066Sahrens 		cv_broadcast(&spa->spa_scrub_cv);
2457fa9e4066Sahrens 	mutex_exit(&spa->spa_scrub_lock);
2458fa9e4066Sahrens }
2459fa9e4066Sahrens 
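/*
 * Illustrative sketch, not part of the original file: spa_scrub_suspend()
 * and spa_scrub_resume() nest via the spa_scrub_suspended count, so
 * independent callers can bracket their own critical sections without
 * coordinating with each other.  Suspend blocks until in-flight scrub I/O
 * drains; the scrub only proceeds again once the last suspender resumes.
 * The function name below is hypothetical.
 */
static void
example_nested_scrub_pause(spa_t *spa)
{
	spa_scrub_suspend(spa);		/* count 0 -> 1: scrub quiesces */
	spa_scrub_suspend(spa);		/* count 1 -> 2: already quiesced */
	/* ... work that must not race with scrub I/O ... */
	spa_scrub_resume(spa);		/* count 2 -> 1: scrub stays paused */
	spa_scrub_resume(spa);		/* count 1 -> 0: scrub may proceed */
}
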
2460fa9e4066Sahrens void
2461fa9e4066Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg)
2462fa9e4066Sahrens {
2463fa9e4066Sahrens 	/*
2464fa9e4066Sahrens 	 * Something happened (e.g. snapshot create/delete) that means
2465fa9e4066Sahrens 	 * we must restart any in-progress scrubs.  The scrub thread will
2466fa9e4066Sahrens 	 * see this txg, finish with ERESTART, and rebuild its itinerary.
2467fa9e4066Sahrens 	 */
2468fa9e4066Sahrens 	mutex_enter(&spa->spa_scrub_lock);
2469fa9e4066Sahrens 	spa->spa_scrub_restart_txg = txg;
2470fa9e4066Sahrens 	mutex_exit(&spa->spa_scrub_lock);
2471fa9e4066Sahrens }
2472fa9e4066Sahrens 
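/*
 * Illustrative sketch (hypothetical caller, not from this file): an
 * operation that invalidates the current scrub coverage records the txg
 * of the change; per the checks in spa_scrub_thread() above, the scrub
 * then finishes with ERESTART and a fresh scrub of the same type is
 * requested asynchronously.
 */
static void
example_invalidate_scrub(spa_t *spa, dmu_tx_t *tx)
{
	spa_scrub_restart(spa, dmu_tx_get_txg(tx));
}
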
2473ea8dc4b6Seschrock int
2474ea8dc4b6Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
2475fa9e4066Sahrens {
2476fa9e4066Sahrens 	space_seg_t *ss;
2477fa9e4066Sahrens 	uint64_t mintxg, maxtxg;
2478fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
2479fa9e4066Sahrens 
2480fa9e4066Sahrens 	if ((uint_t)type >= POOL_SCRUB_TYPES)
2481fa9e4066Sahrens 		return (ENOTSUP);
2482fa9e4066Sahrens 
2483ea8dc4b6Seschrock 	mutex_enter(&spa->spa_scrub_lock);
2484ea8dc4b6Seschrock 
2485fa9e4066Sahrens 	/*
2486fa9e4066Sahrens 	 * If there's a scrub or resilver already in progress, stop it.
2487fa9e4066Sahrens 	 */
2488fa9e4066Sahrens 	while (spa->spa_scrub_thread != NULL) {
2489fa9e4066Sahrens 		/*
2490fa9e4066Sahrens 		 * Don't stop a resilver unless forced.
2491fa9e4066Sahrens 		 */
2492ea8dc4b6Seschrock 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
2493ea8dc4b6Seschrock 			mutex_exit(&spa->spa_scrub_lock);
2494fa9e4066Sahrens 			return (EBUSY);
2495ea8dc4b6Seschrock 		}
2496fa9e4066Sahrens 		spa->spa_scrub_stop = 1;
2497fa9e4066Sahrens 		cv_broadcast(&spa->spa_scrub_cv);
2498fa9e4066Sahrens 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2499fa9e4066Sahrens 	}
2500fa9e4066Sahrens 
2501fa9e4066Sahrens 	/*
2502fa9e4066Sahrens 	 * Terminate the previous traverse.
2503fa9e4066Sahrens 	 */
2504fa9e4066Sahrens 	if (spa->spa_scrub_th != NULL) {
2505fa9e4066Sahrens 		traverse_fini(spa->spa_scrub_th);
2506fa9e4066Sahrens 		spa->spa_scrub_th = NULL;
2507fa9e4066Sahrens 	}
2508fa9e4066Sahrens 
2509ea8dc4b6Seschrock 	if (rvd == NULL) {
2510ea8dc4b6Seschrock 		ASSERT(spa->spa_scrub_stop == 0);
2511ea8dc4b6Seschrock 		ASSERT(spa->spa_scrub_type == type);
2512ea8dc4b6Seschrock 		ASSERT(spa->spa_scrub_restart_txg == 0);
2513ea8dc4b6Seschrock 		mutex_exit(&spa->spa_scrub_lock);
2514ea8dc4b6Seschrock 		return (0);
2515ea8dc4b6Seschrock 	}
2516fa9e4066Sahrens 
2517fa9e4066Sahrens 	mintxg = TXG_INITIAL - 1;
2518fa9e4066Sahrens 	maxtxg = spa_last_synced_txg(spa) + 1;
2519fa9e4066Sahrens 
2520ea8dc4b6Seschrock 	mutex_enter(&rvd->vdev_dtl_lock);
2521fa9e4066Sahrens 
2522ea8dc4b6Seschrock 	if (rvd->vdev_dtl_map.sm_space == 0) {
2523ea8dc4b6Seschrock 		/*
2524ea8dc4b6Seschrock 		 * The pool-wide DTL is empty.
2525ecc2d604Sbonwick 		 * If this is a resilver, there's nothing to do except
2526ecc2d604Sbonwick 		 * check whether any in-progress replacements have completed.
2527ea8dc4b6Seschrock 		 */
2528ecc2d604Sbonwick 		if (type == POOL_SCRUB_RESILVER) {
2529ea8dc4b6Seschrock 			type = POOL_SCRUB_NONE;
2530ecc2d604Sbonwick 			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
2531ecc2d604Sbonwick 		}
2532ea8dc4b6Seschrock 	} else {
2533ea8dc4b6Seschrock 		/*
2534ea8dc4b6Seschrock 		 * The pool-wide DTL is non-empty.
2535ea8dc4b6Seschrock 		 * If this is a normal scrub, upgrade to a resilver instead.
2536ea8dc4b6Seschrock 		 */
2537ea8dc4b6Seschrock 		if (type == POOL_SCRUB_EVERYTHING)
2538ea8dc4b6Seschrock 			type = POOL_SCRUB_RESILVER;
2539ea8dc4b6Seschrock 	}
2540fa9e4066Sahrens 
2541ea8dc4b6Seschrock 	if (type == POOL_SCRUB_RESILVER) {
2542fa9e4066Sahrens 		/*
2543fa9e4066Sahrens 		 * Determine the resilvering boundaries.
2544fa9e4066Sahrens 		 *
2545fa9e4066Sahrens 		 * Note: (mintxg, maxtxg) is an open interval,
2546fa9e4066Sahrens 		 * i.e. mintxg and maxtxg themselves are not included.
2547fa9e4066Sahrens 		 *
2548fa9e4066Sahrens 		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
2549fa9e4066Sahrens 		 * so we don't claim to resilver a txg that's still changing.
2550fa9e4066Sahrens 		 */
2551fa9e4066Sahrens 		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
2552ea8dc4b6Seschrock 		mintxg = ss->ss_start - 1;
2553fa9e4066Sahrens 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
2554ea8dc4b6Seschrock 		maxtxg = MIN(ss->ss_end, maxtxg);
2555fa9e4066Sahrens 	}
2556fa9e4066Sahrens 
2557ea8dc4b6Seschrock 	mutex_exit(&rvd->vdev_dtl_lock);
2558ea8dc4b6Seschrock 
2559ea8dc4b6Seschrock 	spa->spa_scrub_stop = 0;
2560ea8dc4b6Seschrock 	spa->spa_scrub_type = type;
2561ea8dc4b6Seschrock 	spa->spa_scrub_restart_txg = 0;
2562ea8dc4b6Seschrock 
2563ea8dc4b6Seschrock 	if (type != POOL_SCRUB_NONE) {
2564ea8dc4b6Seschrock 		spa->spa_scrub_mintxg = mintxg;
2565fa9e4066Sahrens 		spa->spa_scrub_maxtxg = maxtxg;
2566fa9e4066Sahrens 		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
25670373e76bSbonwick 		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
25680373e76bSbonwick 		    ZIO_FLAG_CANFAIL);
2569fa9e4066Sahrens 		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
2570fa9e4066Sahrens 		spa->spa_scrub_thread = thread_create(NULL, 0,
2571fa9e4066Sahrens 		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
2572fa9e4066Sahrens 	}
2573fa9e4066Sahrens 
2574ea8dc4b6Seschrock 	mutex_exit(&spa->spa_scrub_lock);
2575ea8dc4b6Seschrock 
2576fa9e4066Sahrens 	return (0);
2577fa9e4066Sahrens }
2578fa9e4066Sahrens 
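/*
 * Illustrative sketch (hypothetical caller, not from this file): starting
 * a full scrub.  If a resilver is already running, spa_scrub() returns
 * EBUSY unless force is B_TRUE, in which case the resilver is stopped and
 * the requested scrub takes its place.  Note that if the pool-wide DTL is
 * non-empty, the scrub is silently upgraded to a resilver anyway.
 */
static int
example_start_scrub(spa_t *spa)
{
	int error;

	error = spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_FALSE);
	if (error == EBUSY) {
		/* a resilver is running; preempt it only if we must */
		error = spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE);
	}
	return (error);
}
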
2579ea8dc4b6Seschrock /*
2580ea8dc4b6Seschrock  * ==========================================================================
2581ea8dc4b6Seschrock  * SPA async task processing
2582ea8dc4b6Seschrock  * ==========================================================================
2583ea8dc4b6Seschrock  */
2584ea8dc4b6Seschrock 
2585ea8dc4b6Seschrock static void
2586ea8dc4b6Seschrock spa_async_reopen(spa_t *spa)
2587fa9e4066Sahrens {
2588ea8dc4b6Seschrock 	vdev_t *rvd = spa->spa_root_vdev;
2589ea8dc4b6Seschrock 	vdev_t *tvd;
2590ea8dc4b6Seschrock 	int c;
2591fa9e4066Sahrens 
2592ea8dc4b6Seschrock 	spa_config_enter(spa, RW_WRITER, FTAG);
2593ea8dc4b6Seschrock 
2594ea8dc4b6Seschrock 	for (c = 0; c < rvd->vdev_children; c++) {
2595ea8dc4b6Seschrock 		tvd = rvd->vdev_child[c];
2596ea8dc4b6Seschrock 		if (tvd->vdev_reopen_wanted) {
2597ea8dc4b6Seschrock 			tvd->vdev_reopen_wanted = 0;
2598ea8dc4b6Seschrock 			vdev_reopen(tvd);
2599ea8dc4b6Seschrock 		}
2600ea8dc4b6Seschrock 	}
2601ea8dc4b6Seschrock 
2602ea8dc4b6Seschrock 	spa_config_exit(spa, FTAG);
2603ea8dc4b6Seschrock }
2604fa9e4066Sahrens 
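/*
 * Illustrative sketch (hypothetical, not from this file): code that
 * detects a device needing a fresh open marks the top-level vdev and
 * posts SPA_ASYNC_REOPEN; spa_async_reopen() above then performs the
 * actual reopen under the config lock.
 */
static void
example_flag_reopen(vdev_t *tvd)
{
	tvd->vdev_reopen_wanted = 1;
	spa_async_request(tvd->vdev_spa, SPA_ASYNC_REOPEN);
}
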
2605ea8dc4b6Seschrock static void
2606ea8dc4b6Seschrock spa_async_thread(spa_t *spa)
2607ea8dc4b6Seschrock {
2608ea8dc4b6Seschrock 	int tasks;
2609ea8dc4b6Seschrock 
2610ea8dc4b6Seschrock 	ASSERT(spa->spa_sync_on);
2611ea8dc4b6Seschrock 
2612ea8dc4b6Seschrock 	mutex_enter(&spa->spa_async_lock);
2613ea8dc4b6Seschrock 	tasks = spa->spa_async_tasks;
2614ea8dc4b6Seschrock 	spa->spa_async_tasks = 0;
2615ea8dc4b6Seschrock 	mutex_exit(&spa->spa_async_lock);
2616ea8dc4b6Seschrock 
26170373e76bSbonwick 	/*
26180373e76bSbonwick 	 * See if the config needs to be updated.
26190373e76bSbonwick 	 */
26200373e76bSbonwick 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
26210373e76bSbonwick 		mutex_enter(&spa_namespace_lock);
26220373e76bSbonwick 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
26230373e76bSbonwick 		mutex_exit(&spa_namespace_lock);
26240373e76bSbonwick 	}
26250373e76bSbonwick 
2626ea8dc4b6Seschrock 	/*
2627ea8dc4b6Seschrock 	 * See if any devices need to be reopened.
2628ea8dc4b6Seschrock 	 */
2629ea8dc4b6Seschrock 	if (tasks & SPA_ASYNC_REOPEN)
2630ea8dc4b6Seschrock 		spa_async_reopen(spa);
2631ea8dc4b6Seschrock 
2632ea8dc4b6Seschrock 	/*
2633ea8dc4b6Seschrock 	 * If any devices are done replacing, detach them.
2634ea8dc4b6Seschrock 	 */
2635ea8dc4b6Seschrock 	if (tasks & SPA_ASYNC_REPLACE_DONE)
2636fa9e4066Sahrens 		spa_vdev_replace_done(spa);
2637fa9e4066Sahrens 
2638ea8dc4b6Seschrock 	/*
2639ea8dc4b6Seschrock 	 * Kick off a scrub.
2640ea8dc4b6Seschrock 	 */
2641ea8dc4b6Seschrock 	if (tasks & SPA_ASYNC_SCRUB)
2642ea8dc4b6Seschrock 		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
2643ea8dc4b6Seschrock 
2644ea8dc4b6Seschrock 	/*
2645ea8dc4b6Seschrock 	 * Kick off a resilver.
2646ea8dc4b6Seschrock 	 */
2647ea8dc4b6Seschrock 	if (tasks & SPA_ASYNC_RESILVER)
2648ea8dc4b6Seschrock 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2649ea8dc4b6Seschrock 
2650ea8dc4b6Seschrock 	/*
2651ea8dc4b6Seschrock 	 * Let the world know that we're done.
2652ea8dc4b6Seschrock 	 */
2653ea8dc4b6Seschrock 	mutex_enter(&spa->spa_async_lock);
2654ea8dc4b6Seschrock 	spa->spa_async_thread = NULL;
2655ea8dc4b6Seschrock 	cv_broadcast(&spa->spa_async_cv);
2656ea8dc4b6Seschrock 	mutex_exit(&spa->spa_async_lock);
2657ea8dc4b6Seschrock 	thread_exit();
2658ea8dc4b6Seschrock }
2659ea8dc4b6Seschrock 
2660ea8dc4b6Seschrock void
2661ea8dc4b6Seschrock spa_async_suspend(spa_t *spa)
2662ea8dc4b6Seschrock {
2663ea8dc4b6Seschrock 	mutex_enter(&spa->spa_async_lock);
2664ea8dc4b6Seschrock 	spa->spa_async_suspended++;
2665ea8dc4b6Seschrock 	while (spa->spa_async_thread != NULL)
2666ea8dc4b6Seschrock 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
2667ea8dc4b6Seschrock 	mutex_exit(&spa->spa_async_lock);
2668ea8dc4b6Seschrock }
2669ea8dc4b6Seschrock 
2670ea8dc4b6Seschrock void
2671ea8dc4b6Seschrock spa_async_resume(spa_t *spa)
2672ea8dc4b6Seschrock {
2673ea8dc4b6Seschrock 	mutex_enter(&spa->spa_async_lock);
2674ea8dc4b6Seschrock 	ASSERT(spa->spa_async_suspended != 0);
2675ea8dc4b6Seschrock 	spa->spa_async_suspended--;
2676ea8dc4b6Seschrock 	mutex_exit(&spa->spa_async_lock);
2677ea8dc4b6Seschrock }
2678ea8dc4b6Seschrock 
2679ea8dc4b6Seschrock static void
2680ea8dc4b6Seschrock spa_async_dispatch(spa_t *spa)
2681ea8dc4b6Seschrock {
2682ea8dc4b6Seschrock 	mutex_enter(&spa->spa_async_lock);
2683ea8dc4b6Seschrock 	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
26840373e76bSbonwick 	    spa->spa_async_thread == NULL &&
26850373e76bSbonwick 	    rootdir != NULL && !vn_is_readonly(rootdir))
2686ea8dc4b6Seschrock 		spa->spa_async_thread = thread_create(NULL, 0,
2687ea8dc4b6Seschrock 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
2688ea8dc4b6Seschrock 	mutex_exit(&spa->spa_async_lock);
2689ea8dc4b6Seschrock }
2690ea8dc4b6Seschrock 
2691ea8dc4b6Seschrock void
2692ea8dc4b6Seschrock spa_async_request(spa_t *spa, int task)
2693ea8dc4b6Seschrock {
2694ea8dc4b6Seschrock 	mutex_enter(&spa->spa_async_lock);
2695ea8dc4b6Seschrock 	spa->spa_async_tasks |= task;
2696ea8dc4b6Seschrock 	mutex_exit(&spa->spa_async_lock);
2697fa9e4066Sahrens }
2698fa9e4066Sahrens 
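/*
 * Illustrative sketch (hypothetical, not from this file): async work is
 * posted as bits in spa_async_tasks and only acted on later, when
 * spa_async_dispatch() runs at the end of spa_sync().  Requests of the
 * same kind coalesce into a single bit, so posting is idempotent and
 * cheap to do from hot paths.
 */
static void
example_post_async_work(spa_t *spa)
{
	/* merges with any already-pending request of the same kind */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE | SPA_ASYNC_SCRUB);
	/* the work itself runs later, in spa_async_thread() */
}
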
2699fa9e4066Sahrens /*
2700fa9e4066Sahrens  * ==========================================================================
2701fa9e4066Sahrens  * SPA syncing routines
2702fa9e4066Sahrens  * ==========================================================================
2703fa9e4066Sahrens  */
2704fa9e4066Sahrens 
2705fa9e4066Sahrens static void
2706fa9e4066Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
2707fa9e4066Sahrens {
2708fa9e4066Sahrens 	bplist_t *bpl = &spa->spa_sync_bplist;
2709fa9e4066Sahrens 	dmu_tx_t *tx;
2710fa9e4066Sahrens 	blkptr_t blk;
2711fa9e4066Sahrens 	uint64_t itor = 0;
2712fa9e4066Sahrens 	zio_t *zio;
2713fa9e4066Sahrens 	int error;
2714fa9e4066Sahrens 	uint8_t c = 1;
2715fa9e4066Sahrens 
2716fa9e4066Sahrens 	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
2717fa9e4066Sahrens 
2718fa9e4066Sahrens 	while (bplist_iterate(bpl, &itor, &blk) == 0)
2719fa9e4066Sahrens 		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
2720fa9e4066Sahrens 
2721fa9e4066Sahrens 	error = zio_wait(zio);
2722fa9e4066Sahrens 	ASSERT3U(error, ==, 0);
2723fa9e4066Sahrens 
2724fa9e4066Sahrens 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2725fa9e4066Sahrens 	bplist_vacate(bpl, tx);
2726fa9e4066Sahrens 
2727fa9e4066Sahrens 	/*
2728fa9e4066Sahrens 	 * Pre-dirty the first block so we sync to convergence faster.
2729fa9e4066Sahrens 	 * (Usually only the first block is needed.)
2730fa9e4066Sahrens 	 */
2731fa9e4066Sahrens 	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
2732fa9e4066Sahrens 	dmu_tx_commit(tx);
2733fa9e4066Sahrens }
2734fa9e4066Sahrens 
2735fa9e4066Sahrens static void
273699653d4eSeschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
2737fa9e4066Sahrens {
2738fa9e4066Sahrens 	char *packed = NULL;
2739fa9e4066Sahrens 	size_t nvsize = 0;
2740fa9e4066Sahrens 	dmu_buf_t *db;
2741fa9e4066Sahrens 
274299653d4eSeschrock 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
2743fa9e4066Sahrens 
2744fa9e4066Sahrens 	packed = kmem_alloc(nvsize, KM_SLEEP);
2745fa9e4066Sahrens 
274699653d4eSeschrock 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
2747ea8dc4b6Seschrock 	    KM_SLEEP) == 0);
2748fa9e4066Sahrens 
274999653d4eSeschrock 	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
2750fa9e4066Sahrens 
2751fa9e4066Sahrens 	kmem_free(packed, nvsize);
2752fa9e4066Sahrens 
275399653d4eSeschrock 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
2754fa9e4066Sahrens 	dmu_buf_will_dirty(db, tx);
2755fa9e4066Sahrens 	*(uint64_t *)db->db_data = nvsize;
2756ea8dc4b6Seschrock 	dmu_buf_rele(db, FTAG);
2757fa9e4066Sahrens }
2758fa9e4066Sahrens 
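/*
 * Illustrative read-side sketch mirroring spa_sync_nvlist() above (an
 * assumption, not code from this file): the object's bonus buffer holds
 * the packed size, and the object data holds the XDR-encoded nvlist
 * itself.  The function name is hypothetical.
 */
static int
example_load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **nvp)
{
	dmu_buf_t *db;
	char *packed;
	size_t nvsize;
	int error;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, nvp, 0);
	kmem_free(packed, nvsize);

	return (error);
}
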
275999653d4eSeschrock static void
276099653d4eSeschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
276199653d4eSeschrock {
276299653d4eSeschrock 	nvlist_t *nvroot;
276399653d4eSeschrock 	nvlist_t **spares;
276499653d4eSeschrock 	int i;
276599653d4eSeschrock 
276699653d4eSeschrock 	if (!spa->spa_sync_spares)
276799653d4eSeschrock 		return;
276899653d4eSeschrock 
276999653d4eSeschrock 	/*
277099653d4eSeschrock 	 * Update the MOS nvlist describing the list of available spares.
277199653d4eSeschrock 	 * spa_validate_spares() will have already made sure this nvlist is
277299653d4eSeschrock 	 * valid and the vdevs are labelled appropriately.
277399653d4eSeschrock 	 */
277499653d4eSeschrock 	if (spa->spa_spares_object == 0) {
277599653d4eSeschrock 		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
277699653d4eSeschrock 		    DMU_OT_PACKED_NVLIST, 1 << 14,
277799653d4eSeschrock 		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
277899653d4eSeschrock 		VERIFY(zap_update(spa->spa_meta_objset,
277999653d4eSeschrock 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
278099653d4eSeschrock 		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
278199653d4eSeschrock 	}
278299653d4eSeschrock 
278399653d4eSeschrock 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
278499653d4eSeschrock 	if (spa->spa_nspares == 0) {
278599653d4eSeschrock 		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
278699653d4eSeschrock 		    NULL, 0) == 0);
278799653d4eSeschrock 	} else {
278899653d4eSeschrock 		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
278999653d4eSeschrock 		    KM_SLEEP);
279099653d4eSeschrock 		for (i = 0; i < spa->spa_nspares; i++)
279199653d4eSeschrock 			spares[i] = vdev_config_generate(spa,
279299653d4eSeschrock 			    spa->spa_spares[i], B_FALSE, B_TRUE);
279399653d4eSeschrock 		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
279499653d4eSeschrock 		    spares, spa->spa_nspares) == 0);
279599653d4eSeschrock 		for (i = 0; i < spa->spa_nspares; i++)
279699653d4eSeschrock 			nvlist_free(spares[i]);
279799653d4eSeschrock 		kmem_free(spares, spa->spa_nspares * sizeof (void *));
279899653d4eSeschrock 	}
279999653d4eSeschrock 
280099653d4eSeschrock 	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
280106eeb2adSek 	nvlist_free(nvroot);
280299653d4eSeschrock 
280399653d4eSeschrock 	spa->spa_sync_spares = B_FALSE;
280499653d4eSeschrock }
280599653d4eSeschrock 
280699653d4eSeschrock static void
280799653d4eSeschrock spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
280899653d4eSeschrock {
280999653d4eSeschrock 	nvlist_t *config;
281099653d4eSeschrock 
281199653d4eSeschrock 	if (list_is_empty(&spa->spa_dirty_list))
281299653d4eSeschrock 		return;
281399653d4eSeschrock 
281499653d4eSeschrock 	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
281599653d4eSeschrock 
281699653d4eSeschrock 	if (spa->spa_config_syncing)
281799653d4eSeschrock 		nvlist_free(spa->spa_config_syncing);
281899653d4eSeschrock 	spa->spa_config_syncing = config;
281999653d4eSeschrock 
282099653d4eSeschrock 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
282199653d4eSeschrock }
282299653d4eSeschrock 
2823fa9e4066Sahrens /*
2824fa9e4066Sahrens  * Sync the specified transaction group.  New blocks may be dirtied as
2825fa9e4066Sahrens  * part of the process, so we iterate until it converges.
2826fa9e4066Sahrens  */
2827fa9e4066Sahrens void
2828fa9e4066Sahrens spa_sync(spa_t *spa, uint64_t txg)
2829fa9e4066Sahrens {
2830fa9e4066Sahrens 	dsl_pool_t *dp = spa->spa_dsl_pool;
2831fa9e4066Sahrens 	objset_t *mos = spa->spa_meta_objset;
2832fa9e4066Sahrens 	bplist_t *bpl = &spa->spa_sync_bplist;
28330373e76bSbonwick 	vdev_t *rvd = spa->spa_root_vdev;
2834fa9e4066Sahrens 	vdev_t *vd;
2835fa9e4066Sahrens 	dmu_tx_t *tx;
2836fa9e4066Sahrens 	int dirty_vdevs;
2837fa9e4066Sahrens 
2838fa9e4066Sahrens 	/*
2839fa9e4066Sahrens 	 * Lock out configuration changes.
2840fa9e4066Sahrens 	 */
2841ea8dc4b6Seschrock 	spa_config_enter(spa, RW_READER, FTAG);
2842fa9e4066Sahrens 
2843fa9e4066Sahrens 	spa->spa_syncing_txg = txg;
2844fa9e4066Sahrens 	spa->spa_sync_pass = 0;
2845fa9e4066Sahrens 
2846ea8dc4b6Seschrock 	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
2847fa9e4066Sahrens 
284899653d4eSeschrock 	tx = dmu_tx_create_assigned(dp, txg);
284999653d4eSeschrock 
285099653d4eSeschrock 	/*
285199653d4eSeschrock 	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
285299653d4eSeschrock 	 * set spa_deflate if we have no raid-z vdevs.
285399653d4eSeschrock 	 */
285499653d4eSeschrock 	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
285599653d4eSeschrock 	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
285699653d4eSeschrock 		int i;
285799653d4eSeschrock 
285899653d4eSeschrock 		for (i = 0; i < rvd->vdev_children; i++) {
285999653d4eSeschrock 			vd = rvd->vdev_child[i];
286099653d4eSeschrock 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
286199653d4eSeschrock 				break;
286299653d4eSeschrock 		}
286399653d4eSeschrock 		if (i == rvd->vdev_children) {
286499653d4eSeschrock 			spa->spa_deflate = TRUE;
286599653d4eSeschrock 			VERIFY(0 == zap_add(spa->spa_meta_objset,
286699653d4eSeschrock 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
286799653d4eSeschrock 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
286899653d4eSeschrock 		}
286999653d4eSeschrock 	}
287099653d4eSeschrock 
2871fa9e4066Sahrens 	/*
2872fa9e4066Sahrens 	 * If anything has changed in this txg, push the deferred frees
2873fa9e4066Sahrens 	 * from the previous txg.  If not, leave them alone so that we
2874fa9e4066Sahrens 	 * don't generate work on an otherwise idle system.
2875fa9e4066Sahrens 	 */
2876fa9e4066Sahrens 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
28771615a317Sek 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
28781615a317Sek 	    !txg_list_empty(&dp->dp_sync_tasks, txg))
2879fa9e4066Sahrens 		spa_sync_deferred_frees(spa, txg);
2880fa9e4066Sahrens 
2881fa9e4066Sahrens 	/*
2882fa9e4066Sahrens 	 * Iterate to convergence.
2883fa9e4066Sahrens 	 */
2884fa9e4066Sahrens 	do {
2885fa9e4066Sahrens 		spa->spa_sync_pass++;
2886fa9e4066Sahrens 
2887fa9e4066Sahrens 		spa_sync_config_object(spa, tx);
288899653d4eSeschrock 		spa_sync_spares(spa, tx);
2889ea8dc4b6Seschrock 		spa_errlog_sync(spa, txg);
2890fa9e4066Sahrens 		dsl_pool_sync(dp, txg);
2891fa9e4066Sahrens 
2892fa9e4066Sahrens 		dirty_vdevs = 0;
2893fa9e4066Sahrens 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
2894fa9e4066Sahrens 			vdev_sync(vd, txg);
2895fa9e4066Sahrens 			dirty_vdevs++;
2896fa9e4066Sahrens 		}
2897fa9e4066Sahrens 
2898fa9e4066Sahrens 		bplist_sync(bpl, tx);
2899fa9e4066Sahrens 	} while (dirty_vdevs);
2900fa9e4066Sahrens 
2901fa9e4066Sahrens 	bplist_close(bpl);
2902fa9e4066Sahrens 
2903fa9e4066Sahrens 	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
2904fa9e4066Sahrens 
2905fa9e4066Sahrens 	/*
2906fa9e4066Sahrens 	 * Rewrite the vdev configuration (which includes the uberblock)
2907fa9e4066Sahrens 	 * to commit the transaction group.
29080373e76bSbonwick 	 *
29090373e76bSbonwick 	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
29100373e76bSbonwick 	 * Otherwise, pick a random top-level vdev that's known to be
29110373e76bSbonwick 	 * visible in the config cache (see spa_vdev_add() for details).
29130373e76bSbonwick 	 * If the write fails, try the next vdev until we've tried them all.
29130373e76bSbonwick 	 */
29140373e76bSbonwick 	if (!list_is_empty(&spa->spa_dirty_list)) {
29150373e76bSbonwick 		VERIFY(vdev_config_sync(rvd, txg) == 0);
29160373e76bSbonwick 	} else {
29170373e76bSbonwick 		int children = rvd->vdev_children;
29180373e76bSbonwick 		int c0 = spa_get_random(children);
29190373e76bSbonwick 		int c;
29200373e76bSbonwick 
29210373e76bSbonwick 		for (c = 0; c < children; c++) {
29220373e76bSbonwick 			vd = rvd->vdev_child[(c0 + c) % children];
29230373e76bSbonwick 			if (vd->vdev_ms_array == 0)
29240373e76bSbonwick 				continue;
29250373e76bSbonwick 			if (vdev_config_sync(vd, txg) == 0)
29260373e76bSbonwick 				break;
29270373e76bSbonwick 		}
29280373e76bSbonwick 		if (c == children)
29290373e76bSbonwick 			VERIFY(vdev_config_sync(rvd, txg) == 0);
29300373e76bSbonwick 	}
29310373e76bSbonwick 
293299653d4eSeschrock 	dmu_tx_commit(tx);
293399653d4eSeschrock 
29340373e76bSbonwick 	/*
29350373e76bSbonwick 	 * Clear the dirty config list.
2936fa9e4066Sahrens 	 */
29370373e76bSbonwick 	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
29380373e76bSbonwick 		vdev_config_clean(vd);
29390373e76bSbonwick 
29400373e76bSbonwick 	/*
29410373e76bSbonwick 	 * Now that the new config has synced transactionally,
29420373e76bSbonwick 	 * let it become visible to the config cache.
29430373e76bSbonwick 	 */
29440373e76bSbonwick 	if (spa->spa_config_syncing != NULL) {
29450373e76bSbonwick 		spa_config_set(spa, spa->spa_config_syncing);
29460373e76bSbonwick 		spa->spa_config_txg = txg;
29470373e76bSbonwick 		spa->spa_config_syncing = NULL;
29480373e76bSbonwick 	}
2949fa9e4066Sahrens 
2950fa9e4066Sahrens 	/*
2951fa9e4066Sahrens 	 * Make a stable copy of the fully synced uberblock.
2952fa9e4066Sahrens 	 * We use this as the root for pool traversals.
2953fa9e4066Sahrens 	 */
2954fa9e4066Sahrens 	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
2955fa9e4066Sahrens 
2956fa9e4066Sahrens 	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
2957fa9e4066Sahrens 
2958fa9e4066Sahrens 	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
2959fa9e4066Sahrens 	spa->spa_traverse_wanted = 0;
2960fa9e4066Sahrens 	spa->spa_ubsync = spa->spa_uberblock;
2961fa9e4066Sahrens 	rw_exit(&spa->spa_traverse_lock);
2962fa9e4066Sahrens 
2963fa9e4066Sahrens 	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
2964fa9e4066Sahrens 
2965fa9e4066Sahrens 	/*
2966fa9e4066Sahrens 	 * Clean up the ZIL records for the synced txg.
2967fa9e4066Sahrens 	 */
2968fa9e4066Sahrens 	dsl_pool_zil_clean(dp);
2969fa9e4066Sahrens 
2970fa9e4066Sahrens 	/*
2971fa9e4066Sahrens 	 * Update usable space statistics.
2972fa9e4066Sahrens 	 */
2973fa9e4066Sahrens 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
2974fa9e4066Sahrens 		vdev_sync_done(vd, txg);
2975fa9e4066Sahrens 
2976fa9e4066Sahrens 	/*
2977fa9e4066Sahrens 	 * It had better be the case that we didn't dirty anything
297899653d4eSeschrock 	 * since vdev_config_sync().
2979fa9e4066Sahrens 	 */
2980fa9e4066Sahrens 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
2981fa9e4066Sahrens 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
2982fa9e4066Sahrens 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
2983fa9e4066Sahrens 	ASSERT(bpl->bpl_queue == NULL);
2984fa9e4066Sahrens 
2985ea8dc4b6Seschrock 	spa_config_exit(spa, FTAG);
2986ea8dc4b6Seschrock 
2987ea8dc4b6Seschrock 	/*
2988ea8dc4b6Seschrock 	 * If any async tasks have been requested, kick them off.
2989ea8dc4b6Seschrock 	 */
2990ea8dc4b6Seschrock 	spa_async_dispatch(spa);
2991fa9e4066Sahrens }
2992fa9e4066Sahrens 
2993fa9e4066Sahrens /*
2994fa9e4066Sahrens  * Sync all pools.  We don't want to hold the namespace lock across these
2995fa9e4066Sahrens  * operations, so we take a reference on the spa_t and drop the lock during the
2996fa9e4066Sahrens  * sync.
2997fa9e4066Sahrens  */
2998fa9e4066Sahrens void
2999fa9e4066Sahrens spa_sync_allpools(void)
3000fa9e4066Sahrens {
3001fa9e4066Sahrens 	spa_t *spa = NULL;
3002fa9e4066Sahrens 	mutex_enter(&spa_namespace_lock);
3003fa9e4066Sahrens 	while ((spa = spa_next(spa)) != NULL) {
3004fa9e4066Sahrens 		if (spa_state(spa) != POOL_STATE_ACTIVE)
3005fa9e4066Sahrens 			continue;
3006fa9e4066Sahrens 		spa_open_ref(spa, FTAG);
3007fa9e4066Sahrens 		mutex_exit(&spa_namespace_lock);
3008fa9e4066Sahrens 		txg_wait_synced(spa_get_dsl(spa), 0);
3009fa9e4066Sahrens 		mutex_enter(&spa_namespace_lock);
3010fa9e4066Sahrens 		spa_close(spa, FTAG);
3011fa9e4066Sahrens 	}
3012fa9e4066Sahrens 	mutex_exit(&spa_namespace_lock);
3013fa9e4066Sahrens }
3014fa9e4066Sahrens 
3015fa9e4066Sahrens /*
3016fa9e4066Sahrens  * ==========================================================================
3017fa9e4066Sahrens  * Miscellaneous routines
3018fa9e4066Sahrens  * ==========================================================================
3019fa9e4066Sahrens  */
3020fa9e4066Sahrens 
3021fa9e4066Sahrens /*
3022fa9e4066Sahrens  * Remove all pools in the system.
3023fa9e4066Sahrens  */
3024fa9e4066Sahrens void
3025fa9e4066Sahrens spa_evict_all(void)
3026fa9e4066Sahrens {
3027fa9e4066Sahrens 	spa_t *spa;
3028fa9e4066Sahrens 
3029fa9e4066Sahrens 	/*
3030fa9e4066Sahrens 	 * Remove all cached state.  All pools should be closed now,
3031fa9e4066Sahrens 	 * so every spa in the AVL tree should be unreferenced.
3032fa9e4066Sahrens 	 */
3033fa9e4066Sahrens 	mutex_enter(&spa_namespace_lock);
3034fa9e4066Sahrens 	while ((spa = spa_next(NULL)) != NULL) {
3035fa9e4066Sahrens 		/*
3036ea8dc4b6Seschrock 		 * Stop async tasks.  The async thread may need to detach
3037ea8dc4b6Seschrock 		 * a device that's been replaced, which requires grabbing
3038ea8dc4b6Seschrock 		 * spa_namespace_lock, so we must drop it here.
3039fa9e4066Sahrens 		 */
3040fa9e4066Sahrens 		spa_open_ref(spa, FTAG);
3041fa9e4066Sahrens 		mutex_exit(&spa_namespace_lock);
3042ea8dc4b6Seschrock 		spa_async_suspend(spa);
3043fa9e4066Sahrens 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
3044fa9e4066Sahrens 		mutex_enter(&spa_namespace_lock);
3045fa9e4066Sahrens 		spa_close(spa, FTAG);
3046fa9e4066Sahrens 
3047fa9e4066Sahrens 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
3048fa9e4066Sahrens 			spa_unload(spa);
3049fa9e4066Sahrens 			spa_deactivate(spa);
3050fa9e4066Sahrens 		}
3051fa9e4066Sahrens 		spa_remove(spa);
3052fa9e4066Sahrens 	}
3053fa9e4066Sahrens 	mutex_exit(&spa_namespace_lock);
3054fa9e4066Sahrens }
3055ea8dc4b6Seschrock 
3056ea8dc4b6Seschrock vdev_t *
3057ea8dc4b6Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid)
3058ea8dc4b6Seschrock {
3059ea8dc4b6Seschrock 	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
3060ea8dc4b6Seschrock }
3061eaca9bbdSeschrock 
3062eaca9bbdSeschrock void
3063eaca9bbdSeschrock spa_upgrade(spa_t *spa)
3064eaca9bbdSeschrock {
3065eaca9bbdSeschrock 	spa_config_enter(spa, RW_WRITER, FTAG);
3066eaca9bbdSeschrock 
3067eaca9bbdSeschrock 	/*
3068eaca9bbdSeschrock 	 * This should only be called for a non-faulted pool, and since a
3069eaca9bbdSeschrock 	 * pool with a future on-disk version would be unopenable, the
3070eaca9bbdSeschrock 	 * assertion below should never fire.
3071eaca9bbdSeschrock 	 */
3072eaca9bbdSeschrock 	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);
3073eaca9bbdSeschrock 
3074eaca9bbdSeschrock 	spa->spa_uberblock.ub_version = ZFS_VERSION;
3075eaca9bbdSeschrock 	vdev_config_dirty(spa->spa_root_vdev);
3076eaca9bbdSeschrock 
3077eaca9bbdSeschrock 	spa_config_exit(spa, FTAG);
307899653d4eSeschrock 
307999653d4eSeschrock 	txg_wait_synced(spa_get_dsl(spa), 0);
308099653d4eSeschrock }
308199653d4eSeschrock 
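/*
 * Illustrative sketch (hypothetical caller, not from this file): upgrade
 * only when the on-disk version is behind what this software supports.
 */
static void
example_maybe_upgrade(spa_t *spa)
{
	if (spa->spa_uberblock.ub_version < ZFS_VERSION)
		spa_upgrade(spa);
	/* spa_upgrade() itself waits for the new config to sync out */
}
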
308299653d4eSeschrock boolean_t
308399653d4eSeschrock spa_has_spare(spa_t *spa, uint64_t guid)
308499653d4eSeschrock {
308599653d4eSeschrock 	int i;
3086*39c23413Seschrock 	uint64_t spareguid;
308799653d4eSeschrock 
308899653d4eSeschrock 	for (i = 0; i < spa->spa_nspares; i++)
308999653d4eSeschrock 		if (spa->spa_spares[i]->vdev_guid == guid)
309099653d4eSeschrock 			return (B_TRUE);
309199653d4eSeschrock 
3092*39c23413Seschrock 	for (i = 0; i < spa->spa_pending_nspares; i++) {
3093*39c23413Seschrock 		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
3094*39c23413Seschrock 		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
3095*39c23413Seschrock 		    spareguid == guid)
3096*39c23413Seschrock 			return (B_TRUE);
3097*39c23413Seschrock 	}
3098*39c23413Seschrock 
309999653d4eSeschrock 	return (B_FALSE);
3100eaca9bbdSeschrock }
3101
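/*
 * Illustrative sketch (hypothetical caller, not from this file): a path
 * that adds a device can use spa_has_spare() to reject a vdev that is
 * already an active or pending spare in this pool.
 */
static int
example_check_spare_conflict(spa_t *spa, uint64_t guid)
{
	if (spa_has_spare(spa, guid))
		return (EBUSY);
	return (0);
}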