spa.c revision 0bf246f5efaa80a4f69d1dd27714e59408bbe41c
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * This file contains all the routines used when modifying on-disk SPA state.
31 * This includes opening, importing, destroying, exporting a pool, and syncing a
32 * pool.
33 */
34
35#include <sys/zfs_context.h>
36#include <sys/fm/fs/zfs.h>
37#include <sys/spa_impl.h>
38#include <sys/zio.h>
39#include <sys/zio_checksum.h>
40#include <sys/zio_compress.h>
41#include <sys/dmu.h>
42#include <sys/dmu_tx.h>
43#include <sys/zap.h>
44#include <sys/zil.h>
45#include <sys/vdev_impl.h>
46#include <sys/metaslab.h>
47#include <sys/uberblock_impl.h>
48#include <sys/txg.h>
49#include <sys/avl.h>
50#include <sys/dmu_traverse.h>
51#include <sys/dmu_objset.h>
52#include <sys/unique.h>
53#include <sys/dsl_pool.h>
54#include <sys/dsl_dataset.h>
55#include <sys/dsl_dir.h>
56#include <sys/dsl_prop.h>
57#include <sys/dsl_synctask.h>
58#include <sys/fs/zfs.h>
59#include <sys/callb.h>
60#include <sys/systeminfo.h>
61#include <sys/sunddi.h>
62
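/*
 * Number of threads per zio taskq.  spa_activate() creates one issue taskq
 * and one intr taskq of this size for each zio type.
 */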
63int zio_taskq_threads = 8;
64
65/*
66 * ==========================================================================
67 * SPA state manipulation (open/create/destroy/import/export)
68 * ==========================================================================
69 */
70
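/*
 * Comparison function for the error-entry AVL trees (spa_errlist_last and
 * spa_errlist_scrub): entries are ordered by a raw bcmp() of their
 * zbookmarks.
 */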
71static int
72spa_error_entry_compare(const void *a, const void *b)
73{
74	spa_error_entry_t *sa = (spa_error_entry_t *)a;
75	spa_error_entry_t *sb = (spa_error_entry_t *)b;
76	int ret;
77
78	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
79	    sizeof (zbookmark_t));
80
81	if (ret < 0)
82		return (-1);
83	else if (ret > 0)
84		return (1);
85	else
86		return (0);
87}
88
89/*
90 * Utility function which retrieves copies of the current logs and
91 * re-initializes them in the process.
92 */
93void
94spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
95{
96	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
97
98	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
99	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
100
101	avl_create(&spa->spa_errlist_scrub,
102	    spa_error_entry_compare, sizeof (spa_error_entry_t),
103	    offsetof(spa_error_entry_t, se_avl));
104	avl_create(&spa->spa_errlist_last,
105	    spa_error_entry_compare, sizeof (spa_error_entry_t),
106	    offsetof(spa_error_entry_t, se_avl));
107}
108
109/*
110 * Activate an uninitialized pool.
111 */
112static void
113spa_activate(spa_t *spa)
114{
115	int t;
116
117	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
118
119	spa->spa_state = POOL_STATE_ACTIVE;
120
121	spa->spa_normal_class = metaslab_class_create();
122
123	for (t = 0; t < ZIO_TYPES; t++) {
124		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
125		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
126		    TASKQ_PREPOPULATE);
127		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
128		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
129		    TASKQ_PREPOPULATE);
130	}
131
132	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
133
134	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
135	mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
136	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
137	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
138	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
139	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
140	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
141	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
142	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
143
144	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
145	    offsetof(vdev_t, vdev_dirty_node));
146
147	txg_list_create(&spa->spa_vdev_txg_list,
148	    offsetof(struct vdev, vdev_txg_node));
149
150	avl_create(&spa->spa_errlist_scrub,
151	    spa_error_entry_compare, sizeof (spa_error_entry_t),
152	    offsetof(spa_error_entry_t, se_avl));
153	avl_create(&spa->spa_errlist_last,
154	    spa_error_entry_compare, sizeof (spa_error_entry_t),
155	    offsetof(spa_error_entry_t, se_avl));
156}
157
158/*
159 * Opposite of spa_activate().
160 */
161static void
162spa_deactivate(spa_t *spa)
163{
164	int t;
165
166	ASSERT(spa->spa_sync_on == B_FALSE);
167	ASSERT(spa->spa_dsl_pool == NULL);
168	ASSERT(spa->spa_root_vdev == NULL);
169
170	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
171
172	txg_list_destroy(&spa->spa_vdev_txg_list);
173
174	list_destroy(&spa->spa_dirty_list);
175
176	rw_destroy(&spa->spa_traverse_lock);
177
178	for (t = 0; t < ZIO_TYPES; t++) {
179		taskq_destroy(spa->spa_zio_issue_taskq[t]);
180		taskq_destroy(spa->spa_zio_intr_taskq[t]);
181		spa->spa_zio_issue_taskq[t] = NULL;
182		spa->spa_zio_intr_taskq[t] = NULL;
183	}
184
185	metaslab_class_destroy(spa->spa_normal_class);
186	spa->spa_normal_class = NULL;
187
188	/*
189	 * If this was part of an import or the open otherwise failed, we may
190	 * still have errors left in the queues.  Empty them just in case.
191	 */
192	spa_errlog_drain(spa);
193
194	avl_destroy(&spa->spa_errlist_scrub);
195	avl_destroy(&spa->spa_errlist_last);
196
197	spa->spa_state = POOL_STATE_UNINITIALIZED;
198}
199
200/*
201 * Verify a pool configuration, and construct the vdev tree appropriately.  This
202 * will create all the necessary vdevs in the appropriate layout, with each vdev
203 * in the CLOSED state.  This will prep the pool before open/creation/import.
204 * All vdev validation is done by the vdev_alloc() routine.
205 */
206static int
207spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
208    uint_t id, int atype)
209{
210	nvlist_t **child;
211	uint_t c, children;
212	int error;
213
214	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
215		return (error);
216
217	if ((*vdp)->vdev_ops->vdev_op_leaf)
218		return (0);
219
220	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
221	    &child, &children) != 0) {
222		vdev_free(*vdp);
223		*vdp = NULL;
224		return (EINVAL);
225	}
226
227	for (c = 0; c < children; c++) {
228		vdev_t *vd;
229		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
230		    atype)) != 0) {
231			vdev_free(*vdp);
232			*vdp = NULL;
233			return (error);
234		}
235	}
236
237	ASSERT(*vdp != NULL);
238
239	return (0);
240}
241
242/*
243 * Opposite of spa_load().
244 */
245static void
246spa_unload(spa_t *spa)
247{
248	int i;
249
250	/*
251	 * Stop async tasks.
252	 */
253	spa_async_suspend(spa);
254
255	/*
256	 * Stop syncing.
257	 */
258	if (spa->spa_sync_on) {
259		txg_sync_stop(spa->spa_dsl_pool);
260		spa->spa_sync_on = B_FALSE;
261	}
262
263	/*
264	 * Wait for any outstanding prefetch I/O to complete.
265	 */
266	spa_config_enter(spa, RW_WRITER, FTAG);
267	spa_config_exit(spa, FTAG);
268
269	/*
270	 * Close the dsl pool.
271	 */
272	if (spa->spa_dsl_pool) {
273		dsl_pool_close(spa->spa_dsl_pool);
274		spa->spa_dsl_pool = NULL;
275	}
276
277	/*
278	 * Close all vdevs.
279	 */
280	if (spa->spa_root_vdev)
281		vdev_free(spa->spa_root_vdev);
282	ASSERT(spa->spa_root_vdev == NULL);
283
284	for (i = 0; i < spa->spa_nspares; i++)
285		vdev_free(spa->spa_spares[i]);
286	if (spa->spa_spares) {
287		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
288		spa->spa_spares = NULL;
289	}
290	if (spa->spa_sparelist) {
291		nvlist_free(spa->spa_sparelist);
292		spa->spa_sparelist = NULL;
293	}
294
295	spa->spa_async_suspended = 0;
296}
297
298/*
299 * Load (or re-load) the current list of vdevs describing the active spares for
300 * this pool.  When this is called, we have some form of basic information in
301 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
302 * re-generate a more complete list including status information.
303 */
304static void
305spa_load_spares(spa_t *spa)
306{
307	nvlist_t **spares;
308	uint_t nspares;
309	int i;
310	vdev_t *vd, *tvd;
311
312	/*
313	 * First, close and free any existing spare vdevs.
314	 */
315	for (i = 0; i < spa->spa_nspares; i++) {
316		vd = spa->spa_spares[i];
317
318		/* Undo the call to spa_spare_add() below */
319		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
320		    tvd->vdev_isspare)
321			spa_spare_remove(tvd);
322		vdev_close(vd);
323		vdev_free(vd);
324	}
325
326	if (spa->spa_spares)
327		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
328
329	if (spa->spa_sparelist == NULL)
330		nspares = 0;
331	else
332		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
333		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
334
335	spa->spa_nspares = (int)nspares;
336	spa->spa_spares = NULL;
337
338	if (nspares == 0)
339		return;
340
341	/*
342	 * Construct the array of vdevs, opening them to get status in the
343	 * process.  For each spare, there are potentially two different vdev_t
344	 * structures associated with it: one in the list of spares (used only
345	 * for basic validation purposes) and one in the active vdev
346	 * configuration (if it's spared in).  During this phase we open and
347	 * validate each vdev on the spare list.  If the vdev also exists in the
348	 * active configuration, then we also mark this vdev as an active spare.
349	 */
350	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
351	for (i = 0; i < spa->spa_nspares; i++) {
352		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
353		    VDEV_ALLOC_SPARE) == 0);
354		ASSERT(vd != NULL);
355
356		spa->spa_spares[i] = vd;
357
358		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
359			if (!tvd->vdev_isspare)
360				spa_spare_add(tvd);
361
362			/*
363			 * We only mark the spare active if we were successfully
364			 * able to load the vdev.  Otherwise, importing a pool
365			 * with a bad active spare would result in strange
366			 * behavior, because multiple pools would think the spare
367			 * is actively in use.
368			 *
369			 * There is a vulnerability here to an equally bizarre
370			 * circumstance, where a dead active spare is later
371			 * brought back to life (onlined or otherwise).  Given
372			 * the rarity of this scenario, and the extra complexity
373			 * it adds, we ignore the possibility.
374			 */
375			if (!vdev_is_dead(tvd))
376				spa_spare_activate(tvd);
377		}
378
379		if (vdev_open(vd) != 0)
380			continue;
381
382		vd->vdev_top = vd;
383		(void) vdev_validate_spare(vd);
384	}
385
386	/*
387	 * Recompute the stashed list of spares, with status information
388	 * this time.
389	 */
390	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
391	    DATA_TYPE_NVLIST_ARRAY) == 0);
392
393	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
394	for (i = 0; i < spa->spa_nspares; i++)
395		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
396		    B_TRUE, B_TRUE);
397	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
398	    spares, spa->spa_nspares) == 0);
399	for (i = 0; i < spa->spa_nspares; i++)
400		nvlist_free(spares[i]);
401	kmem_free(spares, spa->spa_nspares * sizeof (void *));
402}
403
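/*
 * Read the packed nvlist stored in MOS object 'obj'.  The packed size lives
 * in the object's bonus buffer; the data is read with dmu_read() and
 * unpacked into '*value'.
 */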
404static int
405load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
406{
407	dmu_buf_t *db;
408	char *packed = NULL;
409	size_t nvsize = 0;
410	int error;
411	*value = NULL;
412
413	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
414	nvsize = *(uint64_t *)db->db_data;
415	dmu_buf_rele(db, FTAG);
416
417	packed = kmem_alloc(nvsize, KM_SLEEP);
418	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
419	if (error == 0)
420		error = nvlist_unpack(packed, nvsize, value, 0);
421	kmem_free(packed, nvsize);
422
423	return (error);
424}
425
426/*
427 * Load an existing storage pool, using the pool's builtin spa_config as a
428 * source of configuration information.
429 */
430static int
431spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
432{
433	int error = 0;
434	nvlist_t *nvroot = NULL;
435	vdev_t *rvd;
436	uberblock_t *ub = &spa->spa_uberblock;
437	uint64_t config_cache_txg = spa->spa_config_txg;
438	uint64_t pool_guid;
439	uint64_t version;
440	zio_t *zio;
441
442	spa->spa_load_state = state;
443
444	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
445	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
446		error = EINVAL;
447		goto out;
448	}
449
450	/*
451	 * Versioning wasn't explicitly added to the label until later, so if
452	 * it's not present treat it as the initial version.
453	 */
454	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
455		version = ZFS_VERSION_INITIAL;
456
457	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
458	    &spa->spa_config_txg);
459
460	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
461	    spa_guid_exists(pool_guid, 0)) {
462		error = EEXIST;
463		goto out;
464	}
465
466	spa->spa_load_guid = pool_guid;
467
468	/*
469	 * Parse the configuration into a vdev tree.  We explicitly set the
470	 * value that will be returned by spa_version() since parsing the
471	 * configuration requires knowing the version number.
472	 */
473	spa_config_enter(spa, RW_WRITER, FTAG);
474	spa->spa_ubsync.ub_version = version;
475	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
476	spa_config_exit(spa, FTAG);
477
478	if (error != 0)
479		goto out;
480
481	ASSERT(spa->spa_root_vdev == rvd);
482	ASSERT(spa_guid(spa) == pool_guid);
483
484	/*
485	 * Try to open all vdevs, loading each label in the process.
486	 */
487	error = vdev_open(rvd);
488	if (error != 0)
489		goto out;
490
491	/*
492	 * Validate the labels for all leaf vdevs.  We need to grab the config
493	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
494	 * flag.
495	 */
496	spa_config_enter(spa, RW_READER, FTAG);
497	error = vdev_validate(rvd);
498	spa_config_exit(spa, FTAG);
499
500	if (error != 0)
501		goto out;
502
503	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
504		error = ENXIO;
505		goto out;
506	}
507
508	/*
509	 * Find the best uberblock.
510	 */
511	bzero(ub, sizeof (uberblock_t));
512
513	zio = zio_root(spa, NULL, NULL,
514	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
515	vdev_uberblock_load(zio, rvd, ub);
516	error = zio_wait(zio);
517
518	/*
519	 * If we weren't able to find a single valid uberblock, return failure.
520	 */
521	if (ub->ub_txg == 0) {
522		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
523		    VDEV_AUX_CORRUPT_DATA);
524		error = ENXIO;
525		goto out;
526	}
527
528	/*
529	 * If the pool is newer than the code, we can't open it.
530	 */
531	if (ub->ub_version > ZFS_VERSION) {
532		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
533		    VDEV_AUX_VERSION_NEWER);
534		error = ENOTSUP;
535		goto out;
536	}
537
538	/*
539	 * If the vdev guid sum doesn't match the uberblock, we have an
540	 * incomplete configuration.
541	 */
542	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
543		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
544		    VDEV_AUX_BAD_GUID_SUM);
545		error = ENXIO;
546		goto out;
547	}
548
549	/*
550	 * Initialize internal SPA structures.
551	 */
552	spa->spa_state = POOL_STATE_ACTIVE;
553	spa->spa_ubsync = spa->spa_uberblock;
554	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
555	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
556	if (error) {
557		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
558		    VDEV_AUX_CORRUPT_DATA);
559		goto out;
560	}
561	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
562
563	if (zap_lookup(spa->spa_meta_objset,
564	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
565	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
566		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
567		    VDEV_AUX_CORRUPT_DATA);
568		error = EIO;
569		goto out;
570	}
571
572	if (!mosconfig) {
573		nvlist_t *newconfig;
574		uint64_t hostid;
575
576		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
577			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
578			    VDEV_AUX_CORRUPT_DATA);
579			error = EIO;
580			goto out;
581		}
582
583		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
584		    &hostid) == 0) {
585			char *hostname;
586			unsigned long myhostid = 0;
587
588			VERIFY(nvlist_lookup_string(newconfig,
589			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
590
591			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
592			if ((unsigned long)hostid != myhostid) {
593				cmn_err(CE_WARN, "pool '%s' could not be "
594				    "loaded as it was last accessed by "
595				    "another system (host: %s hostid: 0x%lx).  "
596				    "See: http://www.sun.com/msg/ZFS-8000-EY",
597				    spa->spa_name, hostname,
598				    (unsigned long)hostid);
599				error = EBADF;
600				goto out;
601			}
602		}
603
604		spa_config_set(spa, newconfig);
605		spa_unload(spa);
606		spa_deactivate(spa);
607		spa_activate(spa);
608
609		return (spa_load(spa, newconfig, state, B_TRUE));
610	}
611
612	if (zap_lookup(spa->spa_meta_objset,
613	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
614	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
615		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
616		    VDEV_AUX_CORRUPT_DATA);
617		error = EIO;
618		goto out;
619	}
620
621	/*
622	 * Load the bit that tells us to use the new accounting function
623	 * (raid-z deflation).  If we have an older pool, this will not
624	 * be present.
625	 */
626	error = zap_lookup(spa->spa_meta_objset,
627	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
628	    sizeof (uint64_t), 1, &spa->spa_deflate);
629	if (error != 0 && error != ENOENT) {
630		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
631		    VDEV_AUX_CORRUPT_DATA);
632		error = EIO;
633		goto out;
634	}
635
636	/*
637	 * Load the persistent error log.  If we have an older pool, this will
638	 * not be present.
639	 */
640	error = zap_lookup(spa->spa_meta_objset,
641	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
642	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
643	if (error != 0 && error != ENOENT) {
644		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
645		    VDEV_AUX_CORRUPT_DATA);
646		error = EIO;
647		goto out;
648	}
649
650	error = zap_lookup(spa->spa_meta_objset,
651	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
652	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
653	if (error != 0 && error != ENOENT) {
654		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
655		    VDEV_AUX_CORRUPT_DATA);
656		error = EIO;
657		goto out;
658	}
659
660	/*
661	 * Load the history object.  If we have an older pool, this
662	 * will not be present.
663	 */
664	error = zap_lookup(spa->spa_meta_objset,
665	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
666	    sizeof (uint64_t), 1, &spa->spa_history);
667	if (error != 0 && error != ENOENT) {
668		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
669		    VDEV_AUX_CORRUPT_DATA);
670		error = EIO;
671		goto out;
672	}
673
674	/*
675	 * Load any hot spares for this pool.
676	 */
677	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
678	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
679	if (error != 0 && error != ENOENT) {
680		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
681		    VDEV_AUX_CORRUPT_DATA);
682		error = EIO;
683		goto out;
684	}
685	if (error == 0) {
686		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
687		if (load_nvlist(spa, spa->spa_spares_object,
688		    &spa->spa_sparelist) != 0) {
689			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
690			    VDEV_AUX_CORRUPT_DATA);
691			error = EIO;
692			goto out;
693		}
694
695		spa_config_enter(spa, RW_WRITER, FTAG);
696		spa_load_spares(spa);
697		spa_config_exit(spa, FTAG);
698	}
699
700	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
701	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
702
703	if (error && error != ENOENT) {
704		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
705		    VDEV_AUX_CORRUPT_DATA);
706		error = EIO;
707		goto out;
708	}
709
710	if (error == 0) {
711		(void) zap_lookup(spa->spa_meta_objset,
712		    spa->spa_pool_props_object,
713		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
714		    sizeof (uint64_t), 1, &spa->spa_bootfs);
715	}
716
717	/*
718	 * Load the vdev state for all toplevel vdevs.
719	 */
720	vdev_load(rvd);
721
722	/*
723	 * Propagate the leaf DTLs we just loaded all the way up the tree.
724	 */
725	spa_config_enter(spa, RW_WRITER, FTAG);
726	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
727	spa_config_exit(spa, FTAG);
728
729	/*
730	 * Check the state of the root vdev.  If it can't be opened, it
731	 * indicates one or more toplevel vdevs are faulted.
732	 */
733	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
734		error = ENXIO;
735		goto out;
736	}
737
738	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
739		dmu_tx_t *tx;
740		int need_update = B_FALSE;
741		int c;
742
743		/*
744		 * Claim log blocks that haven't been committed yet.
745		 * This must all happen in a single txg.
746		 */
747		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
748		    spa_first_txg(spa));
749		(void) dmu_objset_find(spa->spa_name,
750		    zil_claim, tx, DS_FIND_CHILDREN);
751		dmu_tx_commit(tx);
752
753		spa->spa_sync_on = B_TRUE;
754		txg_sync_start(spa->spa_dsl_pool);
755
756		/*
757		 * Wait for all claims to sync.
758		 */
759		txg_wait_synced(spa->spa_dsl_pool, 0);
760
761		/*
762		 * If the config cache is stale, or we have uninitialized
763		 * metaslabs (see spa_vdev_add()), then update the config.
764		 */
765		if (config_cache_txg != spa->spa_config_txg ||
766		    state == SPA_LOAD_IMPORT)
767			need_update = B_TRUE;
768
769		for (c = 0; c < rvd->vdev_children; c++)
770			if (rvd->vdev_child[c]->vdev_ms_array == 0)
771				need_update = B_TRUE;
772
773		/*
774		 * Update the config cache asynchronously in case we're the
775		 * root pool, in which case the config cache isn't writable yet.
776		 */
777		if (need_update)
778			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
779	}
780
781	error = 0;
782out:
783	if (error && error != EBADF)
784		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
785	spa->spa_load_state = SPA_LOAD_NONE;
786	spa->spa_ena = 0;
787
788	return (error);
789}
790
791/*
792 * Pool Open/Import
793 *
794 * The import case is identical to an open except that the configuration is sent
795 * down from userland, instead of grabbed from the configuration cache.  For the
796 * case of an open, the pool configuration will exist in the
797 * POOL_STATE_UNINITIALIZED state.
798 *
799 * The stats information (gen/count/ustats) is used to gather vdev statistics at
800 * the same time we open the pool, without having to keep around the spa_t in some
801 * ambiguous state.
802 */
803static int
804spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
805{
806	spa_t *spa;
807	int error;
808	int loaded = B_FALSE;
809	int locked = B_FALSE;
810
811	*spapp = NULL;
812
813	/*
814	 * As disgusting as this is, we need to support recursive calls to this
815	 * function because dsl_dir_open() is called during spa_load(), and ends
816	 * up calling spa_open() again.  The real fix is to figure out how to
817	 * avoid dsl_dir_open() calling this in the first place.
818	 */
819	if (mutex_owner(&spa_namespace_lock) != curthread) {
820		mutex_enter(&spa_namespace_lock);
821		locked = B_TRUE;
822	}
823
824	if ((spa = spa_lookup(pool)) == NULL) {
825		if (locked)
826			mutex_exit(&spa_namespace_lock);
827		return (ENOENT);
828	}
829	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
830
831		spa_activate(spa);
832
833		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
834
835		if (error == EBADF) {
836			/*
837			 * If vdev_validate() returns failure (indicated by
838			 * EBADF), one of the vdevs is reporting that the pool
839			 * has been exported or destroyed.  If
840			 * this is the case, the config cache is out of sync and
841			 * we should remove the pool from the namespace.
842			 */
843			zfs_post_ok(spa, NULL);
844			spa_unload(spa);
845			spa_deactivate(spa);
846			spa_remove(spa);
847			spa_config_sync();
848			if (locked)
849				mutex_exit(&spa_namespace_lock);
850			return (ENOENT);
851		}
852
853		if (error) {
854			/*
855			 * We can't open the pool, but we still have useful
856			 * information: the state of each vdev after the
857			 * attempted vdev_open().  Return this to the user.
858			 */
859			if (config != NULL && spa->spa_root_vdev != NULL) {
860				spa_config_enter(spa, RW_READER, FTAG);
861				*config = spa_config_generate(spa, NULL, -1ULL,
862				    B_TRUE);
863				spa_config_exit(spa, FTAG);
864			}
865			spa_unload(spa);
866			spa_deactivate(spa);
867			spa->spa_last_open_failed = B_TRUE;
868			if (locked)
869				mutex_exit(&spa_namespace_lock);
870			*spapp = NULL;
871			return (error);
872		} else {
873			zfs_post_ok(spa, NULL);
874			spa->spa_last_open_failed = B_FALSE;
875		}
876
877		loaded = B_TRUE;
878	}
879
880	spa_open_ref(spa, tag);
881	if (locked)
882		mutex_exit(&spa_namespace_lock);
883
884	*spapp = spa;
885
886	if (config != NULL) {
887		spa_config_enter(spa, RW_READER, FTAG);
888		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
889		spa_config_exit(spa, FTAG);
890	}
891
892	/*
893	 * If we just loaded the pool, resilver anything that's out of date.
894	 */
895	if (loaded && (spa_mode & FWRITE))
896		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
897
898	return (0);
899}
900
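/*
 * Standard pool open: no config nvlist is returned to the caller.
 */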
901int
902spa_open(const char *name, spa_t **spapp, void *tag)
903{
904	return (spa_open_common(name, spapp, tag, NULL));
905}
906
907/*
908 * Look up the given spa_t, incrementing the inject count in the process to
909 * prevent it from being exported or destroyed.
910 */
911spa_t *
912spa_inject_addref(char *name)
913{
914	spa_t *spa;
915
916	mutex_enter(&spa_namespace_lock);
917	if ((spa = spa_lookup(name)) == NULL) {
918		mutex_exit(&spa_namespace_lock);
919		return (NULL);
920	}
921	spa->spa_inject_ref++;
922	mutex_exit(&spa_namespace_lock);
923
924	return (spa);
925}
926
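/*
 * Release a reference obtained via spa_inject_addref().
 */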
927void
928spa_inject_delref(spa_t *spa)
929{
930	mutex_enter(&spa_namespace_lock);
931	spa->spa_inject_ref--;
932	mutex_exit(&spa_namespace_lock);
933}
934
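/*
 * Add the list of hot spares to the given pool config.  Any spare that is
 * currently spared in to another pool has its reported status overridden to
 * CANT_OPEN with the aux state VDEV_AUX_SPARED.
 */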
935static void
936spa_add_spares(spa_t *spa, nvlist_t *config)
937{
938	nvlist_t **spares;
939	uint_t i, nspares;
940	nvlist_t *nvroot;
941	uint64_t guid;
942	vdev_stat_t *vs;
943	uint_t vsc;
944	uint64_t pool;
945
946	if (spa->spa_nspares == 0)
947		return;
948
949	VERIFY(nvlist_lookup_nvlist(config,
950	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
951	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
952	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
953	if (nspares != 0) {
954		VERIFY(nvlist_add_nvlist_array(nvroot,
955		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
956		VERIFY(nvlist_lookup_nvlist_array(nvroot,
957		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
958
959		/*
960		 * Go through and find any spares which have since been
961		 * repurposed as active spares.  If this is the case, update
962		 * their status appropriately.
963		 */
964		for (i = 0; i < nspares; i++) {
965			VERIFY(nvlist_lookup_uint64(spares[i],
966			    ZPOOL_CONFIG_GUID, &guid) == 0);
967			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
968				VERIFY(nvlist_lookup_uint64_array(
969				    spares[i], ZPOOL_CONFIG_STATS,
970				    (uint64_t **)&vs, &vsc) == 0);
971				vs->vs_state = VDEV_STATE_CANT_OPEN;
972				vs->vs_aux = VDEV_AUX_SPARED;
973			}
974		}
975	}
976}
977
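/*
 * Retrieve the current config for the named pool, adding the error count and
 * the list of hot spares.  The alternate root, if requested, is returned even
 * for pools that fail to open.
 */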
978int
979spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
980{
981	int error;
982	spa_t *spa;
983
984	*config = NULL;
985	error = spa_open_common(name, &spa, FTAG, config);
986
987	if (spa && *config != NULL) {
988		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
989		    spa_get_errlog_size(spa)) == 0);
990
991		spa_add_spares(spa, *config);
992	}
993
994	/*
995	 * We want to get the alternate root even for faulted pools, so we cheat
996	 * and call spa_lookup() directly.
997	 */
998	if (altroot) {
999		if (spa == NULL) {
1000			mutex_enter(&spa_namespace_lock);
1001			spa = spa_lookup(name);
1002			if (spa)
1003				spa_altroot(spa, altroot, buflen);
1004			else
1005				altroot[0] = '\0';
1006			spa = NULL;
1007			mutex_exit(&spa_namespace_lock);
1008		} else {
1009			spa_altroot(spa, altroot, buflen);
1010		}
1011	}
1012
1013	if (spa != NULL)
1014		spa_close(spa, FTAG);
1015
1016	return (error);
1017}
1018
1019/*
1020 * Validate that the 'spares' array is well formed.  We must have an array of
1021 * nvlists, each of which describes a valid leaf vdev.  If this is an import (mode
1022 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
1023 * as they are well-formed.
1024 */
1025static int
1026spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
1027{
1028	nvlist_t **spares;
1029	uint_t i, nspares;
1030	vdev_t *vd;
1031	int error;
1032
1033	/*
1034	 * It's acceptable to have no spares specified.
1035	 */
1036	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1037	    &spares, &nspares) != 0)
1038		return (0);
1039
1040	if (nspares == 0)
1041		return (EINVAL);
1042
1043	/*
1044	 * Make sure the pool is formatted with a version that supports hot
1045	 * spares.
1046	 */
1047	if (spa_version(spa) < ZFS_VERSION_SPARES)
1048		return (ENOTSUP);
1049
1050	/*
1051	 * Set the pending spare list so we correctly handle device in-use
1052	 * checking.
1053	 */
1054	spa->spa_pending_spares = spares;
1055	spa->spa_pending_nspares = nspares;
1056
1057	for (i = 0; i < nspares; i++) {
1058		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
1059		    mode)) != 0)
1060			goto out;
1061
1062		if (!vd->vdev_ops->vdev_op_leaf) {
1063			vdev_free(vd);
1064			error = EINVAL;
1065			goto out;
1066		}
1067
1068		vd->vdev_top = vd;
1069
1070		if ((error = vdev_open(vd)) == 0 &&
1071		    (error = vdev_label_init(vd, crtxg,
1072		    VDEV_LABEL_SPARE)) == 0) {
1073			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
1074			    vd->vdev_guid) == 0);
1075		}
1076
1077		vdev_free(vd);
1078
1079		if (error && mode != VDEV_ALLOC_SPARE)
1080			goto out;
1081		else
1082			error = 0;
1083	}
1084
1085out:
1086	spa->spa_pending_spares = NULL;
1087	spa->spa_pending_nspares = 0;
1088	return (error);
1089}
1090
1091/*
1092 * Pool Creation
1093 */
1094int
1095spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
1096{
1097	spa_t *spa;
1098	vdev_t *rvd;
1099	dsl_pool_t *dp;
1100	dmu_tx_t *tx;
1101	int c, error = 0;
1102	uint64_t txg = TXG_INITIAL;
1103	nvlist_t **spares;
1104	uint_t nspares;
1105
1106	/*
1107	 * If this pool already exists, return failure.
1108	 */
1109	mutex_enter(&spa_namespace_lock);
1110	if (spa_lookup(pool) != NULL) {
1111		mutex_exit(&spa_namespace_lock);
1112		return (EEXIST);
1113	}
1114
1115	/*
1116	 * Allocate a new spa_t structure.
1117	 */
1118	spa = spa_add(pool, altroot);
1119	spa_activate(spa);
1120
1121	spa->spa_uberblock.ub_txg = txg - 1;
1122	spa->spa_uberblock.ub_version = ZFS_VERSION;
1123	spa->spa_ubsync = spa->spa_uberblock;
1124
1125	/*
1126	 * Create the root vdev.
1127	 */
1128	spa_config_enter(spa, RW_WRITER, FTAG);
1129
1130	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
1131
1132	ASSERT(error != 0 || rvd != NULL);
1133	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
1134
1135	if (error == 0 && rvd->vdev_children == 0)
1136		error = EINVAL;
1137
1138	if (error == 0 &&
1139	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
1140	    (error = spa_validate_spares(spa, nvroot, txg,
1141	    VDEV_ALLOC_ADD)) == 0) {
1142		for (c = 0; c < rvd->vdev_children; c++)
1143			vdev_init(rvd->vdev_child[c], txg);
1144		vdev_config_dirty(rvd);
1145	}
1146
1147	spa_config_exit(spa, FTAG);
1148
1149	if (error != 0) {
1150		spa_unload(spa);
1151		spa_deactivate(spa);
1152		spa_remove(spa);
1153		mutex_exit(&spa_namespace_lock);
1154		return (error);
1155	}
1156
1157	/*
1158	 * Get the list of spares, if specified.
1159	 */
1160	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1161	    &spares, &nspares) == 0) {
1162		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
1163		    KM_SLEEP) == 0);
1164		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1165		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1166		spa_config_enter(spa, RW_WRITER, FTAG);
1167		spa_load_spares(spa);
1168		spa_config_exit(spa, FTAG);
1169		spa->spa_sync_spares = B_TRUE;
1170	}
1171
1172	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
1173	spa->spa_meta_objset = dp->dp_meta_objset;
1174
1175	tx = dmu_tx_create_assigned(dp, txg);
1176
1177	/*
1178	 * Create the pool config object.
1179	 */
1180	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
1181	    DMU_OT_PACKED_NVLIST, 1 << 14,
1182	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
1183
1184	if (zap_add(spa->spa_meta_objset,
1185	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1186	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
1187		cmn_err(CE_PANIC, "failed to add pool config");
1188	}
1189
1190	/* Newly created pools are always deflated. */
1191	spa->spa_deflate = TRUE;
1192	if (zap_add(spa->spa_meta_objset,
1193	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1194	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
1195		cmn_err(CE_PANIC, "failed to add deflate");
1196	}
1197
1198	/*
1199	 * Create the deferred-free bplist object.  Turn off compression
1200	 * because sync-to-convergence takes longer if the blocksize
1201	 * keeps changing.
1202	 */
1203	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
1204	    1 << 14, tx);
1205	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
1206	    ZIO_COMPRESS_OFF, tx);
1207
1208	if (zap_add(spa->spa_meta_objset,
1209	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1210	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
1211		cmn_err(CE_PANIC, "failed to add bplist");
1212	}
1213
1214	/*
1215	 * Create the pool's history object.
1216	 */
1217	spa_history_create_obj(spa, tx);
1218
1219	dmu_tx_commit(tx);
1220
1221	spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
1222	spa->spa_sync_on = B_TRUE;
1223	txg_sync_start(spa->spa_dsl_pool);
1224
1225	/*
1226	 * We explicitly wait for the first transaction to complete so that our
1227	 * bean counters are appropriately updated.
1228	 */
1229	txg_wait_synced(spa->spa_dsl_pool, txg);
1230
1231	spa_config_sync();
1232
1233	mutex_exit(&spa_namespace_lock);
1234
1235	return (0);
1236}
1237
1238/*
1239 * Import the given pool into the system.  We set up the necessary spa_t and
1240 * then call spa_load() to do the dirty work.
1241 */
1242int
1243spa_import(const char *pool, nvlist_t *config, const char *altroot)
1244{
1245	spa_t *spa;
1246	int error;
1247	nvlist_t *nvroot;
1248	nvlist_t **spares;
1249	uint_t nspares;
1250
1251	if (!(spa_mode & FWRITE))
1252		return (EROFS);
1253
1254	/*
1255	 * If a pool with this name exists, return failure.
1256	 */
1257	mutex_enter(&spa_namespace_lock);
1258	if (spa_lookup(pool) != NULL) {
1259		mutex_exit(&spa_namespace_lock);
1260		return (EEXIST);
1261	}
1262
1263	/*
1264	 * Create and initialize the spa structure.
1265	 */
1266	spa = spa_add(pool, altroot);
1267	spa_activate(spa);
1268
1269	/*
1270	 * Pass off the heavy lifting to spa_load().
1271	 * Pass TRUE for mosconfig because the user-supplied config
1272	 * is actually the one to trust when doing an import.
1273	 */
1274	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
1275
1276	spa_config_enter(spa, RW_WRITER, FTAG);
1277	/*
1278	 * Toss any existing sparelist, as it doesn't have any validity anymore,
1279	 * and conflicts with spa_has_spare().
1280	 */
1281	if (spa->spa_sparelist) {
1282		nvlist_free(spa->spa_sparelist);
1283		spa->spa_sparelist = NULL;
1284		spa_load_spares(spa);
1285	}
1286
1287	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1288	    &nvroot) == 0);
1289	if (error == 0)
1290		error = spa_validate_spares(spa, nvroot, -1ULL,
1291		    VDEV_ALLOC_SPARE);
1292	spa_config_exit(spa, FTAG);
1293
1294	if (error != 0) {
1295		spa_unload(spa);
1296		spa_deactivate(spa);
1297		spa_remove(spa);
1298		mutex_exit(&spa_namespace_lock);
1299		return (error);
1300	}
1301
1302	/*
1303	 * Override any spares as specified by the user, as these may have
1304	 * correct device names/devids, etc.
1305	 */
1306	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1307	    &spares, &nspares) == 0) {
1308		if (spa->spa_sparelist)
1309			VERIFY(nvlist_remove(spa->spa_sparelist,
1310			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
1311		else
1312			VERIFY(nvlist_alloc(&spa->spa_sparelist,
1313			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1314		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1315		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1316		spa_config_enter(spa, RW_WRITER, FTAG);
1317		spa_load_spares(spa);
1318		spa_config_exit(spa, FTAG);
1319		spa->spa_sync_spares = B_TRUE;
1320	}
1321
1322	/*
1323	 * Update the config cache to include the newly-imported pool.
1324	 */
1325	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
1326
1327	mutex_exit(&spa_namespace_lock);
1328
1329	/*
1330	 * Resilver anything that's out of date.
1331	 */
1332	if (spa_mode & FWRITE)
1333		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1334
1335	return (0);
1336}
1337
1338/*
1339 * This (illegal) pool name is used when temporarily importing a spa_t in order
1340 * to get the vdev stats associated with the imported devices.
1341 */
1342#define	TRYIMPORT_NAME	"$import"
1343
1344nvlist_t *
1345spa_tryimport(nvlist_t *tryconfig)
1346{
1347	nvlist_t *config = NULL;
1348	char *poolname;
1349	spa_t *spa;
1350	uint64_t state;
1351
1352	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
1353		return (NULL);
1354
1355	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
1356		return (NULL);
1357
1358	/*
1359	 * Create and initialize the spa structure.
1360	 */
1361	mutex_enter(&spa_namespace_lock);
1362	spa = spa_add(TRYIMPORT_NAME, NULL);
1363	spa_activate(spa);
1364
1365	/*
1366	 * Pass off the heavy lifting to spa_load().
1367	 * Pass TRUE for mosconfig because the user-supplied config
1368	 * is actually the one to trust when doing an import.
1369	 */
1370	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
1371
1372	/*
1373	 * If 'tryconfig' was at least parsable, return the current config.
1374	 */
1375	if (spa->spa_root_vdev != NULL) {
1376		spa_config_enter(spa, RW_READER, FTAG);
1377		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1378		spa_config_exit(spa, FTAG);
1379		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
1380		    poolname) == 0);
1381		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
1382		    state) == 0);
1383		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
1384		    spa->spa_uberblock.ub_timestamp) == 0);
1385
1386		/*
1387		 * Add the list of hot spares.
1388		 */
1389		spa_add_spares(spa, config);
1390	}
1391
1392	spa_unload(spa);
1393	spa_deactivate(spa);
1394	spa_remove(spa);
1395	mutex_exit(&spa_namespace_lock);
1396
1397	return (config);
1398}
1399
1400/*
1401 * Pool export/destroy
1402 *
1403 * The act of destroying or exporting a pool is very simple.  We make sure there
1404 * is no more pending I/O and any references to the pool are gone.  Then, we
1405 * update the pool state and sync all the labels to disk, removing the
1406 * configuration from the cache afterwards.
1407 */
1408static int
1409spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
1410{
1411	spa_t *spa;
1412
1413	if (oldconfig)
1414		*oldconfig = NULL;
1415
1416	if (!(spa_mode & FWRITE))
1417		return (EROFS);
1418
1419	mutex_enter(&spa_namespace_lock);
1420	if ((spa = spa_lookup(pool)) == NULL) {
1421		mutex_exit(&spa_namespace_lock);
1422		return (ENOENT);
1423	}
1424
1425	/*
1426	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
1427	 * reacquire the namespace lock, and see if we can export.
1428	 */
1429	spa_open_ref(spa, FTAG);
1430	mutex_exit(&spa_namespace_lock);
1431	spa_async_suspend(spa);
1432	mutex_enter(&spa_namespace_lock);
1433	spa_close(spa, FTAG);
1434
1435	/*
1436	 * The pool will be in core if it's openable,
1437	 * in which case we can modify its state.
1438	 */
1439	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
1440		/*
1441		 * Objsets may be open only because they're dirty, so we
1442		 * have to force it to sync before checking spa_refcnt.
1443		 * have to force the pool to sync before checking spa_refcnt.
1444		spa_scrub_suspend(spa);
1445		txg_wait_synced(spa->spa_dsl_pool, 0);
1446
1447		/*
1448		 * A pool cannot be exported or destroyed if there are active
1449		 * references.  If we are resetting a pool, allow references by
1450		 * fault injection handlers.
1451		 */
1452		if (!spa_refcount_zero(spa) ||
1453		    (spa->spa_inject_ref != 0 &&
1454		    new_state != POOL_STATE_UNINITIALIZED)) {
1455			spa_scrub_resume(spa);
1456			spa_async_resume(spa);
1457			mutex_exit(&spa_namespace_lock);
1458			return (EBUSY);
1459		}
1460
1461		spa_scrub_resume(spa);
1462		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
1463
1464		/*
1465		 * We want this to be reflected on every label,
1466		 * so mark them all dirty.  spa_unload() will do the
1467		 * final sync that pushes these changes out.
1468		 */
1469		if (new_state != POOL_STATE_UNINITIALIZED) {
1470			spa_config_enter(spa, RW_WRITER, FTAG);
1471			spa->spa_state = new_state;
1472			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
1473			vdev_config_dirty(spa->spa_root_vdev);
1474			spa_config_exit(spa, FTAG);
1475		}
1476	}
1477
1478	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
1479		spa_unload(spa);
1480		spa_deactivate(spa);
1481	}
1482
1483	if (oldconfig && spa->spa_config)
1484		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
1485
1486	if (new_state != POOL_STATE_UNINITIALIZED) {
1487		spa_remove(spa);
1488		spa_config_sync();
1489	}
1490	mutex_exit(&spa_namespace_lock);
1491
1492	return (0);
1493}
1494
1495/*
1496 * Destroy a storage pool.
1497 */
1498int
1499spa_destroy(char *pool)
1500{
1501	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
1502}
1503
1504/*
1505 * Export a storage pool.
1506 */
1507int
1508spa_export(char *pool, nvlist_t **oldconfig)
1509{
1510	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
1511}
1512
1513/*
1514 * Similar to spa_export(), this unloads the spa_t without actually removing it
1515 * from the namespace in any way.
1516 */
1517int
1518spa_reset(char *pool)
1519{
1520	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
1521}
1522
1523
1524/*
1525 * ==========================================================================
1526 * Device manipulation
1527 * ==========================================================================
1528 */
1529
1530/*
1531 * Add capacity to a storage pool.
1532 */
1533int
1534spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
1535{
1536	uint64_t txg;
1537	int c, error;
1538	vdev_t *rvd = spa->spa_root_vdev;
1539	vdev_t *vd, *tvd;
1540	nvlist_t **spares;
1541	uint_t i, nspares;
1542
1543	txg = spa_vdev_enter(spa);
1544
1545	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
1546	    VDEV_ALLOC_ADD)) != 0)
1547		return (spa_vdev_exit(spa, NULL, txg, error));
1548
1549	spa->spa_pending_vdev = vd;
1550
1551	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1552	    &spares, &nspares) != 0)
1553		nspares = 0;
1554
1555	if (vd->vdev_children == 0 && nspares == 0) {
1556		spa->spa_pending_vdev = NULL;
1557		return (spa_vdev_exit(spa, vd, txg, EINVAL));
1558	}
1559
1560	if (vd->vdev_children != 0) {
1561		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
1562			spa->spa_pending_vdev = NULL;
1563			return (spa_vdev_exit(spa, vd, txg, error));
1564		}
1565	}
1566
1567	/*
1568	 * We must validate the spares after checking the children.  Otherwise,
1569	 * vdev_inuse() will blindly overwrite the spare.
1570	 */
1571	if ((error = spa_validate_spares(spa, nvroot, txg,
1572	    VDEV_ALLOC_ADD)) != 0) {
1573		spa->spa_pending_vdev = NULL;
1574		return (spa_vdev_exit(spa, vd, txg, error));
1575	}
1576
1577	spa->spa_pending_vdev = NULL;
1578
1579	/*
1580	 * Transfer each new top-level vdev from vd to rvd.
1581	 */
1582	for (c = 0; c < vd->vdev_children; c++) {
1583		tvd = vd->vdev_child[c];
1584		vdev_remove_child(vd, tvd);
1585		tvd->vdev_id = rvd->vdev_children;
1586		vdev_add_child(rvd, tvd);
1587		vdev_config_dirty(tvd);
1588	}
1589
1590	if (nspares != 0) {
1591		if (spa->spa_sparelist != NULL) {
1592			nvlist_t **oldspares;
1593			uint_t oldnspares;
1594			nvlist_t **newspares;
1595
1596			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
1597			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
1598
1599			newspares = kmem_alloc(sizeof (void *) *
1600			    (nspares + oldnspares), KM_SLEEP);
1601			for (i = 0; i < oldnspares; i++)
1602				VERIFY(nvlist_dup(oldspares[i],
1603				    &newspares[i], KM_SLEEP) == 0);
1604			for (i = 0; i < nspares; i++)
1605				VERIFY(nvlist_dup(spares[i],
1606				    &newspares[i + oldnspares],
1607				    KM_SLEEP) == 0);
1608
1609			VERIFY(nvlist_remove(spa->spa_sparelist,
1610			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
1611
1612			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1613			    ZPOOL_CONFIG_SPARES, newspares,
1614			    nspares + oldnspares) == 0);
1615			for (i = 0; i < oldnspares + nspares; i++)
1616				nvlist_free(newspares[i]);
1617			kmem_free(newspares, (oldnspares + nspares) *
1618			    sizeof (void *));
1619		} else {
1620			VERIFY(nvlist_alloc(&spa->spa_sparelist,
1621			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1622			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1623			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1624		}
1625
1626		spa_load_spares(spa);
1627		spa->spa_sync_spares = B_TRUE;
1628	}
1629
1630	/*
1631	 * We have to be careful when adding new vdevs to an existing pool.
1632	 * If other threads start allocating from these vdevs before we
1633	 * sync the config cache, and we lose power, then upon reboot we may
1634	 * fail to open the pool because there are DVAs that the config cache
1635	 * can't translate.  Therefore, we first add the vdevs without
1636	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
1637	 * and then let spa_config_update() initialize the new metaslabs.
1638	 *
1639	 * spa_load() checks for added-but-not-initialized vdevs, so that
1640	 * if we lose power at any point in this sequence, the remaining
1641	 * steps will be completed the next time we load the pool.
1642	 */
1643	(void) spa_vdev_exit(spa, vd, txg, 0);
1644
1645	mutex_enter(&spa_namespace_lock);
1646	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
1647	mutex_exit(&spa_namespace_lock);
1648
1649	return (0);
1650}
1651
1652/*
1653 * Attach a device to a mirror.  The arguments are the path to any device
1654 * in the mirror, and the nvroot for the new device.  If the path specifies
1655 * a device that is not mirrored, we automatically insert the mirror vdev.
1656 *
1657 * If 'replacing' is specified, the new device is intended to replace the
1658 * existing device; in this case the two devices are made into their own
1659 * mirror using the 'replacing' vdev, which is functionally identical to
1660 * the mirror vdev (it actually reuses all the same ops) but has a few
1661 * extra rules: you can't attach to it after it's been created, and upon
1662 * completion of resilvering, the first disk (the one being replaced)
1663 * is automatically detached.
1664 */
1665int
1666spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
1667{
1668	uint64_t txg, open_txg;
1669	int error;
1670	vdev_t *rvd = spa->spa_root_vdev;
1671	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
1672	vdev_ops_t *pvops;
1673
1674	txg = spa_vdev_enter(spa);
1675
1676	oldvd = vdev_lookup_by_guid(rvd, guid);
1677
1678	if (oldvd == NULL)
1679		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1680
1681	if (!oldvd->vdev_ops->vdev_op_leaf)
1682		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1683
1684	pvd = oldvd->vdev_parent;
1685
1686	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
1687	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
1688		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1689
1690	newvd = newrootvd->vdev_child[0];
1691
1692	if (!newvd->vdev_ops->vdev_op_leaf)
1693		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1694
1695	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
1696		return (spa_vdev_exit(spa, newrootvd, txg, error));
1697
1698	if (!replacing) {
1699		/*
1700		 * For attach, the only allowable parent is a mirror or the root
1701		 * vdev.
1702		 */
1703		if (pvd->vdev_ops != &vdev_mirror_ops &&
1704		    pvd->vdev_ops != &vdev_root_ops)
1705			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1706
1707		pvops = &vdev_mirror_ops;
1708	} else {
1709		/*
1710		 * Active hot spares can only be replaced by inactive hot
1711		 * spares.
1712		 */
1713		if (pvd->vdev_ops == &vdev_spare_ops &&
1714		    pvd->vdev_child[1] == oldvd &&
1715		    !spa_has_spare(spa, newvd->vdev_guid))
1716			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1717
1718		/*
1719		 * If the source is a hot spare, and the parent isn't already a
1720		 * spare, then we want to create a new hot spare.  Otherwise, we
1721		 * want to create a replacing vdev.  The user is not allowed to
1722		 * attach to a spared vdev child unless the 'isspare' state is
1723		 * the same (spare replaces spare, non-spare replaces
1724		 * non-spare).
1725		 */
1726		if (pvd->vdev_ops == &vdev_replacing_ops)
1727			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1728		else if (pvd->vdev_ops == &vdev_spare_ops &&
1729		    newvd->vdev_isspare != oldvd->vdev_isspare)
1730			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1731		else if (pvd->vdev_ops != &vdev_spare_ops &&
1732		    newvd->vdev_isspare)
1733			pvops = &vdev_spare_ops;
1734		else
1735			pvops = &vdev_replacing_ops;
1736	}
1737
1738	/*
1739	 * Compare the new device size with the replaceable/attachable
1740	 * device size.
1741	 */
1742	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
1743		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
1744
1745	/*
1746	 * The new device cannot have a higher alignment requirement
1747	 * than the top-level vdev.
1748	 */
1749	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
1750		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
1751
1752	/*
1753	 * If this is an in-place replacement, update oldvd's path and devid
1754	 * to make it distinguishable from newvd, and unopenable from now on.
1755	 */
1756	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
1757		spa_strfree(oldvd->vdev_path);
1758		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
1759		    KM_SLEEP);
1760		(void) sprintf(oldvd->vdev_path, "%s/%s",
1761		    newvd->vdev_path, "old");
1762		if (oldvd->vdev_devid != NULL) {
1763			spa_strfree(oldvd->vdev_devid);
1764			oldvd->vdev_devid = NULL;
1765		}
1766	}
1767
1768	/*
1769	 * If the parent is not a mirror, or if we're replacing, insert the new
1770	 * mirror/replacing/spare vdev above oldvd.
1771	 */
1772	if (pvd->vdev_ops != pvops)
1773		pvd = vdev_add_parent(oldvd, pvops);
1774
1775	ASSERT(pvd->vdev_top->vdev_parent == rvd);
1776	ASSERT(pvd->vdev_ops == pvops);
1777	ASSERT(oldvd->vdev_parent == pvd);
1778
1779	/*
1780	 * Extract the new device from its root and add it to pvd.
1781	 */
1782	vdev_remove_child(newrootvd, newvd);
1783	newvd->vdev_id = pvd->vdev_children;
1784	vdev_add_child(pvd, newvd);
1785
1786	/*
1787	 * If newvd is smaller than oldvd, but larger than its rsize,
1788	 * the addition of newvd may have decreased our parent's asize.
1789	 */
1790	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
1791
1792	tvd = newvd->vdev_top;
1793	ASSERT(pvd->vdev_top == tvd);
1794	ASSERT(tvd->vdev_parent == rvd);
1795
1796	vdev_config_dirty(tvd);
1797
1798	/*
1799	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
1800	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
1801	 */
1802	open_txg = txg + TXG_CONCURRENT_STATES - 1;
1803
1804	mutex_enter(&newvd->vdev_dtl_lock);
1805	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
1806	    open_txg - TXG_INITIAL + 1);
1807	mutex_exit(&newvd->vdev_dtl_lock);
1808
1809	if (newvd->vdev_isspare)
1810		spa_spare_activate(newvd);
1811
1812	/*
1813	 * Mark newvd's DTL dirty in this txg.
1814	 */
1815	vdev_dirty(tvd, VDD_DTL, newvd, txg);
1816
1817	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
1818
1819	/*
1820	 * Kick off a resilver to update newvd.
1821	 */
1822	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1823
1824	return (0);
1825}
1826
1827/*
1828 * Detach a device from a mirror or replacing vdev.
1829 * If 'replace_done' is specified, only detach if the parent
1830 * is a replacing vdev.
1831 */
1832int
1833spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
1834{
1835	uint64_t txg;
1836	int c, t, error;
1837	vdev_t *rvd = spa->spa_root_vdev;
1838	vdev_t *vd, *pvd, *cvd, *tvd;
1839	boolean_t unspare = B_FALSE;
1840	uint64_t unspare_guid;
1841
1842	txg = spa_vdev_enter(spa);
1843
1844	vd = vdev_lookup_by_guid(rvd, guid);
1845
1846	if (vd == NULL)
1847		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1848
1849	if (!vd->vdev_ops->vdev_op_leaf)
1850		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1851
1852	pvd = vd->vdev_parent;
1853
1854	/*
1855	 * If replace_done is specified, only remove this device if it's
1856	 * the first child of a replacing vdev.  For the 'spare' vdev, either
1857	 * disk can be removed.
1858	 */
1859	if (replace_done) {
1860		if (pvd->vdev_ops == &vdev_replacing_ops) {
1861			if (vd->vdev_id != 0)
1862				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1863		} else if (pvd->vdev_ops != &vdev_spare_ops) {
1864			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1865		}
1866	}
1867
1868	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
1869	    spa_version(spa) >= ZFS_VERSION_SPARES);
1870
1871	/*
1872	 * Only mirror, replacing, and spare vdevs support detach.
1873	 */
1874	if (pvd->vdev_ops != &vdev_replacing_ops &&
1875	    pvd->vdev_ops != &vdev_mirror_ops &&
1876	    pvd->vdev_ops != &vdev_spare_ops)
1877		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1878
1879	/*
1880	 * If there's only one replica, you can't detach it.
1881	 */
1882	if (pvd->vdev_children <= 1)
1883		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1884
1885	/*
1886	 * If all siblings have non-empty DTLs, this device may have the only
1887	 * valid copy of the data, which means we cannot safely detach it.
1888	 *
1889	 * XXX -- as in the vdev_offline() case, we really want a more
1890	 * precise DTL check.
1891	 */
1892	for (c = 0; c < pvd->vdev_children; c++) {
1893		uint64_t dirty;
1894
1895		cvd = pvd->vdev_child[c];
1896		if (cvd == vd)
1897			continue;
1898		if (vdev_is_dead(cvd))
1899			continue;
1900		mutex_enter(&cvd->vdev_dtl_lock);
1901		dirty = cvd->vdev_dtl_map.sm_space |
1902		    cvd->vdev_dtl_scrub.sm_space;
1903		mutex_exit(&cvd->vdev_dtl_lock);
1904		if (!dirty)
1905			break;
1906	}
1907
1908	/*
1909	 * If we are a replacing or spare vdev, then we can always detach the
1910	 * latter child, as that is how one cancels the operation.
1911	 */
1912	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
1913	    c == pvd->vdev_children)
1914		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1915
1916	/*
1917	 * If we are detaching the original disk from a spare, then it implies
1918	 * that the spare should become a real disk, and be removed from the
1919	 * active spare list for the pool.
1920	 */
1921	if (pvd->vdev_ops == &vdev_spare_ops &&
1922	    vd->vdev_id == 0)
1923		unspare = B_TRUE;
1924
1925	/*
1926	 * Erase the disk labels so the disk can be used for other things.
1927	 * This must be done after all other error cases are handled,
1928	 * but before we disembowel vd (so we can still do I/O to it).
1929	 * But if we can't do it, don't treat the error as fatal --
1930	 * it may be that the unwritability of the disk is the reason
1931	 * it's being detached!
1932	 */
1933	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
1934
1935	/*
1936	 * Remove vd from its parent and compact the parent's children.
1937	 */
1938	vdev_remove_child(pvd, vd);
1939	vdev_compact_children(pvd);
1940
1941	/*
1942	 * Remember one of the remaining children so we can get tvd below.
1943	 */
1944	cvd = pvd->vdev_child[0];
1945
1946	/*
1947	 * If we need to remove the remaining child from the list of hot spares,
1948	 * do it now, marking the vdev as no longer a spare in the process.  We
1949	 * must do this before vdev_remove_parent(), because that can change the
1950	 * GUID if it creates a new toplevel GUID.
1951	 */
1952	if (unspare) {
1953		ASSERT(cvd->vdev_isspare);
1954		spa_spare_remove(cvd);
1955		unspare_guid = cvd->vdev_guid;
1956	}
1957
1958	/*
1959	 * If the parent mirror/replacing vdev only has one child,
1960	 * the parent is no longer needed.  Remove it from the tree.
1961	 */
1962	if (pvd->vdev_children == 1)
1963		vdev_remove_parent(cvd);
1964
1965	/*
1966	 * We don't set tvd until now because the parent we just removed
1967	 * may have been the previous top-level vdev.
1968	 */
1969	tvd = cvd->vdev_top;
1970	ASSERT(tvd->vdev_parent == rvd);
1971
1972	/*
1973	 * Reevaluate the parent vdev state.
1974	 */
1975	vdev_propagate_state(cvd->vdev_parent);
1976
1977	/*
1978	 * If the device we just detached was smaller than the others, it may be
1979	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
1980	 * can't fail because the existing metaslabs are already in core, so
1981	 * there's nothing to read from disk.
1982	 */
1983	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
1984
1985	vdev_config_dirty(tvd);
1986
1987	/*
1988	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
1989	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
1990	 * But first make sure we're not on any *other* txg's DTL list, to
1991	 * prevent vd from being accessed after it's freed.
1992	 */
1993	for (t = 0; t < TXG_SIZE; t++)
1994		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
1995	vd->vdev_detached = B_TRUE;
1996	vdev_dirty(tvd, VDD_DTL, vd, txg);
1997
1998	error = spa_vdev_exit(spa, vd, txg, 0);
1999
2000	/*
2001	 * If this was the removal of the original device in a hot spare vdev,
2002	 * then we want to go through and remove the device from the hot spare
2003	 * list of every other pool.
2004	 */
2005	if (unspare) {
2006		spa = NULL;
2007		mutex_enter(&spa_namespace_lock);
2008		while ((spa = spa_next(spa)) != NULL) {
2009			if (spa->spa_state != POOL_STATE_ACTIVE)
2010				continue;
2011
2012			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
2013		}
2014		mutex_exit(&spa_namespace_lock);
2015	}
2016
2017	return (error);
2018}
2019
2020/*
2021 * Remove a device from the pool.  Currently, this supports removing only hot
2022 * spares.
2023 */
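/*
 * The 'unspare' flag indicates that the spare should be removed even though
 * it is still attached in the pool's vdev tree; spa_vdev_detach() above uses
 * it to retire a promoted hot spare from every active pool.  Without
 * 'unspare', removing a spare that is still in use fails with EBUSY.
 */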
2024int
2025spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
2026{
2027	vdev_t *vd;
2028	nvlist_t **spares, *nv, **newspares;
2029	uint_t i, j, nspares;
2030	int ret = 0;
2031
2032	spa_config_enter(spa, RW_WRITER, FTAG);
2033
2034	vd = spa_lookup_by_guid(spa, guid);
2035
2036	nv = NULL;
2037	if (spa->spa_spares != NULL &&
2038	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2039	    &spares, &nspares) == 0) {
2040		for (i = 0; i < nspares; i++) {
2041			uint64_t theguid;
2042
2043			VERIFY(nvlist_lookup_uint64(spares[i],
2044			    ZPOOL_CONFIG_GUID, &theguid) == 0);
2045			if (theguid == guid) {
2046				nv = spares[i];
2047				break;
2048			}
2049		}
2050	}
2051
2052	/*
2053	 * We only support removing a hot spare, and only if it's not currently
2054	 * in use in this pool.
2055	 */
2056	if (nv == NULL && vd == NULL) {
2057		ret = ENOENT;
2058		goto out;
2059	}
2060
2061	if (nv == NULL && vd != NULL) {
2062		ret = ENOTSUP;
2063		goto out;
2064	}
2065
2066	if (!unspare && nv != NULL && vd != NULL) {
2067		ret = EBUSY;
2068		goto out;
2069	}
2070
2071	if (nspares == 1) {
2072		newspares = NULL;
2073	} else {
2074		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
2075		    KM_SLEEP);
2076		for (i = 0, j = 0; i < nspares; i++) {
2077			if (spares[i] != nv)
2078				VERIFY(nvlist_dup(spares[i],
2079				    &newspares[j++], KM_SLEEP) == 0);
2080		}
2081	}
2082
2083	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2084	    DATA_TYPE_NVLIST_ARRAY) == 0);
2085	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2086	    newspares, nspares - 1) == 0);
2087	for (i = 0; i < nspares - 1; i++)
2088		nvlist_free(newspares[i]);
2089	kmem_free(newspares, (nspares - 1) * sizeof (void *));
2090	spa_load_spares(spa);
2091	spa->spa_sync_spares = B_TRUE;
2092
2093out:
2094	spa_config_exit(spa, FTAG);
2095
2096	return (ret);
2097}
2098
2099/*
2100 * Find any device that's done replacing, so we can detach it.
2101 */
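/*
 * "Done replacing" means the new child (child 1) of a replacing vdev has
 * empty persistent and scrub DTLs, i.e. it now holds a complete copy of the
 * data; the original child (child 0) is returned so the caller can detach it.
 */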
2102static vdev_t *
2103spa_vdev_replace_done_hunt(vdev_t *vd)
2104{
2105	vdev_t *newvd, *oldvd;
2106	int c;
2107
2108	for (c = 0; c < vd->vdev_children; c++) {
2109		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
2110		if (oldvd != NULL)
2111			return (oldvd);
2112	}
2113
2114	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
2115		oldvd = vd->vdev_child[0];
2116		newvd = vd->vdev_child[1];
2117
2118		mutex_enter(&newvd->vdev_dtl_lock);
2119		if (newvd->vdev_dtl_map.sm_space == 0 &&
2120		    newvd->vdev_dtl_scrub.sm_space == 0) {
2121			mutex_exit(&newvd->vdev_dtl_lock);
2122			return (oldvd);
2123		}
2124		mutex_exit(&newvd->vdev_dtl_lock);
2125	}
2126
2127	return (NULL);
2128}
2129
2130static void
2131spa_vdev_replace_done(spa_t *spa)
2132{
2133	vdev_t *vd;
2134	vdev_t *pvd;
2135	uint64_t guid;
2136	uint64_t pguid = 0;
2137
2138	spa_config_enter(spa, RW_READER, FTAG);
2139
2140	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
2141		guid = vd->vdev_guid;
2142		/*
2143		 * If we have just finished replacing a hot spared device, then
2144		 * we also need to detach the original hot spare, which is
2145		 * child 1 of the grandparent spare vdev.
2146		 */
2147		pvd = vd->vdev_parent;
2148		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2149		    pvd->vdev_id == 0) {
2150			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
2151			ASSERT(pvd->vdev_parent->vdev_children == 2);
2152			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
2153		}
2154		spa_config_exit(spa, FTAG);
2155		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
2156			return;
2157		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
2158			return;
2159		spa_config_enter(spa, RW_READER, FTAG);
2160	}
2161
2162	spa_config_exit(spa, FTAG);
2163}
2164
2165/*
2166 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
2167 * on spa_vdev_enter/exit() to synchronize the labels and cache.
2168 */
2169int
2170spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
2171{
2172	vdev_t *rvd, *vd;
2173	uint64_t txg;
2174
2175	rvd = spa->spa_root_vdev;
2176
2177	txg = spa_vdev_enter(spa);
2178
2179	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
2180		/*
2181		 * Determine if this is a reference to a hot spare.  In that
2182		 * case, update the path as stored in the spare list.
2183		 */
2184		nvlist_t **spares;
2185		uint_t i, nspares;
2186		if (spa->spa_sparelist != NULL) {
2187			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
2188			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2189			for (i = 0; i < nspares; i++) {
2190				uint64_t theguid;
2191				VERIFY(nvlist_lookup_uint64(spares[i],
2192				    ZPOOL_CONFIG_GUID, &theguid) == 0);
2193				if (theguid == guid)
2194					break;
2195			}
2196
2197			if (i == nspares)
2198				return (spa_vdev_exit(spa, NULL, txg, ENOENT));
2199
2200			VERIFY(nvlist_add_string(spares[i],
2201			    ZPOOL_CONFIG_PATH, newpath) == 0);
2202			spa_load_spares(spa);
2203			spa->spa_sync_spares = B_TRUE;
2204			return (spa_vdev_exit(spa, NULL, txg, 0));
2205		} else {
2206			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
2207		}
2208	}
2209
2210	if (!vd->vdev_ops->vdev_op_leaf)
2211		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2212
2213	spa_strfree(vd->vdev_path);
2214	vd->vdev_path = spa_strdup(newpath);
2215
2216	vdev_config_dirty(vd->vdev_top);
2217
2218	return (spa_vdev_exit(spa, NULL, txg, 0));
2219}
2220
2221/*
2222 * ==========================================================================
2223 * SPA Scrubbing
2224 * ==========================================================================
2225 */
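/*
 * Overview of the scrub/resilver machinery below:
 *
 *	spa_scrub()		computes the txg range to examine, sets up a
 *				traverse handle, and spawns spa_scrub_thread().
 *	spa_scrub_thread()	repeatedly calls traverse_more(); each block
 *				visited is handed to spa_scrub_cb().
 *	spa_scrub_cb()		decides whether the block must be read (always
 *				for a scrub, only when a DTL says so for a
 *				resilver) and calls spa_scrub_io_start().
 *	spa_scrub_io_start()	issues the read, throttled by
 *				spa_scrub_maxinflight.
 *	spa_scrub_io_done()	accounts for errors and wakes up waiters.
 */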
2226
2227static void
2228spa_scrub_io_done(zio_t *zio)
2229{
2230	spa_t *spa = zio->io_spa;
2231
2232	zio_data_buf_free(zio->io_data, zio->io_size);
2233
2234	mutex_enter(&spa->spa_scrub_lock);
2235	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2236		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
2237		spa->spa_scrub_errors++;
2238		mutex_enter(&vd->vdev_stat_lock);
2239		vd->vdev_stat.vs_scrub_errors++;
2240		mutex_exit(&vd->vdev_stat_lock);
2241	}
2242
2243	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
2244		cv_broadcast(&spa->spa_scrub_io_cv);
2245
2246	ASSERT(spa->spa_scrub_inflight >= 0);
2247
2248	mutex_exit(&spa->spa_scrub_lock);
2249}
2250
2251static void
2252spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
2253    zbookmark_t *zb)
2254{
2255	size_t size = BP_GET_LSIZE(bp);
2256	void *data;
2257
2258	mutex_enter(&spa->spa_scrub_lock);
2259	/*
2260	 * Throttle scrub I/O so we don't give the vdevs too much work at once.
2261	 */
2262	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
2263		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2264	}
2265	spa->spa_scrub_inflight++;
2266	mutex_exit(&spa->spa_scrub_lock);
2267
2268	data = zio_data_buf_alloc(size);
2269
2270	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
2271		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
2272
2273	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
2274
2275	zio_nowait(zio_read(NULL, spa, bp, data, size,
2276	    spa_scrub_io_done, NULL, priority, flags, zb));
2277}
2278
2279/* ARGSUSED */
2280static int
2281spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
2282{
2283	blkptr_t *bp = &bc->bc_blkptr;
2284	vdev_t *vd = spa->spa_root_vdev;
2285	dva_t *dva = bp->blk_dva;
2286	int needs_resilver = B_FALSE;
2287	int d;
2288
2289	if (bc->bc_errno) {
2290		/*
2291		 * We can't scrub this block, but we can continue to scrub
2292		 * the rest of the pool.  Note the error and move along.
2293		 */
2294		mutex_enter(&spa->spa_scrub_lock);
2295		spa->spa_scrub_errors++;
2296		mutex_exit(&spa->spa_scrub_lock);
2297
2298		mutex_enter(&vd->vdev_stat_lock);
2299		vd->vdev_stat.vs_scrub_errors++;
2300		mutex_exit(&vd->vdev_stat_lock);
2301
2302		return (ERESTART);
2303	}
2304
2305	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
2306
2307	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
2308		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
2309
2310		ASSERT(vd != NULL);
2311
2312		/*
2313		 * Keep track of how much data we've examined so that
2314		 * zpool(1M) status can make useful progress reports.
2315		 */
2316		mutex_enter(&vd->vdev_stat_lock);
2317		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
2318		mutex_exit(&vd->vdev_stat_lock);
2319
2320		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
2321			if (DVA_GET_GANG(&dva[d])) {
2322				/*
2323				 * Gang members may be spread across multiple
2324				 * vdevs, so the best we can do is look at the
2325				 * pool-wide DTL.
2326				 * XXX -- it would be better to change our
2327				 * allocation policy to ensure that this can't
2328				 * happen.
2329				 */
2330				vd = spa->spa_root_vdev;
2331			}
2332			if (vdev_dtl_contains(&vd->vdev_dtl_map,
2333			    bp->blk_birth, 1))
2334				needs_resilver = B_TRUE;
2335		}
2336	}
2337
2338	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
2339		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
2340		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
2341	else if (needs_resilver)
2342		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
2343		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
2344
2345	return (0);
2346}
2347
2348static void
2349spa_scrub_thread(spa_t *spa)
2350{
2351	callb_cpr_t cprinfo;
2352	traverse_handle_t *th = spa->spa_scrub_th;
2353	vdev_t *rvd = spa->spa_root_vdev;
2354	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
2355	int error = 0;
2356	boolean_t complete;
2357
2358	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
2359
2360	/*
2361	 * If we're restarting due to a snapshot create/delete,
2362	 * wait for that to complete.
2363	 */
2364	txg_wait_synced(spa_get_dsl(spa), 0);
2365
2366	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
2367	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
2368	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
2369
2370	spa_config_enter(spa, RW_WRITER, FTAG);
2371	vdev_reopen(rvd);		/* purge all vdev caches */
2372	vdev_config_dirty(rvd);		/* rewrite all disk labels */
2373	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
2374	spa_config_exit(spa, FTAG);
2375
2376	mutex_enter(&spa->spa_scrub_lock);
2377	spa->spa_scrub_errors = 0;
2378	spa->spa_scrub_active = 1;
2379	ASSERT(spa->spa_scrub_inflight == 0);
2380
2381	while (!spa->spa_scrub_stop) {
2382		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2383		while (spa->spa_scrub_suspended) {
2384			spa->spa_scrub_active = 0;
2385			cv_broadcast(&spa->spa_scrub_cv);
2386			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2387			spa->spa_scrub_active = 1;
2388		}
2389		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
2390
2391		if (spa->spa_scrub_restart_txg != 0)
2392			break;
2393
2394		mutex_exit(&spa->spa_scrub_lock);
2395		error = traverse_more(th);
2396		mutex_enter(&spa->spa_scrub_lock);
2397		if (error != EAGAIN)
2398			break;
2399	}
2400
2401	while (spa->spa_scrub_inflight)
2402		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2403
2404	spa->spa_scrub_active = 0;
2405	cv_broadcast(&spa->spa_scrub_cv);
2406
2407	mutex_exit(&spa->spa_scrub_lock);
2408
2409	spa_config_enter(spa, RW_WRITER, FTAG);
2410
2411	mutex_enter(&spa->spa_scrub_lock);
2412
2413	/*
2414	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
2415	 * AND the spa config lock to synchronize with any config changes
2416	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
2417	 */
2418	if (spa->spa_scrub_restart_txg != 0)
2419		error = ERESTART;
2420
2421	if (spa->spa_scrub_stop)
2422		error = EINTR;
2423
2424	/*
2425	 * Even if there were uncorrectable errors, we consider the scrub
2426	 * completed.  The downside is that if there is a transient error during
2427	 * a resilver, we won't resilver the data properly to the target.  But
2428	 * if the damage is permanent (more likely) we will resilver forever,
2429	 * which isn't really acceptable.  Since there is enough information for
2430	 * the user to know what has failed and why, this seems like a more
2431	 * tractable approach.
2432	 */
2433	complete = (error == 0);
2434
2435	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
2436	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
2437	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
2438	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
2439
2440	mutex_exit(&spa->spa_scrub_lock);
2441
2442	/*
2443	 * If the scrub/resilver completed, update all DTLs to reflect this.
2444	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
2445	 */
2446	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
2447	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
2448	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
2449	spa_errlog_rotate(spa);
2450
2451	spa_config_exit(spa, FTAG);
2452
2453	mutex_enter(&spa->spa_scrub_lock);
2454
2455	/*
2456	 * We may have finished replacing a device.
2457	 * Let the async thread assess this and handle the detach.
2458	 */
2459	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
2460
2461	/*
2462	 * If we were told to restart, our final act is to start a new scrub.
2463	 */
2464	if (error == ERESTART)
2465		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
2466		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
2467
2468	spa->spa_scrub_type = POOL_SCRUB_NONE;
2469	spa->spa_scrub_active = 0;
2470	spa->spa_scrub_thread = NULL;
2471	cv_broadcast(&spa->spa_scrub_cv);
2472	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
2473	thread_exit();
2474}
2475
2476void
2477spa_scrub_suspend(spa_t *spa)
2478{
2479	mutex_enter(&spa->spa_scrub_lock);
2480	spa->spa_scrub_suspended++;
2481	while (spa->spa_scrub_active) {
2482		cv_broadcast(&spa->spa_scrub_cv);
2483		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2484	}
2485	while (spa->spa_scrub_inflight)
2486		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2487	mutex_exit(&spa->spa_scrub_lock);
2488}
2489
2490void
2491spa_scrub_resume(spa_t *spa)
2492{
2493	mutex_enter(&spa->spa_scrub_lock);
2494	ASSERT(spa->spa_scrub_suspended != 0);
2495	if (--spa->spa_scrub_suspended == 0)
2496		cv_broadcast(&spa->spa_scrub_cv);
2497	mutex_exit(&spa->spa_scrub_lock);
2498}
2499
2500void
2501spa_scrub_restart(spa_t *spa, uint64_t txg)
2502{
2503	/*
2504	 * Something happened (e.g. snapshot create/delete) that means
2505	 * we must restart any in-progress scrubs.  The scrub thread will
2506	 * notice spa_scrub_restart_txg and arrange to restart itself.
2507	 */
2508	mutex_enter(&spa->spa_scrub_lock);
2509	spa->spa_scrub_restart_txg = txg;
2510	mutex_exit(&spa->spa_scrub_lock);
2511}
2512
2513int
2514spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
2515{
2516	space_seg_t *ss;
2517	uint64_t mintxg, maxtxg;
2518	vdev_t *rvd = spa->spa_root_vdev;
2519
2520	if ((uint_t)type >= POOL_SCRUB_TYPES)
2521		return (ENOTSUP);
2522
2523	mutex_enter(&spa->spa_scrub_lock);
2524
2525	/*
2526	 * If there's a scrub or resilver already in progress, stop it.
2527	 */
2528	while (spa->spa_scrub_thread != NULL) {
2529		/*
2530		 * Don't stop a resilver unless forced.
2531		 */
2532		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
2533			mutex_exit(&spa->spa_scrub_lock);
2534			return (EBUSY);
2535		}
2536		spa->spa_scrub_stop = 1;
2537		cv_broadcast(&spa->spa_scrub_cv);
2538		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2539	}
2540
2541	/*
2542	 * Terminate the previous traverse.
2543	 */
2544	if (spa->spa_scrub_th != NULL) {
2545		traverse_fini(spa->spa_scrub_th);
2546		spa->spa_scrub_th = NULL;
2547	}
2548
2549	if (rvd == NULL) {
2550		ASSERT(spa->spa_scrub_stop == 0);
2551		ASSERT(spa->spa_scrub_type == type);
2552		ASSERT(spa->spa_scrub_restart_txg == 0);
2553		mutex_exit(&spa->spa_scrub_lock);
2554		return (0);
2555	}
2556
2557	mintxg = TXG_INITIAL - 1;
2558	maxtxg = spa_last_synced_txg(spa) + 1;
2559
2560	mutex_enter(&rvd->vdev_dtl_lock);
2561
2562	if (rvd->vdev_dtl_map.sm_space == 0) {
2563		/*
2564		 * The pool-wide DTL is empty.
2565		 * If this is a resilver, there's nothing to do except
2566		 * check whether any in-progress replacements have completed.
2567		 */
2568		if (type == POOL_SCRUB_RESILVER) {
2569			type = POOL_SCRUB_NONE;
2570			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
2571		}
2572	} else {
2573		/*
2574		 * The pool-wide DTL is non-empty.
2575		 * If this is a normal scrub, upgrade to a resilver instead.
2576		 */
2577		if (type == POOL_SCRUB_EVERYTHING)
2578			type = POOL_SCRUB_RESILVER;
2579	}
2580
2581	if (type == POOL_SCRUB_RESILVER) {
2582		/*
2583		 * Determine the resilvering boundaries.
2584		 *
2585		 * Note: (mintxg, maxtxg) is an open interval,
2586		 * i.e. mintxg and maxtxg themselves are not included.
2587		 *
2588		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
2589		 * so we don't claim to resilver a txg that's still changing.
2590		 */
2591		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
2592		mintxg = ss->ss_start - 1;
2593		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
2594		maxtxg = MIN(ss->ss_end, maxtxg);
2595	}
2596
2597	mutex_exit(&rvd->vdev_dtl_lock);
2598
2599	spa->spa_scrub_stop = 0;
2600	spa->spa_scrub_type = type;
2601	spa->spa_scrub_restart_txg = 0;
2602
2603	if (type != POOL_SCRUB_NONE) {
2604		spa->spa_scrub_mintxg = mintxg;
2605		spa->spa_scrub_maxtxg = maxtxg;
2606		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
2607		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
2608		    ZIO_FLAG_CANFAIL);
2609		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
2610		spa->spa_scrub_thread = thread_create(NULL, 0,
2611		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
2612	}
2613
2614	mutex_exit(&spa->spa_scrub_lock);
2615
2616	return (0);
2617}
2618
2619/*
2620 * ==========================================================================
2621 * SPA async task processing
2622 * ==========================================================================
2623 */
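/*
 * Async tasks are requested by setting bits in spa_async_tasks via
 * spa_async_request().  spa_async_dispatch(), called at the end of
 * spa_sync(), creates spa_async_thread() to process the pending tasks
 * whenever work is pending, async processing is not suspended, and the
 * system is not running read-only.
 */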
2624
2625static void
2626spa_async_reopen(spa_t *spa)
2627{
2628	vdev_t *rvd = spa->spa_root_vdev;
2629	vdev_t *tvd;
2630	int c;
2631
2632	spa_config_enter(spa, RW_WRITER, FTAG);
2633
2634	for (c = 0; c < rvd->vdev_children; c++) {
2635		tvd = rvd->vdev_child[c];
2636		if (tvd->vdev_reopen_wanted) {
2637			tvd->vdev_reopen_wanted = 0;
2638			vdev_reopen(tvd);
2639		}
2640	}
2641
2642	spa_config_exit(spa, FTAG);
2643}
2644
2645static void
2646spa_async_thread(spa_t *spa)
2647{
2648	int tasks;
2649
2650	ASSERT(spa->spa_sync_on);
2651
2652	mutex_enter(&spa->spa_async_lock);
2653	tasks = spa->spa_async_tasks;
2654	spa->spa_async_tasks = 0;
2655	mutex_exit(&spa->spa_async_lock);
2656
2657	/*
2658	 * See if the config needs to be updated.
2659	 */
2660	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
2661		mutex_enter(&spa_namespace_lock);
2662		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2663		mutex_exit(&spa_namespace_lock);
2664	}
2665
2666	/*
2667	 * See if any devices need to be reopened.
2668	 */
2669	if (tasks & SPA_ASYNC_REOPEN)
2670		spa_async_reopen(spa);
2671
2672	/*
2673	 * If any devices are done replacing, detach them.
2674	 */
2675	if (tasks & SPA_ASYNC_REPLACE_DONE)
2676		spa_vdev_replace_done(spa);
2677
2678	/*
2679	 * Kick off a scrub.
2680	 */
2681	if (tasks & SPA_ASYNC_SCRUB)
2682		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
2683
2684	/*
2685	 * Kick off a resilver.
2686	 */
2687	if (tasks & SPA_ASYNC_RESILVER)
2688		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2689
2690	/*
2691	 * Let the world know that we're done.
2692	 */
2693	mutex_enter(&spa->spa_async_lock);
2694	spa->spa_async_thread = NULL;
2695	cv_broadcast(&spa->spa_async_cv);
2696	mutex_exit(&spa->spa_async_lock);
2697	thread_exit();
2698}
2699
2700void
2701spa_async_suspend(spa_t *spa)
2702{
2703	mutex_enter(&spa->spa_async_lock);
2704	spa->spa_async_suspended++;
2705	while (spa->spa_async_thread != NULL)
2706		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
2707	mutex_exit(&spa->spa_async_lock);
2708}
2709
2710void
2711spa_async_resume(spa_t *spa)
2712{
2713	mutex_enter(&spa->spa_async_lock);
2714	ASSERT(spa->spa_async_suspended != 0);
2715	spa->spa_async_suspended--;
2716	mutex_exit(&spa->spa_async_lock);
2717}
2718
2719static void
2720spa_async_dispatch(spa_t *spa)
2721{
2722	mutex_enter(&spa->spa_async_lock);
2723	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
2724	    spa->spa_async_thread == NULL &&
2725	    rootdir != NULL && !vn_is_readonly(rootdir))
2726		spa->spa_async_thread = thread_create(NULL, 0,
2727		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
2728	mutex_exit(&spa->spa_async_lock);
2729}
2730
2731void
2732spa_async_request(spa_t *spa, int task)
2733{
2734	mutex_enter(&spa->spa_async_lock);
2735	spa->spa_async_tasks |= task;
2736	mutex_exit(&spa->spa_async_lock);
2737}
2738
2739/*
2740 * ==========================================================================
2741 * SPA syncing routines
2742 * ==========================================================================
2743 */
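/*
 * spa_sync() below is the entry point: it is called once per transaction
 * group to push all dirty pool state (datasets, vdev configs, the spare
 * list, error logs) to disk, iterating until nothing new is dirtied, and
 * then commits the group by rewriting the vdev labels and uberblock via
 * vdev_config_sync().
 */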
2744
2745static void
2746spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
2747{
2748	bplist_t *bpl = &spa->spa_sync_bplist;
2749	dmu_tx_t *tx;
2750	blkptr_t blk;
2751	uint64_t itor = 0;
2752	zio_t *zio;
2753	int error;
2754	uint8_t c = 1;
2755
2756	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
2757
2758	while (bplist_iterate(bpl, &itor, &blk) == 0)
2759		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
2760
2761	error = zio_wait(zio);
2762	ASSERT3U(error, ==, 0);
2763
2764	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2765	bplist_vacate(bpl, tx);
2766
2767	/*
2768	 * Pre-dirty the first block so we sync to convergence faster.
2769	 * (Usually only the first block is needed.)
2770	 */
2771	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
2772	dmu_tx_commit(tx);
2773}
2774
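/*
 * Pack the given nvlist and write it to packed-nvlist object 'obj' in the
 * MOS, recording the packed size in the object's bonus buffer so readers
 * know how many bytes to unpack.
 */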
2775static void
2776spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
2777{
2778	char *packed = NULL;
2779	size_t nvsize = 0;
2780	dmu_buf_t *db;
2781
2782	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
2783
2784	packed = kmem_alloc(nvsize, KM_SLEEP);
2785
2786	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
2787	    KM_SLEEP) == 0);
2788
2789	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
2790
2791	kmem_free(packed, nvsize);
2792
2793	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
2794	dmu_buf_will_dirty(db, tx);
2795	*(uint64_t *)db->db_data = nvsize;
2796	dmu_buf_rele(db, FTAG);
2797}
2798
2799static void
2800spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
2801{
2802	nvlist_t *nvroot;
2803	nvlist_t **spares;
2804	int i;
2805
2806	if (!spa->spa_sync_spares)
2807		return;
2808
2809	/*
2810	 * Update the MOS nvlist describing the list of available spares.
2811	 * spa_validate_spares() will have already made sure this nvlist is
2812	 * valid and the vdevs are labelled appropriately.
2813	 */
2814	if (spa->spa_spares_object == 0) {
2815		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
2816		    DMU_OT_PACKED_NVLIST, 1 << 14,
2817		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
2818		VERIFY(zap_update(spa->spa_meta_objset,
2819		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
2820		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
2821	}
2822
2823	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2824	if (spa->spa_nspares == 0) {
2825		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2826		    NULL, 0) == 0);
2827	} else {
2828		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
2829		    KM_SLEEP);
2830		for (i = 0; i < spa->spa_nspares; i++)
2831			spares[i] = vdev_config_generate(spa,
2832			    spa->spa_spares[i], B_FALSE, B_TRUE);
2833		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2834		    spares, spa->spa_nspares) == 0);
2835		for (i = 0; i < spa->spa_nspares; i++)
2836			nvlist_free(spares[i]);
2837		kmem_free(spares, spa->spa_nspares * sizeof (void *));
2838	}
2839
2840	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
2841	nvlist_free(nvroot);
2842
2843	spa->spa_sync_spares = B_FALSE;
2844}
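/*
 * Write the in-core pool config to the MOS config object.  The generated
 * nvlist is held in spa_config_syncing until the uberblock for this txg is
 * on disk, at which point spa_sync() makes it visible to the config cache
 * via spa_config_set().
 */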
2845
2846static void
2847spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
2848{
2849	nvlist_t *config;
2850
2851	if (list_is_empty(&spa->spa_dirty_list))
2852		return;
2853
2854	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
2855
2856	if (spa->spa_config_syncing)
2857		nvlist_free(spa->spa_config_syncing);
2858	spa->spa_config_syncing = config;
2859
2860	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
2861}
2862
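/*
 * Sync task that writes pool properties into the pool-props ZAP object in
 * the MOS, creating the object on first use.  Only the 'bootfs' property is
 * handled here.
 */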
2863static void
2864spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
2865{
2866	spa_t *spa = arg1;
2867	nvlist_t *nvp = arg2;
2868	nvpair_t *nvpair;
2869	objset_t *mos = spa->spa_meta_objset;
2870	uint64_t zapobj;
2871
2872	mutex_enter(&spa->spa_props_lock);
2873	if (spa->spa_pool_props_object == 0) {
2874		zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
2875		VERIFY(zapobj > 0);
2876
2877		spa->spa_pool_props_object = zapobj;
2878
2879		VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
2880		    DMU_POOL_PROPS, 8, 1,
2881		    &spa->spa_pool_props_object, tx) == 0);
2882	}
2883	mutex_exit(&spa->spa_props_lock);
2884
2885	nvpair = NULL;
2886	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
2887		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
2888		case ZFS_PROP_BOOTFS:
2889			VERIFY(nvlist_lookup_uint64(nvp,
2890			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
2891			VERIFY(zap_update(mos,
2892			    spa->spa_pool_props_object,
2893			    zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
2894			    &spa->spa_bootfs, tx) == 0);
2895			break;
2896		}
2897	}
2898}
2899
2900/*
2901 * Sync the specified transaction group.  New blocks may be dirtied as
2902 * part of the process, so we iterate until it converges.
2903 */
2904void
2905spa_sync(spa_t *spa, uint64_t txg)
2906{
2907	dsl_pool_t *dp = spa->spa_dsl_pool;
2908	objset_t *mos = spa->spa_meta_objset;
2909	bplist_t *bpl = &spa->spa_sync_bplist;
2910	vdev_t *rvd = spa->spa_root_vdev;
2911	vdev_t *vd;
2912	dmu_tx_t *tx;
2913	int dirty_vdevs;
2914
2915	/*
2916	 * Lock out configuration changes.
2917	 */
2918	spa_config_enter(spa, RW_READER, FTAG);
2919
2920	spa->spa_syncing_txg = txg;
2921	spa->spa_sync_pass = 0;
2922
2923	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
2924
2925	tx = dmu_tx_create_assigned(dp, txg);
2926
2927	/*
2928	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
2929	 * set spa_deflate if we have no raid-z vdevs.
2930	 */
2931	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
2932	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
2933		int i;
2934
2935		for (i = 0; i < rvd->vdev_children; i++) {
2936			vd = rvd->vdev_child[i];
2937			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
2938				break;
2939		}
2940		if (i == rvd->vdev_children) {
2941			spa->spa_deflate = TRUE;
2942			VERIFY(0 == zap_add(spa->spa_meta_objset,
2943			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2944			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
2945		}
2946	}
2947
2948	/*
2949	 * If anything has changed in this txg, push the deferred frees
2950	 * from the previous txg.  If not, leave them alone so that we
2951	 * don't generate work on an otherwise idle system.
2952	 */
2953	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
2954	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
2955	    !txg_list_empty(&dp->dp_sync_tasks, txg))
2956		spa_sync_deferred_frees(spa, txg);
2957
2958	/*
2959	 * Iterate to convergence.
2960	 */
2961	do {
2962		spa->spa_sync_pass++;
2963
2964		spa_sync_config_object(spa, tx);
2965		spa_sync_spares(spa, tx);
2966		spa_errlog_sync(spa, txg);
2967		dsl_pool_sync(dp, txg);
2968
2969		dirty_vdevs = 0;
2970		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
2971			vdev_sync(vd, txg);
2972			dirty_vdevs++;
2973		}
2974
2975		bplist_sync(bpl, tx);
2976	} while (dirty_vdevs);
2977
2978	bplist_close(bpl);
2979
2980	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
2981
2982	/*
2983	 * Rewrite the vdev configuration (which includes the uberblock)
2984	 * to commit the transaction group.
2985	 *
2986	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
2987	 * Otherwise, pick a random top-level vdev that's known to be
2988	 * visible in the config cache (see spa_vdev_add() for details).
2989	 * If the write fails, try the next vdev until we've tried them all.
2990	 */
2991	if (!list_is_empty(&spa->spa_dirty_list)) {
2992		VERIFY(vdev_config_sync(rvd, txg) == 0);
2993	} else {
2994		int children = rvd->vdev_children;
2995		int c0 = spa_get_random(children);
2996		int c;
2997
2998		for (c = 0; c < children; c++) {
2999			vd = rvd->vdev_child[(c0 + c) % children];
3000			if (vd->vdev_ms_array == 0)
3001				continue;
3002			if (vdev_config_sync(vd, txg) == 0)
3003				break;
3004		}
3005		if (c == children)
3006			VERIFY(vdev_config_sync(rvd, txg) == 0);
3007	}
3008
3009	dmu_tx_commit(tx);
3010
3011	/*
3012	 * Clear the dirty config list.
3013	 */
3014	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
3015		vdev_config_clean(vd);
3016
3017	/*
3018	 * Now that the new config has synced transactionally,
3019	 * let it become visible to the config cache.
3020	 */
3021	if (spa->spa_config_syncing != NULL) {
3022		spa_config_set(spa, spa->spa_config_syncing);
3023		spa->spa_config_txg = txg;
3024		spa->spa_config_syncing = NULL;
3025	}
3026
3027	/*
3028	 * Make a stable copy of the fully synced uberblock.
3029	 * We use this as the root for pool traversals.
3030	 */
3031	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
3032
3033	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
3034
3035	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
3036	spa->spa_traverse_wanted = 0;
3037	spa->spa_ubsync = spa->spa_uberblock;
3038	rw_exit(&spa->spa_traverse_lock);
3039
3040	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
3041
3042	/*
3043	 * Clean up the ZIL records for the synced txg.
3044	 */
3045	dsl_pool_zil_clean(dp);
3046
3047	/*
3048	 * Update usable space statistics.
3049	 */
3050	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
3051		vdev_sync_done(vd, txg);
3052
3053	/*
3054	 * It had better be the case that we didn't dirty anything
3055	 * since vdev_config_sync().
3056	 */
3057	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
3058	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
3059	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
3060	ASSERT(bpl->bpl_queue == NULL);
3061
3062	spa_config_exit(spa, FTAG);
3063
3064	/*
3065	 * If any async tasks have been requested, kick them off.
3066	 */
3067	spa_async_dispatch(spa);
3068}
3069
3070/*
3071 * Sync all pools.  We don't want to hold the namespace lock across these
3072 * operations, so we take a reference on the spa_t and drop the lock during the
3073 * sync.
3074 */
3075void
3076spa_sync_allpools(void)
3077{
3078	spa_t *spa = NULL;
3079	mutex_enter(&spa_namespace_lock);
3080	while ((spa = spa_next(spa)) != NULL) {
3081		if (spa_state(spa) != POOL_STATE_ACTIVE)
3082			continue;
3083		spa_open_ref(spa, FTAG);
3084		mutex_exit(&spa_namespace_lock);
3085		txg_wait_synced(spa_get_dsl(spa), 0);
3086		mutex_enter(&spa_namespace_lock);
3087		spa_close(spa, FTAG);
3088	}
3089	mutex_exit(&spa_namespace_lock);
3090}
3091
3092/*
3093 * ==========================================================================
3094 * Miscellaneous routines
3095 * ==========================================================================
3096 */
3097
3098/*
3099 * Remove all pools in the system.
3100 */
3101void
3102spa_evict_all(void)
3103{
3104	spa_t *spa;
3105
3106	/*
3107	 * Remove all cached state.  All pools should be closed now,
3108	 * so every spa in the AVL tree should be unreferenced.
3109	 */
3110	mutex_enter(&spa_namespace_lock);
3111	while ((spa = spa_next(NULL)) != NULL) {
3112		/*
3113		 * Stop async tasks.  The async thread may need to detach
3114		 * a device that's been replaced, which requires grabbing
3115		 * spa_namespace_lock, so we must drop it here.
3116		 */
3117		spa_open_ref(spa, FTAG);
3118		mutex_exit(&spa_namespace_lock);
3119		spa_async_suspend(spa);
3120		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
3121		mutex_enter(&spa_namespace_lock);
3122		spa_close(spa, FTAG);
3123
3124		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
3125			spa_unload(spa);
3126			spa_deactivate(spa);
3127		}
3128		spa_remove(spa);
3129	}
3130	mutex_exit(&spa_namespace_lock);
3131}
3132
3133vdev_t *
3134spa_lookup_by_guid(spa_t *spa, uint64_t guid)
3135{
3136	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
3137}
3138
3139void
3140spa_upgrade(spa_t *spa)
3141{
3142	spa_config_enter(spa, RW_WRITER, FTAG);
3143
3144	/*
3145	 * This should only be called for a non-faulted pool, and since a pool
3146	 * from a future version would be unopenable, the on-disk version
3147	 * should never exceed ZFS_VERSION here.
3148	 */
3149	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);
3150
3151	spa->spa_uberblock.ub_version = ZFS_VERSION;
3152	vdev_config_dirty(spa->spa_root_vdev);
3153
3154	spa_config_exit(spa, FTAG);
3155
3156	txg_wait_synced(spa_get_dsl(spa), 0);
3157}
3158
3159boolean_t
3160spa_has_spare(spa_t *spa, uint64_t guid)
3161{
3162	int i;
3163	uint64_t spareguid;
3164
3165	for (i = 0; i < spa->spa_nspares; i++)
3166		if (spa->spa_spares[i]->vdev_guid == guid)
3167			return (B_TRUE);
3168
3169	for (i = 0; i < spa->spa_pending_nspares; i++) {
3170		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
3171		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
3172		    spareguid == guid)
3173			return (B_TRUE);
3174	}
3175
3176	return (B_FALSE);
3177}
3178
3179int
3180spa_set_props(spa_t *spa, nvlist_t *nvp)
3181{
3182	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
3183	    spa, nvp, 3));
3184}
3185
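/*
 * Build an nvlist of pool properties from the pool-props ZAP object.  Each
 * entry maps a property name to a nested nvlist containing the value and its
 * source (default vs. locally set); the bootfs object number is translated
 * back into a dataset name.
 */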
3186int
3187spa_get_props(spa_t *spa, nvlist_t **nvp)
3188{
3189	zap_cursor_t zc;
3190	zap_attribute_t za;
3191	objset_t *mos = spa->spa_meta_objset;
3192	zfs_source_t src;
3193	zfs_prop_t prop;
3194	nvlist_t *propval;
3195	uint64_t value;
3196	int err;
3197
3198	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3199
3200	mutex_enter(&spa->spa_props_lock);
3201	/* If no props object, then just return empty nvlist */
3202	if (spa->spa_pool_props_object == 0) {
3203		mutex_exit(&spa->spa_props_lock);
3204		return (0);
3205	}
3206
3207	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
3208	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
3209	    zap_cursor_advance(&zc)) {
3210
3211		if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
3212			continue;
3213
3214		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3215		switch (za.za_integer_length) {
3216		case 8:
3217			if (zfs_prop_default_numeric(prop) ==
3218			    za.za_first_integer)
3219				src = ZFS_SRC_DEFAULT;
3220			else
3221				src = ZFS_SRC_LOCAL;
3222			value = za.za_first_integer;
3223
3224			if (prop == ZFS_PROP_BOOTFS) {
3225				dsl_pool_t *dp;
3226				dsl_dataset_t *ds = NULL;
3227				char strval[MAXPATHLEN];
3228
3229				dp = spa_get_dsl(spa);
3230				rw_enter(&dp->dp_config_rwlock, RW_READER);
3231				if ((err = dsl_dataset_open_obj(dp,
3232				    za.za_first_integer, NULL, DS_MODE_NONE,
3233				    FTAG, &ds)) != 0) {
3234					rw_exit(&dp->dp_config_rwlock);
3235					break;
3236				}
3237				dsl_dataset_name(ds, strval);
3238				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
3239				rw_exit(&dp->dp_config_rwlock);
3240
3241				VERIFY(nvlist_add_uint64(propval,
3242				    ZFS_PROP_SOURCE, src) == 0);
3243				VERIFY(nvlist_add_string(propval,
3244				    ZFS_PROP_VALUE, strval) == 0);
3245			} else {
3246				VERIFY(nvlist_add_uint64(propval,
3247				    ZFS_PROP_SOURCE, src) == 0);
3248				VERIFY(nvlist_add_uint64(propval,
3249				    ZFS_PROP_VALUE, value) == 0);
3250			}
3251			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
3252			    propval) == 0);
3253			break;
3254		}
3255		nvlist_free(propval);
3256	}
3257	zap_cursor_fini(&zc);
3258	mutex_exit(&spa->spa_props_lock);
3259	if (err && err != ENOENT) {
3260		nvlist_free(*nvp);
3261		return (err);
3262	}
3263
3264	return (0);
3265}
3266
3267/*
3268 * If the bootfs property value is dsobj, clear it.
3269 */
3270void
3271spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
3272{
3273	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
3274		VERIFY(zap_remove(spa->spa_meta_objset,
3275		    spa->spa_pool_props_object,
3276		    zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
3277		spa->spa_bootfs = 0;
3278	}
3279}
3280