1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
25 * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright 2013 Saso Kiselkov. All rights reserved.
28 * Copyright (c) 2014 Integros [integros.com]
29 * Copyright 2016 Toomas Soome <tsoome@me.com>
30 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
31 * Copyright 2019 Joyent, Inc.
32 * Copyright (c) 2017, Intel Corporation.
33 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
34 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
35 */
36
37/*
38 * SPA: Storage Pool Allocator
39 *
40 * This file contains all the routines used when modifying on-disk SPA state.
41 * This includes opening, importing, destroying, and exporting a pool, as
42 * well as syncing a pool.
43 */
44
45#include <sys/zfs_context.h>
46#include <sys/fm/fs/zfs.h>
47#include <sys/spa_impl.h>
48#include <sys/zio.h>
49#include <sys/zio_checksum.h>
50#include <sys/dmu.h>
51#include <sys/dmu_tx.h>
52#include <sys/zap.h>
53#include <sys/zil.h>
54#include <sys/ddt.h>
55#include <sys/vdev_impl.h>
56#include <sys/vdev_removal.h>
57#include <sys/vdev_indirect_mapping.h>
58#include <sys/vdev_indirect_births.h>
59#include <sys/vdev_initialize.h>
60#include <sys/vdev_trim.h>
61#include <sys/metaslab.h>
62#include <sys/metaslab_impl.h>
63#include <sys/mmp.h>
64#include <sys/uberblock_impl.h>
65#include <sys/txg.h>
66#include <sys/avl.h>
67#include <sys/bpobj.h>
68#include <sys/dmu_traverse.h>
69#include <sys/dmu_objset.h>
70#include <sys/unique.h>
71#include <sys/dsl_pool.h>
72#include <sys/dsl_dataset.h>
73#include <sys/dsl_dir.h>
74#include <sys/dsl_prop.h>
75#include <sys/dsl_synctask.h>
76#include <sys/fs/zfs.h>
77#include <sys/arc.h>
78#include <sys/callb.h>
79#include <sys/systeminfo.h>
80#include <sys/spa_boot.h>
81#include <sys/zfs_ioctl.h>
82#include <sys/dsl_scan.h>
83#include <sys/zfeature.h>
84#include <sys/dsl_destroy.h>
85#include <sys/abd.h>
86
87#ifdef	_KERNEL
88#include <sys/bootprops.h>
89#include <sys/callb.h>
90#include <sys/cpupart.h>
91#include <sys/pool.h>
92#include <sys/sysdc.h>
93#include <sys/zone.h>
94#endif	/* _KERNEL */
95
96#include "zfs_prop.h"
97#include "zfs_comutil.h"
98
99/*
100 * The interval, in seconds, at which failed configuration cache file writes
101 * should be retried.
102 */
103int zfs_ccw_retry_interval = 300;
104
105typedef enum zti_modes {
106	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
107	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
108	ZTI_MODE_NULL,			/* don't create a taskq */
109	ZTI_NMODES
110} zti_modes_t;
111
112#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
113#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
114#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
115
116#define	ZTI_N(n)	ZTI_P(n, 1)
117#define	ZTI_ONE		ZTI_N(1)
118
119typedef struct zio_taskq_info {
120	zti_modes_t zti_mode;
121	uint_t zti_value;
122	uint_t zti_count;
123} zio_taskq_info_t;
124
125static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
126	"issue", "issue_high", "intr", "intr_high"
127};
128
129/*
130 * This table defines the taskq settings for each ZFS I/O type. When
131 * initializing a pool, we use this table to create an appropriately sized
132 * taskq. Some operations are low volume and therefore have a small, static
133 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
134 * macros. Other operations process a large amount of data; the ZTI_BATCH
135 * macro causes us to create a taskq oriented for throughput. Some operations
136 * are so high frequency and short-lived that the taskq itself can become a
137 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
138 * additional degree of parallelism specified by the number of threads per-
139 * taskq and the number of taskqs; when dispatching an event in this case, the
140 * particular taskq is chosen at random.
141 *
142 * The different taskq priorities are to handle the different contexts (issue
143 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
144 * need to be handled with minimum delay.
145 */
146const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
147	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
148	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
149	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
150	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
151	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
152	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
153	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
154	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
155};
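
/*
 * For example, reading the table above: the READ row uses ZTI_P(12, 8) for
 * its interrupt taskqs, i.e. eight taskqs of twelve threads each (one is
 * picked at random per dispatch), while WRITE issue uses ZTI_BATCH, a single
 * throughput-oriented taskq whose thread count is a percentage of the CPUs
 * (zio_taskq_batch_pct).
 */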
156
157static void spa_sync_version(void *arg, dmu_tx_t *tx);
158static void spa_sync_props(void *arg, dmu_tx_t *tx);
159static boolean_t spa_has_active_shared_spare(spa_t *spa);
160static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
161static void spa_vdev_resilver_done(spa_t *spa);
162
163uint_t		zio_taskq_batch_pct = 75;	/* % of CPUs per batch taskq */
164id_t		zio_taskq_psrset_bind = PS_NONE;
165boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
166uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
167
168boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
169extern int	zfs_sync_pass_deferred_free;
170
171/*
172 * Report any spa_load_verify errors found, but do not fail spa_load.
173 * This is used by zdb to analyze non-idle pools.
174 */
175boolean_t	spa_load_verify_dryrun = B_FALSE;
176
177/*
178 * This (illegal) pool name is used when temporarily importing a spa_t in order
179 * to get the vdev stats associated with the imported devices.
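 * (User pool names must begin with a letter, so this reserved name cannot
 * collide with a real pool.)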
180 */
181#define	TRYIMPORT_NAME	"$import"
182
183/*
184 * For debugging purposes: print out vdev tree during pool import.
185 */
186boolean_t	spa_load_print_vdev_tree = B_FALSE;
187
188/*
189 * A non-zero value for zfs_max_missing_tvds means that we allow importing
190 * pools with missing top-level vdevs. This is strictly intended for advanced
191 * pool recovery cases since missing data is almost inevitable. Pools with
192 * missing devices can only be imported read-only for safety reasons, and their
193 * fail-mode will be automatically set to "continue".
194 *
195 * With 1 missing vdev we should be able to import the pool and mount all
196 * datasets. User data that was not modified after the missing device has been
197 * added should be recoverable. This means that snapshots created prior to the
198 * addition of that device should be completely intact.
199 *
200 * With 2 missing vdevs, some datasets may fail to mount since there are
201 * dataset statistics that are stored as regular metadata. Some data might be
202 * recoverable if those vdevs were added recently.
203 *
204 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
205 * may be missing entirely. Chances of data recovery are very low. Note that
206 * there are also risks of performing an inadvertent rewind as we might be
207 * missing all the vdevs with the latest uberblocks.
208 */
209uint64_t	zfs_max_missing_tvds = 0;
210
211/*
212 * The parameters below are similar to zfs_max_missing_tvds but are only
213 * intended for a preliminary open of the pool with an untrusted config which
214 * might be incomplete or out-dated.
215 *
216 * We are more tolerant for pools opened from a cachefile since we could have
217 * an out-dated cachefile where a device removal was not registered.
218 * We could have set the limit arbitrarily high but in the case where devices
219 * are really missing we would want to return the proper error codes; we chose
220 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
221 * and we get a chance to retrieve the trusted config.
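 * (SPA_DVAS_PER_BP is 3, so up to two missing top-level vdevs are tolerated
 * during this preliminary open.)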
222 */
223uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
224
225/*
226 * In the case where config was assembled by scanning device paths (/dev/dsk
227 * by default) we are less tolerant since all the existing devices should have
228 * been detected and we want spa_load to return the right error codes.
229 */
230uint64_t	zfs_max_missing_tvds_scan = 0;
231
232/*
233 * Interval in seconds at which to poll spare vdevs for health.
234 * Setting this to zero disables spare polling.
235 * Set to three hours by default.
236 */
237uint_t		spa_spare_poll_interval_seconds = 60 * 60 * 3;
238
239/*
240 * Debugging aid that pauses spa_sync() towards the end.
241 */
242boolean_t	zfs_pause_spa_sync = B_FALSE;
243
244/*
245 * ==========================================================================
246 * SPA properties routines
247 * ==========================================================================
248 */
249
250/*
251 * Add a (source=src, propname=propval) list to an nvlist.
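 * The entry is itself a nested nvlist:
 *	nvl[propname] = { ZPROP_SOURCE = src, ZPROP_VALUE = strval or intval }.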
252 */
253static void
254spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
255    uint64_t intval, zprop_source_t src)
256{
257	const char *propname = zpool_prop_to_name(prop);
258	nvlist_t *propval;
259
260	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
261	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
262
263	if (strval != NULL)
264		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
265	else
266		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
267
268	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
269	nvlist_free(propval);
270}
271
272/*
273 * Get property values from the spa configuration.
274 */
275static void
276spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
277{
278	vdev_t *rvd = spa->spa_root_vdev;
279	dsl_pool_t *pool = spa->spa_dsl_pool;
280	uint64_t size, alloc, cap, version;
281	zprop_source_t src = ZPROP_SRC_NONE;
282	spa_config_dirent_t *dp;
283	metaslab_class_t *mc = spa_normal_class(spa);
284
285	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
286
287	if (rvd != NULL) {
288		alloc = metaslab_class_get_alloc(mc);
289		alloc += metaslab_class_get_alloc(spa_special_class(spa));
290		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
291
292		size = metaslab_class_get_space(mc);
293		size += metaslab_class_get_space(spa_special_class(spa));
294		size += metaslab_class_get_space(spa_dedup_class(spa));
295
296		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
297		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
298		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
299		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
300		    size - alloc, src);
301		spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
302		    spa->spa_checkpoint_info.sci_dspace, src);
303
304		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
305		    metaslab_class_fragmentation(mc), src);
306		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
307		    metaslab_class_expandable_space(mc), src);
308		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
309		    (spa_mode(spa) == FREAD), src);
310
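		/* Percentage of pool space currently allocated. */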
311		cap = (size == 0) ? 0 : (alloc * 100 / size);
312		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
313
314		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
315		    ddt_get_pool_dedup_ratio(spa), src);
316
317		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
318		    rvd->vdev_state, src);
319
320		version = spa_version(spa);
321		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
322			src = ZPROP_SRC_DEFAULT;
323		else
324			src = ZPROP_SRC_LOCAL;
325		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
326	}
327
328	if (pool != NULL) {
329		/*
330		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
331		 * when opening older pools, freedir will be NULL.
332		 */
333		if (pool->dp_free_dir != NULL) {
334			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
335			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
336			    src);
337		} else {
338			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
339			    NULL, 0, src);
340		}
341
342		if (pool->dp_leak_dir != NULL) {
343			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
344			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
345			    src);
346		} else {
347			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
348			    NULL, 0, src);
349		}
350	}
351
352	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
353
354	if (spa->spa_comment != NULL) {
355		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
356		    0, ZPROP_SRC_LOCAL);
357	}
358
359	if (spa->spa_root != NULL)
360		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
361		    0, ZPROP_SRC_LOCAL);
362
363	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
364		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
365		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
366	} else {
367		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
368		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
369	}
370
371	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
372		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
373		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
374	} else {
375		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
376		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
377	}
378
379	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
380		if (dp->scd_path == NULL) {
381			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
382			    "none", 0, ZPROP_SRC_LOCAL);
383		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
384			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
385			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
386		}
387	}
388}
389
390/*
391 * Get zpool property values.
392 */
393int
394spa_prop_get(spa_t *spa, nvlist_t **nvp)
395{
396	objset_t *mos = spa->spa_meta_objset;
397	zap_cursor_t zc;
398	zap_attribute_t za;
399	int err;
400
401	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
402
403	mutex_enter(&spa->spa_props_lock);
404
405	/*
406	 * Get properties from the spa config.
407	 */
408	spa_prop_get_config(spa, nvp);
409
410	/* If there's no pool property object, there is nothing more to get. */
411	if (mos == NULL || spa->spa_pool_props_object == 0) {
412		mutex_exit(&spa->spa_props_lock);
413		return (0);
414	}
415
416	/*
417	 * Get properties from the MOS pool property object.
418	 */
419	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
420	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
421	    zap_cursor_advance(&zc)) {
422		uint64_t intval = 0;
423		char *strval = NULL;
424		zprop_source_t src = ZPROP_SRC_DEFAULT;
425		zpool_prop_t prop;
426
427		if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
428			continue;
429
430		switch (za.za_integer_length) {
431		case 8:
432			/* integer property */
433			if (za.za_first_integer !=
434			    zpool_prop_default_numeric(prop))
435				src = ZPROP_SRC_LOCAL;
436
437			if (prop == ZPOOL_PROP_BOOTFS) {
438				dsl_pool_t *dp;
439				dsl_dataset_t *ds = NULL;
440
441				dp = spa_get_dsl(spa);
442				dsl_pool_config_enter(dp, FTAG);
443				err = dsl_dataset_hold_obj(dp,
444				    za.za_first_integer, FTAG, &ds);
445				if (err != 0) {
446					dsl_pool_config_exit(dp, FTAG);
447					break;
448				}
449
450				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
451				    KM_SLEEP);
452				dsl_dataset_name(ds, strval);
453				dsl_dataset_rele(ds, FTAG);
454				dsl_pool_config_exit(dp, FTAG);
455			} else {
456				strval = NULL;
457				intval = za.za_first_integer;
458			}
459
460			spa_prop_add_list(*nvp, prop, strval, intval, src);
461
462			if (strval != NULL)
463				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
464
465			break;
466
467		case 1:
468			/* string property */
469			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
470			err = zap_lookup(mos, spa->spa_pool_props_object,
471			    za.za_name, 1, za.za_num_integers, strval);
472			if (err) {
473				kmem_free(strval, za.za_num_integers);
474				break;
475			}
476			spa_prop_add_list(*nvp, prop, strval, 0, src);
477			kmem_free(strval, za.za_num_integers);
478			break;
479
480		default:
481			break;
482		}
483	}
484	zap_cursor_fini(&zc);
485	mutex_exit(&spa->spa_props_lock);
486out:
487	if (err && err != ENOENT) {
488		nvlist_free(*nvp);
489		*nvp = NULL;
490		return (err);
491	}
492
493	return (0);
494}
495
496/*
497 * Validate the given pool properties nvlist and modify the list
498 * for the property values to be set.
499 */
500static int
501spa_prop_validate(spa_t *spa, nvlist_t *props)
502{
503	nvpair_t *elem;
504	int error = 0, reset_bootfs = 0;
505	uint64_t objnum = 0;
506	boolean_t has_feature = B_FALSE;
507
508	elem = NULL;
509	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
510		uint64_t intval;
511		char *strval, *slash, *check, *fname;
512		const char *propname = nvpair_name(elem);
513		zpool_prop_t prop = zpool_name_to_prop(propname);
514
515		switch (prop) {
516		case ZPOOL_PROP_INVAL:
517			if (!zpool_prop_feature(propname)) {
518				error = SET_ERROR(EINVAL);
519				break;
520			}
521
522			/*
523			 * Sanitize the input.
524			 */
525			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
526				error = SET_ERROR(EINVAL);
527				break;
528			}
529
530			if (nvpair_value_uint64(elem, &intval) != 0) {
531				error = SET_ERROR(EINVAL);
532				break;
533			}
534
535			if (intval != 0) {
536				error = SET_ERROR(EINVAL);
537				break;
538			}
539
540			fname = strchr(propname, '@') + 1;
541			if (zfeature_lookup_name(fname, NULL) != 0) {
542				error = SET_ERROR(EINVAL);
543				break;
544			}
545
546			has_feature = B_TRUE;
547			break;
548
549		case ZPOOL_PROP_VERSION:
550			error = nvpair_value_uint64(elem, &intval);
551			if (!error &&
552			    (intval < spa_version(spa) ||
553			    intval > SPA_VERSION_BEFORE_FEATURES ||
554			    has_feature))
555				error = SET_ERROR(EINVAL);
556			break;
557
558		case ZPOOL_PROP_DELEGATION:
559		case ZPOOL_PROP_AUTOREPLACE:
560		case ZPOOL_PROP_LISTSNAPS:
561		case ZPOOL_PROP_AUTOEXPAND:
562		case ZPOOL_PROP_AUTOTRIM:
563			error = nvpair_value_uint64(elem, &intval);
564			if (!error && intval > 1)
565				error = SET_ERROR(EINVAL);
566			break;
567
568		case ZPOOL_PROP_MULTIHOST:
569			error = nvpair_value_uint64(elem, &intval);
570			if (!error && intval > 1)
571				error = SET_ERROR(EINVAL);
572
573			if (!error && !spa_get_hostid())
574				error = SET_ERROR(ENOTSUP);
575
576			break;
577
578		case ZPOOL_PROP_BOOTFS:
579			/*
580			 * If the pool version is less than SPA_VERSION_BOOTFS,
581			 * or the pool is still being created (version == 0),
582			 * the bootfs property cannot be set.
583			 */
584			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
585				error = SET_ERROR(ENOTSUP);
586				break;
587			}
588
589			/*
590			 * Make sure the vdev config is bootable
591			 */
592			if (!vdev_is_bootable(spa->spa_root_vdev)) {
593				error = SET_ERROR(ENOTSUP);
594				break;
595			}
596
597			reset_bootfs = 1;
598
599			error = nvpair_value_string(elem, &strval);
600
601			if (!error) {
602				objset_t *os;
603				uint64_t propval;
604
605				if (strval == NULL || strval[0] == '\0') {
606					objnum = zpool_prop_default_numeric(
607					    ZPOOL_PROP_BOOTFS);
608					break;
609				}
610
611				error = dmu_objset_hold(strval, FTAG, &os);
612				if (error != 0)
613					break;
614
615				/*
616				 * Must be ZPL, and its property settings
617				 * must be supported.
618				 */
619
620				if (dmu_objset_type(os) != DMU_OST_ZFS) {
621					error = SET_ERROR(ENOTSUP);
622				} else if ((error =
623				    dsl_prop_get_int_ds(dmu_objset_ds(os),
624				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
625				    &propval)) == 0 &&
626				    !BOOTFS_COMPRESS_VALID(propval)) {
627					error = SET_ERROR(ENOTSUP);
628				} else {
629					objnum = dmu_objset_id(os);
630				}
631				dmu_objset_rele(os, FTAG);
632			}
633			break;
634
635		case ZPOOL_PROP_FAILUREMODE:
636			error = nvpair_value_uint64(elem, &intval);
637			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
638			    intval > ZIO_FAILURE_MODE_PANIC))
639				error = SET_ERROR(EINVAL);
640
641			/*
642			 * This is a special case which only occurs when
643			 * the pool has completely failed. This allows
644			 * the user to change the in-core failmode property
645			 * without syncing it out to disk (I/Os might
646			 * currently be blocked). We do this by returning
647			 * EIO to the caller (spa_prop_set) to trick it
648			 * into thinking we encountered a property validation
649			 * error.
650			 */
651			if (!error && spa_suspended(spa)) {
652				spa->spa_failmode = intval;
653				error = SET_ERROR(EIO);
654			}
655			break;
656
657		case ZPOOL_PROP_CACHEFILE:
658			if ((error = nvpair_value_string(elem, &strval)) != 0)
659				break;
660
661			if (strval[0] == '\0')
662				break;
663
664			if (strcmp(strval, "none") == 0)
665				break;
666
667			if (strval[0] != '/') {
668				error = SET_ERROR(EINVAL);
669				break;
670			}
671
672			slash = strrchr(strval, '/');
673			ASSERT(slash != NULL);
674
675			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
676			    strcmp(slash, "/..") == 0)
677				error = SET_ERROR(EINVAL);
678			break;
679
680		case ZPOOL_PROP_COMMENT:
681			if ((error = nvpair_value_string(elem, &strval)) != 0)
682				break;
683			for (check = strval; *check != '\0'; check++) {
684				/*
685				 * The kernel doesn't have an easy isprint()
686				 * check.  For this kernel check, we merely
687				 * check ASCII apart from DEL.  Fix this if
688				 * there is an easy-to-use kernel isprint().
689				 */
690				if (*check >= 0x7f) {
691					error = SET_ERROR(EINVAL);
692					break;
693				}
694			}
695			if (strlen(strval) > ZPROP_MAX_COMMENT)
696				error = E2BIG;
697			break;
698
699		case ZPOOL_PROP_DEDUPDITTO:
700			if (spa_version(spa) < SPA_VERSION_DEDUP)
701				error = SET_ERROR(ENOTSUP);
702			else
703				error = nvpair_value_uint64(elem, &intval);
704			if (error == 0 &&
705			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
706				error = SET_ERROR(EINVAL);
707			break;
708		}
709
710		if (error)
711			break;
712	}
713
714	if (!error && reset_bootfs) {
715		error = nvlist_remove(props,
716		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
717
718		if (!error) {
719			error = nvlist_add_uint64(props,
720			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
721		}
722	}
723
724	return (error);
725}
726
727void
728spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
729{
730	char *cachefile;
731	spa_config_dirent_t *dp;
732
733	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
734	    &cachefile) != 0)
735		return;
736
737	dp = kmem_alloc(sizeof (spa_config_dirent_t),
738	    KM_SLEEP);
739
740	if (cachefile[0] == '\0')
741		dp->scd_path = spa_strdup(spa_config_path);
742	else if (strcmp(cachefile, "none") == 0)
743		dp->scd_path = NULL;
744	else
745		dp->scd_path = spa_strdup(cachefile);
746
747	list_insert_head(&spa->spa_config_list, dp);
748	if (need_sync)
749		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
750}
751
752int
753spa_prop_set(spa_t *spa, nvlist_t *nvp)
754{
755	int error;
756	nvpair_t *elem = NULL;
757	boolean_t need_sync = B_FALSE;
758
759	if ((error = spa_prop_validate(spa, nvp)) != 0)
760		return (error);
761
762	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
763		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
764
765		if (prop == ZPOOL_PROP_CACHEFILE ||
766		    prop == ZPOOL_PROP_ALTROOT ||
767		    prop == ZPOOL_PROP_READONLY)
768			continue;
769
770		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
771			uint64_t ver;
772
773			if (prop == ZPOOL_PROP_VERSION) {
774				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
775			} else {
776				ASSERT(zpool_prop_feature(nvpair_name(elem)));
777				ver = SPA_VERSION_FEATURES;
778				need_sync = B_TRUE;
779			}
780
781			/* Save time if the version is already set. */
782			if (ver == spa_version(spa))
783				continue;
784
785			/*
786			 * In addition to the pool directory object, we might
787			 * create the pool properties object, the features for
788			 * read object, the features for write object, or the
789			 * feature descriptions object.
790			 */
791			error = dsl_sync_task(spa->spa_name, NULL,
792			    spa_sync_version, &ver,
793			    6, ZFS_SPACE_CHECK_RESERVED);
794			if (error)
795				return (error);
796			continue;
797		}
798
799		need_sync = B_TRUE;
800		break;
801	}
802
803	if (need_sync) {
804		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
805		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
806	}
807
808	return (0);
809}
810
811/*
812 * If the bootfs property value is dsobj, clear it.
813 */
814void
815spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
816{
817	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
818		VERIFY(zap_remove(spa->spa_meta_objset,
819		    spa->spa_pool_props_object,
820		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
821		spa->spa_bootfs = 0;
822	}
823}
824
825/*ARGSUSED*/
826static int
827spa_change_guid_check(void *arg, dmu_tx_t *tx)
828{
829	uint64_t *newguid = arg;
830	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
831	vdev_t *rvd = spa->spa_root_vdev;
832	uint64_t vdev_state;
833
834	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
835		int error = (spa_has_checkpoint(spa)) ?
836		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
837		return (SET_ERROR(error));
838	}
839
840	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
841	vdev_state = rvd->vdev_state;
842	spa_config_exit(spa, SCL_STATE, FTAG);
843
844	if (vdev_state != VDEV_STATE_HEALTHY)
845		return (SET_ERROR(ENXIO));
846
847	ASSERT3U(spa_guid(spa), !=, *newguid);
848
849	return (0);
850}
851
852static void
853spa_change_guid_sync(void *arg, dmu_tx_t *tx)
854{
855	uint64_t *newguid = arg;
856	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
857	uint64_t oldguid;
858	vdev_t *rvd = spa->spa_root_vdev;
859
860	oldguid = spa_guid(spa);
861
862	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
863	rvd->vdev_guid = *newguid;
864	rvd->vdev_guid_sum += (*newguid - oldguid);
865	vdev_config_dirty(rvd);
866	spa_config_exit(spa, SCL_STATE, FTAG);
867
868	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
869	    oldguid, *newguid);
870}
871
872/*
873 * Change the GUID for the pool.  This is done so that we can later
874 * re-import a pool built from a clone of our own vdevs.  We will modify
875 * the root vdev's guid, our own pool guid, and then mark all of our
876 * vdevs dirty.  Note that we must make sure that all our vdevs are
877 * online when we do this, or else any vdevs that weren't present
878 * would be orphaned from our pool.  We are also going to issue a
879 * sysevent to update any watchers.
880 */
881int
882spa_change_guid(spa_t *spa)
883{
884	int error;
885	uint64_t guid;
886
887	mutex_enter(&spa->spa_vdev_top_lock);
888	mutex_enter(&spa_namespace_lock);
889	guid = spa_generate_guid(NULL);
890
891	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
892	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
893
894	if (error == 0) {
895		spa_write_cachefile(spa, B_FALSE, B_TRUE);
896		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
897	}
898
899	mutex_exit(&spa_namespace_lock);
900	mutex_exit(&spa->spa_vdev_top_lock);
901
902	return (error);
903}
904
905/*
906 * ==========================================================================
907 * SPA state manipulation (open/create/destroy/import/export)
908 * ==========================================================================
909 */
910
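/*
 * AVL comparator for SPA error-list entries: compares the zbookmarks with
 * memcmp() and normalizes the result to -1, 0, or 1 via TREE_ISIGN.
 */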
911static int
912spa_error_entry_compare(const void *a, const void *b)
913{
914	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
915	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
916	int ret;
917
918	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
919	    sizeof (zbookmark_phys_t));
920
921	return (TREE_ISIGN(ret));
922}
923
924/*
925 * Utility function which retrieves copies of the current logs and
926 * re-initializes them in the process.
927 */
928void
929spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
930{
931	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
932
933	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
934	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
935
936	avl_create(&spa->spa_errlist_scrub,
937	    spa_error_entry_compare, sizeof (spa_error_entry_t),
938	    offsetof(spa_error_entry_t, se_avl));
939	avl_create(&spa->spa_errlist_last,
940	    spa_error_entry_compare, sizeof (spa_error_entry_t),
941	    offsetof(spa_error_entry_t, se_avl));
942}
943
944static void
945spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
946{
947	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
948	enum zti_modes mode = ztip->zti_mode;
949	uint_t value = ztip->zti_value;
950	uint_t count = ztip->zti_count;
951	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
952	char name[32];
953	uint_t flags = 0;
954	boolean_t batch = B_FALSE;
955
956	if (mode == ZTI_MODE_NULL) {
957		tqs->stqs_count = 0;
958		tqs->stqs_taskq = NULL;
959		return;
960	}
961
962	ASSERT3U(count, >, 0);
963
964	tqs->stqs_count = count;
965	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
966
967	switch (mode) {
968	case ZTI_MODE_FIXED:
969		ASSERT3U(value, >=, 1);
970		value = MAX(value, 1);
971		break;
972
973	case ZTI_MODE_BATCH:
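		/*
		 * TASKQ_THREADS_CPU_PCT makes the thread count below a
		 * percentage of the available CPUs rather than an absolute
		 * number.
		 */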
974		batch = B_TRUE;
975		flags |= TASKQ_THREADS_CPU_PCT;
976		value = zio_taskq_batch_pct;
977		break;
978
979	default:
980		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
981		    "spa_activate()",
982		    zio_type_name[t], zio_taskq_types[q], mode, value);
983		break;
984	}
985
986	for (uint_t i = 0; i < count; i++) {
987		taskq_t *tq;
988
989		if (count > 1) {
990			(void) snprintf(name, sizeof (name), "%s_%s_%u",
991			    zio_type_name[t], zio_taskq_types[q], i);
992		} else {
993			(void) snprintf(name, sizeof (name), "%s_%s",
994			    zio_type_name[t], zio_taskq_types[q]);
995		}
996
997		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
998			if (batch)
999				flags |= TASKQ_DC_BATCH;
1000
1001			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
1002			    spa->spa_proc, zio_taskq_basedc, flags);
1003		} else {
1004			pri_t pri = maxclsyspri;
1005			/*
1006			 * The write issue taskq can be extremely CPU
1007			 * intensive.  Run it at slightly lower priority
1008			 * than the other taskqs.
1009			 */
1010			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
1011				pri--;
1012
1013			tq = taskq_create_proc(name, value, pri, 50,
1014			    INT_MAX, spa->spa_proc, flags);
1015		}
1016
1017		tqs->stqs_taskq[i] = tq;
1018	}
1019}
1020
1021static void
1022spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
1023{
1024	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1025
1026	if (tqs->stqs_taskq == NULL) {
1027		ASSERT0(tqs->stqs_count);
1028		return;
1029	}
1030
1031	for (uint_t i = 0; i < tqs->stqs_count; i++) {
1032		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
1033		taskq_destroy(tqs->stqs_taskq[i]);
1034	}
1035
1036	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
1037	tqs->stqs_taskq = NULL;
1038}
1039
1040/*
1041 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
1042 * Note that a type may have multiple discrete taskqs to avoid lock contention
1043 * on the taskq itself. In that case we choose which taskq at random by using
1044 * the low bits of gethrtime().
1045 */
1046void
1047spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1048    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
1049{
1050	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1051	taskq_t *tq;
1052
1053	ASSERT3P(tqs->stqs_taskq, !=, NULL);
1054	ASSERT3U(tqs->stqs_count, !=, 0);
1055
1056	if (tqs->stqs_count == 1) {
1057		tq = tqs->stqs_taskq[0];
1058	} else {
1059		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
1060	}
1061
1062	taskq_dispatch_ent(tq, func, arg, flags, ent);
1063}
1064
1065static void
1066spa_create_zio_taskqs(spa_t *spa)
1067{
1068	for (int t = 0; t < ZIO_TYPES; t++) {
1069		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1070			spa_taskqs_init(spa, t, q);
1071		}
1072	}
1073}
1074
1075#ifdef _KERNEL
1076static void
1077spa_thread(void *arg)
1078{
1079	callb_cpr_t cprinfo;
1080
1081	spa_t *spa = arg;
1082	user_t *pu = PTOU(curproc);
1083
1084	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1085	    spa->spa_name);
1086
1087	ASSERT(curproc != &p0);
1088	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1089	    "zpool-%s", spa->spa_name);
1090	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1091
1092	/* bind this thread to the requested psrset */
1093	if (zio_taskq_psrset_bind != PS_NONE) {
1094		pool_lock();
1095		mutex_enter(&cpu_lock);
1096		mutex_enter(&pidlock);
1097		mutex_enter(&curproc->p_lock);
1098
1099		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1100		    0, NULL, NULL) == 0)  {
1101			curthread->t_bind_pset = zio_taskq_psrset_bind;
1102		} else {
1103			cmn_err(CE_WARN,
1104			    "Couldn't bind process for zfs pool \"%s\" to "
1105			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1106		}
1107
1108		mutex_exit(&curproc->p_lock);
1109		mutex_exit(&pidlock);
1110		mutex_exit(&cpu_lock);
1111		pool_unlock();
1112	}
1113
1114	if (zio_taskq_sysdc) {
1115		sysdc_thread_enter(curthread, 100, 0);
1116	}
1117
1118	spa->spa_proc = curproc;
1119	spa->spa_did = curthread->t_did;
1120
1121	spa_create_zio_taskqs(spa);
1122
1123	mutex_enter(&spa->spa_proc_lock);
1124	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1125
1126	spa->spa_proc_state = SPA_PROC_ACTIVE;
1127	cv_broadcast(&spa->spa_proc_cv);
1128
1129	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1130	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1131		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1132	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1133
1134	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1135	spa->spa_proc_state = SPA_PROC_GONE;
1136	spa->spa_proc = &p0;
1137	cv_broadcast(&spa->spa_proc_cv);
1138	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
1139
1140	mutex_enter(&curproc->p_lock);
1141	lwp_exit();
1142}
1143#endif
1144
1145/*
1146 * Activate an uninitialized pool.
1147 */
1148static void
1149spa_activate(spa_t *spa, int mode)
1150{
1151	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1152
1153	spa->spa_state = POOL_STATE_ACTIVE;
1154	spa->spa_mode = mode;
1155
1156	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1157	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1158	spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
1159	spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
1160
1161	/* Try to create a covering process */
1162	mutex_enter(&spa->spa_proc_lock);
1163	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1164	ASSERT(spa->spa_proc == &p0);
1165	spa->spa_did = 0;
1166
1167	/* Only create a process if we're going to be around a while. */
1168	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1169		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1170		    NULL, 0) == 0) {
1171			spa->spa_proc_state = SPA_PROC_CREATED;
1172			while (spa->spa_proc_state == SPA_PROC_CREATED) {
1173				cv_wait(&spa->spa_proc_cv,
1174				    &spa->spa_proc_lock);
1175			}
1176			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1177			ASSERT(spa->spa_proc != &p0);
1178			ASSERT(spa->spa_did != 0);
1179		} else {
1180#ifdef _KERNEL
1181			cmn_err(CE_WARN,
1182			    "Couldn't create process for zfs pool \"%s\"\n",
1183			    spa->spa_name);
1184#endif
1185		}
1186	}
1187	mutex_exit(&spa->spa_proc_lock);
1188
1189	/* If we didn't create a process, we need to create our taskqs. */
1190	if (spa->spa_proc == &p0) {
1191		spa_create_zio_taskqs(spa);
1192	}
1193
1194	for (size_t i = 0; i < TXG_SIZE; i++) {
1195		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
1196		    ZIO_FLAG_CANFAIL);
1197	}
1198
1199	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1200	    offsetof(vdev_t, vdev_config_dirty_node));
1201	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1202	    offsetof(objset_t, os_evicting_node));
1203	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1204	    offsetof(vdev_t, vdev_state_dirty_node));
1205
1206	txg_list_create(&spa->spa_vdev_txg_list, spa,
1207	    offsetof(struct vdev, vdev_txg_node));
1208
1209	avl_create(&spa->spa_errlist_scrub,
1210	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1211	    offsetof(spa_error_entry_t, se_avl));
1212	avl_create(&spa->spa_errlist_last,
1213	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1214	    offsetof(spa_error_entry_t, se_avl));
1215
1216	spa_keystore_init(&spa->spa_keystore);
1217
1218	/*
1219	 * The taskq to upgrade datasets in this pool. Currently used by
1220	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
1221	 */
1222	spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus,
1223	    minclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
1224}
1225
1226/*
1227 * Opposite of spa_activate().
1228 */
1229static void
1230spa_deactivate(spa_t *spa)
1231{
1232	ASSERT(spa->spa_sync_on == B_FALSE);
1233	ASSERT(spa->spa_dsl_pool == NULL);
1234	ASSERT(spa->spa_root_vdev == NULL);
1235	ASSERT(spa->spa_async_zio_root == NULL);
1236	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1237
1238	spa_evicting_os_wait(spa);
1239
1240	if (spa->spa_upgrade_taskq) {
1241		taskq_destroy(spa->spa_upgrade_taskq);
1242		spa->spa_upgrade_taskq = NULL;
1243	}
1244
1245	txg_list_destroy(&spa->spa_vdev_txg_list);
1246
1247	list_destroy(&spa->spa_config_dirty_list);
1248	list_destroy(&spa->spa_evicting_os_list);
1249	list_destroy(&spa->spa_state_dirty_list);
1250
1251	for (int t = 0; t < ZIO_TYPES; t++) {
1252		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1253			spa_taskqs_fini(spa, t, q);
1254		}
1255	}
1256
1257	for (size_t i = 0; i < TXG_SIZE; i++) {
1258		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
1259		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
1260		spa->spa_txg_zio[i] = NULL;
1261	}
1262
1263	metaslab_class_destroy(spa->spa_normal_class);
1264	spa->spa_normal_class = NULL;
1265
1266	metaslab_class_destroy(spa->spa_log_class);
1267	spa->spa_log_class = NULL;
1268
1269	metaslab_class_destroy(spa->spa_special_class);
1270	spa->spa_special_class = NULL;
1271
1272	metaslab_class_destroy(spa->spa_dedup_class);
1273	spa->spa_dedup_class = NULL;
1274
1275	/*
1276	 * If this was part of an import or the open otherwise failed, we may
1277	 * still have errors left in the queues.  Empty them just in case.
1278	 */
1279	spa_errlog_drain(spa);
1280	avl_destroy(&spa->spa_errlist_scrub);
1281	avl_destroy(&spa->spa_errlist_last);
1282
1283	spa_keystore_fini(&spa->spa_keystore);
1284
1285	spa->spa_state = POOL_STATE_UNINITIALIZED;
1286
1287	mutex_enter(&spa->spa_proc_lock);
1288	if (spa->spa_proc_state != SPA_PROC_NONE) {
1289		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1290		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1291		cv_broadcast(&spa->spa_proc_cv);
1292		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1293			ASSERT(spa->spa_proc != &p0);
1294			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1295		}
1296		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1297		spa->spa_proc_state = SPA_PROC_NONE;
1298	}
1299	ASSERT(spa->spa_proc == &p0);
1300	mutex_exit(&spa->spa_proc_lock);
1301
1302	/*
1303	 * We want to make sure spa_thread() has actually exited the ZFS
1304	 * module, so that the module can't be unloaded out from underneath
1305	 * it.
1306	 */
1307	if (spa->spa_did != 0) {
1308		thread_join(spa->spa_did);
1309		spa->spa_did = 0;
1310	}
1311}
1312
1313/*
1314 * Verify a pool configuration, and construct the vdev tree appropriately.  This
1315 * will create all the necessary vdevs in the appropriate layout, with each vdev
1316 * in the CLOSED state.  This will prep the pool before open/creation/import.
1317 * All vdev validation is done by the vdev_alloc() routine.
1318 */
1319static int
1320spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1321    uint_t id, int atype)
1322{
1323	nvlist_t **child;
1324	uint_t children;
1325	int error;
1326
1327	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1328		return (error);
1329
1330	if ((*vdp)->vdev_ops->vdev_op_leaf)
1331		return (0);
1332
1333	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1334	    &child, &children);
1335
1336	if (error == ENOENT)
1337		return (0);
1338
1339	if (error) {
1340		vdev_free(*vdp);
1341		*vdp = NULL;
1342		return (SET_ERROR(EINVAL));
1343	}
1344
1345	for (int c = 0; c < children; c++) {
1346		vdev_t *vd;
1347		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1348		    atype)) != 0) {
1349			vdev_free(*vdp);
1350			*vdp = NULL;
1351			return (error);
1352		}
1353	}
1354
1355	ASSERT(*vdp != NULL);
1356
1357	return (0);
1358}
1359
1360static boolean_t
1361spa_should_flush_logs_on_unload(spa_t *spa)
1362{
1363	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
1364		return (B_FALSE);
1365
1366	if (!spa_writeable(spa))
1367		return (B_FALSE);
1368
1369	if (!spa->spa_sync_on)
1370		return (B_FALSE);
1371
1372	if (spa_state(spa) != POOL_STATE_EXPORTED)
1373		return (B_FALSE);
1374
1375	if (zfs_keep_log_spacemaps_at_export)
1376		return (B_FALSE);
1377
1378	return (B_TRUE);
1379}
1380
1381/*
1382 * Opens a transaction that will set the flag that will instruct
1383 * spa_sync to attempt to flush all the metaslabs for that txg.
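 * The routine then waits for that txg to be synced before returning.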
1384 */
1385static void
1386spa_unload_log_sm_flush_all(spa_t *spa)
1387{
1388	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
1389
1390	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
1391
1392	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
1393	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
1394
1395	dmu_tx_commit(tx);
1396	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
1397}
1398
1399static void
1400spa_unload_log_sm_metadata(spa_t *spa)
1401{
1402	void *cookie = NULL;
1403	spa_log_sm_t *sls;
1404
1405	while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
1406	    &cookie)) != NULL) {
1407		VERIFY0(sls->sls_mscount);
1408		kmem_free(sls, sizeof (spa_log_sm_t));
1409	}
1410
1411	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
1412	    e != NULL; e = list_head(&spa->spa_log_summary)) {
1413		VERIFY0(e->lse_mscount);
1414		list_remove(&spa->spa_log_summary, e);
1415		kmem_free(e, sizeof (log_summary_entry_t));
1416	}
1417
1418	spa->spa_unflushed_stats.sus_nblocks = 0;
1419	spa->spa_unflushed_stats.sus_memused = 0;
1420	spa->spa_unflushed_stats.sus_blocklimit = 0;
1421}
1422
1423/*
1424 * Opposite of spa_load().
1425 */
1426static void
1427spa_unload(spa_t *spa)
1428{
1429	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1430	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
1431
1432	spa_import_progress_remove(spa);
1433	spa_load_note(spa, "UNLOADING");
1434
1435	/*
1436	 * If the log space map feature is enabled and the pool is getting
1437	 * exported (but not destroyed), we want to spend some time flushing
1438	 * as many metaslabs as we can in an attempt to destroy log space
1439	 * maps and save import time.
1440	 */
1441	if (spa_should_flush_logs_on_unload(spa))
1442		spa_unload_log_sm_flush_all(spa);
1443
1444	/*
1445	 * Stop async tasks.
1446	 */
1447	spa_async_suspend(spa);
1448
1449	if (spa->spa_root_vdev) {
1450		vdev_t *root_vdev = spa->spa_root_vdev;
1451		vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
1452		vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
1453		vdev_autotrim_stop_all(spa);
1454	}
1455
1456	/*
1457	 * Stop syncing.
1458	 */
1459	if (spa->spa_sync_on) {
1460		txg_sync_stop(spa->spa_dsl_pool);
1461		spa->spa_sync_on = B_FALSE;
1462	}
1463
1464	/*
1465	 * This ensures that there is no async metaslab prefetching
1466	 * while we attempt to unload the spa.
1467	 */
1468	if (spa->spa_root_vdev != NULL) {
1469		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1470			vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
1471			if (vc->vdev_mg != NULL)
1472				taskq_wait(vc->vdev_mg->mg_taskq);
1473		}
1474	}
1475
1476	if (spa->spa_mmp.mmp_thread)
1477		mmp_thread_stop(spa);
1478
1479	/*
1480	 * Wait for any outstanding async I/O to complete.
1481	 */
1482	if (spa->spa_async_zio_root != NULL) {
1483		for (int i = 0; i < max_ncpus; i++)
1484			(void) zio_wait(spa->spa_async_zio_root[i]);
1485		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
1486		spa->spa_async_zio_root = NULL;
1487	}
1488
1489	if (spa->spa_vdev_removal != NULL) {
1490		spa_vdev_removal_destroy(spa->spa_vdev_removal);
1491		spa->spa_vdev_removal = NULL;
1492	}
1493
1494	if (spa->spa_condense_zthr != NULL) {
1495		zthr_destroy(spa->spa_condense_zthr);
1496		spa->spa_condense_zthr = NULL;
1497	}
1498
1499	if (spa->spa_checkpoint_discard_zthr != NULL) {
1500		zthr_destroy(spa->spa_checkpoint_discard_zthr);
1501		spa->spa_checkpoint_discard_zthr = NULL;
1502	}
1503
1504	spa_condense_fini(spa);
1505
1506	bpobj_close(&spa->spa_deferred_bpobj);
1507
1508	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1509
1510	/*
1511	 * Close all vdevs.
1512	 */
1513	if (spa->spa_root_vdev)
1514		vdev_free(spa->spa_root_vdev);
1515	ASSERT(spa->spa_root_vdev == NULL);
1516
1517	/*
1518	 * Close the dsl pool.
1519	 */
1520	if (spa->spa_dsl_pool) {
1521		dsl_pool_close(spa->spa_dsl_pool);
1522		spa->spa_dsl_pool = NULL;
1523		spa->spa_meta_objset = NULL;
1524	}
1525
1526	ddt_unload(spa);
1527	spa_unload_log_sm_metadata(spa);
1528
1529	/*
1530	 * Drop and purge level 2 cache
1531	 */
1532	spa_l2cache_drop(spa);
1533
1534	for (int i = 0; i < spa->spa_spares.sav_count; i++)
1535		vdev_free(spa->spa_spares.sav_vdevs[i]);
1536	if (spa->spa_spares.sav_vdevs) {
1537		kmem_free(spa->spa_spares.sav_vdevs,
1538		    spa->spa_spares.sav_count * sizeof (void *));
1539		spa->spa_spares.sav_vdevs = NULL;
1540	}
1541	if (spa->spa_spares.sav_config) {
1542		nvlist_free(spa->spa_spares.sav_config);
1543		spa->spa_spares.sav_config = NULL;
1544	}
1545	spa->spa_spares.sav_count = 0;
1546
1547	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
1548		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1549		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1550	}
1551	if (spa->spa_l2cache.sav_vdevs) {
1552		kmem_free(spa->spa_l2cache.sav_vdevs,
1553		    spa->spa_l2cache.sav_count * sizeof (void *));
1554		spa->spa_l2cache.sav_vdevs = NULL;
1555	}
1556	if (spa->spa_l2cache.sav_config) {
1557		nvlist_free(spa->spa_l2cache.sav_config);
1558		spa->spa_l2cache.sav_config = NULL;
1559	}
1560	spa->spa_l2cache.sav_count = 0;
1561
1562	spa->spa_async_suspended = 0;
1563
1564	spa->spa_indirect_vdevs_loaded = B_FALSE;
1565
1566	if (spa->spa_comment != NULL) {
1567		spa_strfree(spa->spa_comment);
1568		spa->spa_comment = NULL;
1569	}
1570
1571	spa_config_exit(spa, SCL_ALL, spa);
1572}
1573
1574/*
1575 * Load (or re-load) the current list of vdevs describing the active spares for
1576 * this pool.  When this is called, we have some form of basic information in
1577 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
1578 * then re-generate a more complete list including status information.
1579 */
1580void
1581spa_load_spares(spa_t *spa)
1582{
1583	nvlist_t **spares;
1584	uint_t nspares;
1585	int i;
1586	vdev_t *vd, *tvd;
1587
1588#ifndef _KERNEL
1589	/*
1590	 * zdb opens both the current state of the pool and the
1591	 * checkpointed state (if present), with a different spa_t.
1592	 *
1593	 * As spare vdevs are shared among open pools, we skip loading
1594	 * them when we load the checkpointed state of the pool.
1595	 */
1596	if (!spa_writeable(spa))
1597		return;
1598#endif
1599
1600	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1601
1602	/*
1603	 * First, close and free any existing spare vdevs.
1604	 */
1605	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1606		vd = spa->spa_spares.sav_vdevs[i];
1607
1608		/* Undo the call to spa_activate() below */
1609		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1610		    B_FALSE)) != NULL && tvd->vdev_isspare)
1611			spa_spare_remove(tvd);
1612		vdev_close(vd);
1613		vdev_free(vd);
1614	}
1615
1616	if (spa->spa_spares.sav_vdevs)
1617		kmem_free(spa->spa_spares.sav_vdevs,
1618		    spa->spa_spares.sav_count * sizeof (void *));
1619
1620	if (spa->spa_spares.sav_config == NULL)
1621		nspares = 0;
1622	else
1623		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1624		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1625
1626	spa->spa_spares.sav_count = (int)nspares;
1627	spa->spa_spares.sav_vdevs = NULL;
1628
1629	if (nspares == 0)
1630		return;
1631
1632	/*
1633	 * Construct the array of vdevs, opening them to get status in the
1634	 * process.  For each spare, there are potentially two different vdev_t
1635	 * structures associated with it: one in the list of spares (used only
1636	 * for basic validation purposes) and one in the active vdev
1637	 * configuration (if it's spared in).  During this phase we open and
1638	 * validate each vdev on the spare list.  If the vdev also exists in the
1639	 * active configuration, then we also mark this vdev as an active spare.
1640	 */
1641	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1642	    KM_SLEEP);
1643	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1644		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1645		    VDEV_ALLOC_SPARE) == 0);
1646		ASSERT(vd != NULL);
1647
1648		spa->spa_spares.sav_vdevs[i] = vd;
1649
1650		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1651		    B_FALSE)) != NULL) {
1652			if (!tvd->vdev_isspare)
1653				spa_spare_add(tvd);
1654
1655			/*
1656			 * We only mark the spare active if we were successfully
1657			 * able to load the vdev.  Otherwise, importing a pool
1658			 * with a bad active spare would result in strange
1659			 * behavior, because multiple pools would think the spare
1660			 * is actively in use.
1661			 *
1662			 * There is a vulnerability here to an equally bizarre
1663			 * circumstance, where a dead active spare is later
1664			 * brought back to life (onlined or otherwise).  Given
1665			 * the rarity of this scenario, and the extra complexity
1666			 * it adds, we ignore the possibility.
1667			 */
1668			if (!vdev_is_dead(tvd))
1669				spa_spare_activate(tvd);
1670		}
1671
1672		vd->vdev_top = vd;
1673		vd->vdev_aux = &spa->spa_spares;
1674
1675		if (vdev_open(vd) != 0)
1676			continue;
1677
1678		if (vdev_validate_aux(vd) == 0)
1679			spa_spare_add(vd);
1680	}
1681
1682	/*
1683	 * Recompute the stashed list of spares, with status information
1684	 * this time.
1685	 */
1686	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1687	    DATA_TYPE_NVLIST_ARRAY) == 0);
1688
1689	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1690	    KM_SLEEP);
1691	for (i = 0; i < spa->spa_spares.sav_count; i++)
1692		spares[i] = vdev_config_generate(spa,
1693		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1694	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1695	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1696	for (i = 0; i < spa->spa_spares.sav_count; i++)
1697		nvlist_free(spares[i]);
1698	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1699}
1700
1701/*
1702 * Load (or re-load) the current list of vdevs describing the active l2cache for
1703 * this pool.  When this is called, we have some form of basic information in
1704 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1705 * then re-generate a more complete list including status information.
1706 * Devices which are already active have their details maintained, and are
1707 * not re-opened.
1708 */
1709void
1710spa_load_l2cache(spa_t *spa)
1711{
1712	nvlist_t **l2cache;
1713	uint_t nl2cache;
1714	int i, j, oldnvdevs;
1715	uint64_t guid;
1716	vdev_t *vd, **oldvdevs, **newvdevs;
1717	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1718
1719#ifndef _KERNEL
1720	/*
1721	 * zdb opens both the current state of the pool and the
1722	 * checkpointed state (if present), with a different spa_t.
1723	 *
1724	 * As L2 caches are part of the ARC which is shared among open
1725	 * pools, we skip loading them when we load the checkpointed
1726	 * state of the pool.
1727	 */
1728	if (!spa_writeable(spa))
1729		return;
1730#endif
1731
1732	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1733
1734	if (sav->sav_config != NULL) {
1735		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1736		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1737		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1738	} else {
1739		nl2cache = 0;
1740		newvdevs = NULL;
1741	}
1742
1743	oldvdevs = sav->sav_vdevs;
1744	oldnvdevs = sav->sav_count;
1745	sav->sav_vdevs = NULL;
1746	sav->sav_count = 0;
1747
1748	/*
1749	 * Process new nvlist of vdevs.
1750	 */
1751	for (i = 0; i < nl2cache; i++) {
1752		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1753		    &guid) == 0);
1754
1755		newvdevs[i] = NULL;
1756		for (j = 0; j < oldnvdevs; j++) {
1757			vd = oldvdevs[j];
1758			if (vd != NULL && guid == vd->vdev_guid) {
1759				/*
1760				 * Retain previous vdev for add/remove ops.
1761				 */
1762				newvdevs[i] = vd;
1763				oldvdevs[j] = NULL;
1764				break;
1765			}
1766		}
1767
1768		if (newvdevs[i] == NULL) {
1769			/*
1770			 * Create new vdev
1771			 */
1772			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1773			    VDEV_ALLOC_L2CACHE) == 0);
1774			ASSERT(vd != NULL);
1775			newvdevs[i] = vd;
1776
1777			/*
1778			 * Commit this vdev as an l2cache device,
1779			 * even if it fails to open.
1780			 */
1781			spa_l2cache_add(vd);
1782
1783			vd->vdev_top = vd;
1784			vd->vdev_aux = sav;
1785
1786			spa_l2cache_activate(vd);
1787
1788			if (vdev_open(vd) != 0)
1789				continue;
1790
1791			(void) vdev_validate_aux(vd);
1792
1793			if (!vdev_is_dead(vd))
1794				l2arc_add_vdev(spa, vd);
1795		}
1796	}
1797
1798	/*
1799	 * Purge vdevs that were dropped
1800	 */
1801	for (i = 0; i < oldnvdevs; i++) {
1802		uint64_t pool;
1803
1804		vd = oldvdevs[i];
1805		if (vd != NULL) {
1806			ASSERT(vd->vdev_isl2cache);
1807
1808			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1809			    pool != 0ULL && l2arc_vdev_present(vd))
1810				l2arc_remove_vdev(vd);
1811			vdev_clear_stats(vd);
1812			vdev_free(vd);
1813		}
1814	}
1815
1816	if (oldvdevs)
1817		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1818
1819	if (sav->sav_config == NULL)
1820		goto out;
1821
1822	sav->sav_vdevs = newvdevs;
1823	sav->sav_count = (int)nl2cache;
1824
1825	/*
1826	 * Recompute the stashed list of l2cache devices, with status
1827	 * information this time.
1828	 */
1829	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1830	    DATA_TYPE_NVLIST_ARRAY) == 0);
1831
1832	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1833	for (i = 0; i < sav->sav_count; i++)
1834		l2cache[i] = vdev_config_generate(spa,
1835		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1836	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1837	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1838out:
1839	for (i = 0; i < sav->sav_count; i++)
1840		nvlist_free(l2cache[i]);
1841	if (sav->sav_count)
1842		kmem_free(l2cache, sav->sav_count * sizeof (void *));
1843}
1844
1845static int
1846load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1847{
1848	dmu_buf_t *db;
1849	char *packed = NULL;
1850	size_t nvsize = 0;
1851	int error;
1852	*value = NULL;
1853
1854	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
1855	if (error != 0)
1856		return (error);
1857
1858	nvsize = *(uint64_t *)db->db_data;
1859	dmu_buf_rele(db, FTAG);
1860
1861	packed = kmem_alloc(nvsize, KM_SLEEP);
1862	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1863	    DMU_READ_PREFETCH);
1864	if (error == 0)
1865		error = nvlist_unpack(packed, nvsize, value, 0);
1866	kmem_free(packed, nvsize);
1867
1868	return (error);
1869}
1870
1871/*
1872 * Concrete top-level vdevs that are not missing and are not logs. At every
1873 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
1874 */
1875static uint64_t
1876spa_healthy_core_tvds(spa_t *spa)
1877{
1878	vdev_t *rvd = spa->spa_root_vdev;
1879	uint64_t tvds = 0;
1880
1881	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
1882		vdev_t *vd = rvd->vdev_child[i];
1883		if (vd->vdev_islog)
1884			continue;
1885		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
1886			tvds++;
1887	}
1888
1889	return (tvds);
1890}
1891
1892/*
1893 * Checks to see if the given vdev could not be opened, in which case we post a
1894 * sysevent to notify the autoreplace code that the device has been removed.
1895 */
1896static void
1897spa_check_removed(vdev_t *vd)
1898{
1899	for (uint64_t c = 0; c < vd->vdev_children; c++)
1900		spa_check_removed(vd->vdev_child[c]);
1901
1902	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1903	    vdev_is_concrete(vd)) {
1904		zfs_post_autoreplace(vd->vdev_spa, vd);
1905		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
1906	}
1907}
1908
1909static int
1910spa_check_for_missing_logs(spa_t *spa)
1911{
1912	vdev_t *rvd = spa->spa_root_vdev;
1913
1914	/*
1915	 * If we're doing a normal import, then build up any additional
1916	 * diagnostic information about missing log devices.
1917	 * We'll pass this up to the user for further processing.
1918	 */
1919	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1920		nvlist_t **child, *nv;
1921		uint64_t idx = 0;
1922
1923		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1924		    KM_SLEEP);
1925		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1926
1927		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1928			vdev_t *tvd = rvd->vdev_child[c];
1929
1930			/*
1931			 * We consider a device as missing only if it failed
1932			 * to open (i.e. an offline or faulted device is not
1933			 * considered missing).
1934			 */
1935			if (tvd->vdev_islog &&
1936			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1937				child[idx++] = vdev_config_generate(spa, tvd,
1938				    B_FALSE, VDEV_CONFIG_MISSING);
1939			}
1940		}
1941
1942		if (idx > 0) {
1943			fnvlist_add_nvlist_array(nv,
1944			    ZPOOL_CONFIG_CHILDREN, child, idx);
1945			fnvlist_add_nvlist(spa->spa_load_info,
1946			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
1947
1948			for (uint64_t i = 0; i < idx; i++)
1949				nvlist_free(child[i]);
1950		}
1951		nvlist_free(nv);
1952		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
1953
1954		if (idx > 0) {
1955			spa_load_failed(spa, "some log devices are missing");
1956			vdev_dbgmsg_print_tree(rvd, 2);
1957			return (SET_ERROR(ENXIO));
1958		}
1959	} else {
1960		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1961			vdev_t *tvd = rvd->vdev_child[c];
1962
1963			if (tvd->vdev_islog &&
1964			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1965				spa_set_log_state(spa, SPA_LOG_CLEAR);
1966				spa_load_note(spa, "some log devices are "
1967				    "missing, ZIL is dropped.");
1968				vdev_dbgmsg_print_tree(rvd, 2);
1969				break;
1970			}
1971		}
1972	}
1973
1974	return (0);
1975}
1976
1977/*
1978 * Check for missing log devices
1979 */
1980static boolean_t
1981spa_check_logs(spa_t *spa)
1982{
1983	boolean_t rv = B_FALSE;
1984	dsl_pool_t *dp = spa_get_dsl(spa);
1985
1986	switch (spa->spa_log_state) {
1987	case SPA_LOG_MISSING:
1988		/* need to recheck in case slog has been restored */
1989	case SPA_LOG_UNKNOWN:
1990		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1991		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
1992		if (rv)
1993			spa_set_log_state(spa, SPA_LOG_MISSING);
1994		break;
1995	}
1996	return (rv);
1997}
1998
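/*
 * Passivate the metaslab groups of any log (slog) top-level vdevs so that no
 * further allocations are made from them.  Returns B_TRUE if at least one
 * slog device was found.
 */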
1999static boolean_t
2000spa_passivate_log(spa_t *spa)
2001{
2002	vdev_t *rvd = spa->spa_root_vdev;
2003	boolean_t slog_found = B_FALSE;
2004
2005	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2006
2007	if (!spa_has_slogs(spa))
2008		return (B_FALSE);
2009
2010	for (int c = 0; c < rvd->vdev_children; c++) {
2011		vdev_t *tvd = rvd->vdev_child[c];
2012		metaslab_group_t *mg = tvd->vdev_mg;
2013
2014		if (tvd->vdev_islog) {
2015			metaslab_group_passivate(mg);
2016			slog_found = B_TRUE;
2017		}
2018	}
2019
2020	return (slog_found);
2021}
2022
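/*
 * Reactivate the metaslab groups of all log top-level vdevs, allowing
 * allocations from them again.
 */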
2023static void
2024spa_activate_log(spa_t *spa)
2025{
2026	vdev_t *rvd = spa->spa_root_vdev;
2027
2028	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2029
2030	for (int c = 0; c < rvd->vdev_children; c++) {
2031		vdev_t *tvd = rvd->vdev_child[c];
2032		metaslab_group_t *mg = tvd->vdev_mg;
2033
2034		if (tvd->vdev_islog)
2035			metaslab_group_activate(mg);
2036	}
2037}
2038
2039int
2040spa_reset_logs(spa_t *spa)
2041{
2042	int error;
2043
2044	error = dmu_objset_find(spa_name(spa), zil_reset,
2045	    NULL, DS_FIND_CHILDREN);
2046	if (error == 0) {
2047		/*
2048		 * We successfully offlined the log device, sync out the
2049		 * current txg so that the "stubby" block can be removed
2050		 * by zil_sync().
2051		 */
2052		txg_wait_synced(spa->spa_dsl_pool, 0);
2053	}
2054	return (error);
2055}
2056
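/*
 * Run spa_check_removed() over every auxiliary (spare or l2cache) vdev.
 */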
2057static void
2058spa_aux_check_removed(spa_aux_vdev_t *sav)
2059{
2060	for (int i = 0; i < sav->sav_count; i++)
2061		spa_check_removed(sav->sav_vdevs[i]);
2062}
2063
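/*
 * Done callback for the claim zios issued while claiming ZIL blocks; tracks
 * the largest block birth txg seen so far in spa_claim_max_txg.
 */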
2064void
2065spa_claim_notify(zio_t *zio)
2066{
2067	spa_t *spa = zio->io_spa;
2068
2069	if (zio->io_error)
2070		return;
2071
2072	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
2073	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
2074		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
2075	mutex_exit(&spa->spa_props_lock);
2076}
2077
2078typedef struct spa_load_error {
2079	uint64_t	sle_meta_count;
2080	uint64_t	sle_data_count;
2081} spa_load_error_t;
2082
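/*
 * Completion callback for the verification reads issued by
 * spa_load_verify_cb().  Any error is counted as either a metadata or a
 * data error, and one slot of the in-flight I/O throttle is released.
 */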
2083static void
2084spa_load_verify_done(zio_t *zio)
2085{
2086	blkptr_t *bp = zio->io_bp;
2087	spa_load_error_t *sle = zio->io_private;
2088	dmu_object_type_t type = BP_GET_TYPE(bp);
2089	int error = zio->io_error;
2090	spa_t *spa = zio->io_spa;
2091
2092	abd_free(zio->io_abd);
2093	if (error) {
2094		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
2095		    type != DMU_OT_INTENT_LOG)
2096			atomic_inc_64(&sle->sle_meta_count);
2097		else
2098			atomic_inc_64(&sle->sle_data_count);
2099	}
2100
2101	mutex_enter(&spa->spa_scrub_lock);
2102	spa->spa_load_verify_ios--;
2103	cv_broadcast(&spa->spa_scrub_io_cv);
2104	mutex_exit(&spa->spa_scrub_lock);
2105}
2106
2107/*
2108 * Maximum number of concurrent scrub i/os to create while verifying
2109 * a pool during import.
2110 */
2111int spa_load_verify_maxinflight = 10000;
2112boolean_t spa_load_verify_metadata = B_TRUE;
2113boolean_t spa_load_verify_data = B_TRUE;
2114
2115/*ARGSUSED*/
2116static int
2117spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
2118    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
2119{
2120	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2121		return (0);
2122	/*
2123	 * Note: normally this routine will not be called if
2124	 * spa_load_verify_metadata is not set.  However, it may be useful
2125	 * to manually set the flag after the traversal has begun.
2126	 */
2127	if (!spa_load_verify_metadata)
2128		return (0);
2129	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
2130		return (0);
2131
2132	zio_t *rio = arg;
2133	size_t size = BP_GET_PSIZE(bp);
2134
2135	mutex_enter(&spa->spa_scrub_lock);
2136	while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
2137		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2138	spa->spa_load_verify_ios++;
2139	mutex_exit(&spa->spa_scrub_lock);
2140
2141	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
2142	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2143	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2144	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2145	return (0);
2146}
2147
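/*
 * dmu_objset_find_dp() callback that rejects any dataset whose name is at
 * or beyond ZFS_MAX_DATASET_NAME_LEN.
 */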
2148/* ARGSUSED */
2149int
2150verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2151{
2152	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2153		return (SET_ERROR(ENAMETOOLONG));
2154
2155	return (0);
2156}
2157
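/*
 * Verify the contents of the pool during load/import.  Depending on the
 * load policy this traverses the pool issuing speculative scrub reads,
 * counts metadata and data errors, and decides whether the load may
 * proceed with the selected uberblock.
 */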
2158static int
2159spa_load_verify(spa_t *spa)
2160{
2161	zio_t *rio;
2162	spa_load_error_t sle = { 0 };
2163	zpool_load_policy_t policy;
2164	boolean_t verify_ok = B_FALSE;
2165	int error = 0;
2166
2167	zpool_get_load_policy(spa->spa_config, &policy);
2168
2169	if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
2170		return (0);
2171
2172	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2173	error = dmu_objset_find_dp(spa->spa_dsl_pool,
2174	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2175	    DS_FIND_CHILDREN);
2176	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2177	if (error != 0)
2178		return (error);
2179
2180	rio = zio_root(spa, NULL, &sle,
2181	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2182
2183	if (spa_load_verify_metadata) {
2184		if (spa->spa_extreme_rewind) {
2185			spa_load_note(spa, "performing a complete scan of the "
2186			    "pool since extreme rewind is on. This may take "
2187			    "a very long time.\n  (spa_load_verify_data=%u, "
2188			    "spa_load_verify_metadata=%u)",
2189			    spa_load_verify_data, spa_load_verify_metadata);
2190		}
2191		error = traverse_pool(spa, spa->spa_verify_min_txg,
2192		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
2193		    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
2194	}
2195
2196	(void) zio_wait(rio);
2197
2198	spa->spa_load_meta_errors = sle.sle_meta_count;
2199	spa->spa_load_data_errors = sle.sle_data_count;
2200
2201	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2202		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2203		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2204		    (u_longlong_t)sle.sle_data_count);
2205	}
2206
2207	if (spa_load_verify_dryrun ||
2208	    (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
2209	    sle.sle_data_count <= policy.zlp_maxdata)) {
2210		int64_t loss = 0;
2211
2212		verify_ok = B_TRUE;
2213		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2214		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2215
2216		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2217		VERIFY(nvlist_add_uint64(spa->spa_load_info,
2218		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
2219		VERIFY(nvlist_add_int64(spa->spa_load_info,
2220		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
2221		VERIFY(nvlist_add_uint64(spa->spa_load_info,
2222		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
2223	} else {
2224		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2225	}
2226
2227	if (spa_load_verify_dryrun)
2228		return (0);
2229
2230	if (error) {
2231		if (error != ENXIO && error != EIO)
2232			error = SET_ERROR(EIO);
2233		return (error);
2234	}
2235
2236	return (verify_ok ? 0 : EIO);
2237}
2238
2239/*
2240 * Find a value in the pool props object.
2241 */
2242static void
2243spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2244{
2245	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2246	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2247}
2248
2249/*
2250 * Find a value in the pool directory object.
2251 */
2252static int
2253spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
2254{
2255	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2256	    name, sizeof (uint64_t), 1, val);
2257
2258	if (error != 0 && (error != ENOENT || log_enoent)) {
2259		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2260		    "[error=%d]", name, error);
2261	}
2262
2263	return (error);
2264}
2265
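/*
 * Mark the given vdev as unable to open for the stated reason and return
 * the supplied error.
 */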
2266static int
2267spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2268{
2269	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2270	return (SET_ERROR(err));
2271}
2272
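/*
 * Start the auxiliary threads needed by a writeable pool: the indirect
 * vdev condensing thread and the checkpoint discard zthr.
 */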
2273static void
2274spa_spawn_aux_threads(spa_t *spa)
2275{
2276	ASSERT(spa_writeable(spa));
2277
2278	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2279
2280	spa_start_indirect_condensing_thread(spa);
2281
2282	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
2283	spa->spa_checkpoint_discard_zthr =
2284	    zthr_create(spa_checkpoint_discard_thread_check,
2285	    spa_checkpoint_discard_thread, spa);
2286}
2287
2288/*
2289 * Fix up config after a partly-completed split.  This is done with the
2290 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
2291 * pool have that entry in their config, but only the splitting one contains
2292 * a list of all the guids of the vdevs that are being split off.
2293 *
2294 * This function determines what to do with that list: either rejoin
2295 * all the disks to the pool, or complete the splitting process.  To attempt
2296 * the rejoin, each disk that is offlined is marked online again, and
2297 * we do a reopen() call.  If the vdev label for every disk that was
2298 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2299 * then we call vdev_split() on each disk and complete the split.
2300 *
2301 * Otherwise we leave the config alone, with all the vdevs in place in
2302 * the original pool.
2303 */
2304static void
2305spa_try_repair(spa_t *spa, nvlist_t *config)
2306{
2307	uint_t extracted;
2308	uint64_t *glist;
2309	uint_t i, gcount;
2310	nvlist_t *nvl;
2311	vdev_t **vd;
2312	boolean_t attempt_reopen;
2313
2314	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2315		return;
2316
2317	/* check that the config is complete */
2318	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2319	    &glist, &gcount) != 0)
2320		return;
2321
2322	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
2323
2324	/* attempt to online all the vdevs & validate */
2325	attempt_reopen = B_TRUE;
2326	for (i = 0; i < gcount; i++) {
2327		if (glist[i] == 0)	/* vdev is hole */
2328			continue;
2329
2330		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2331		if (vd[i] == NULL) {
2332			/*
2333			 * Don't bother attempting to reopen the disks;
2334			 * just do the split.
2335			 */
2336			attempt_reopen = B_FALSE;
2337		} else {
2338			/* attempt to re-online it */
2339			vd[i]->vdev_offline = B_FALSE;
2340		}
2341	}
2342
2343	if (attempt_reopen) {
2344		vdev_reopen(spa->spa_root_vdev);
2345
2346		/* check each device to see what state it's in */
2347		for (extracted = 0, i = 0; i < gcount; i++) {
2348			if (vd[i] != NULL &&
2349			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2350				break;
2351			++extracted;
2352		}
2353	}
2354
2355	/*
2356	 * If every disk has been moved to the new pool, or if we never
2357	 * even attempted to look at them, then we split them off for
2358	 * good.
2359	 */
2360	if (!attempt_reopen || gcount == extracted) {
2361		for (i = 0; i < gcount; i++)
2362			if (vd[i] != NULL)
2363				vdev_split(vd[i]);
2364		vdev_reopen(spa->spa_root_vdev);
2365	}
2366
2367	kmem_free(vd, gcount * sizeof (vdev_t *));
2368}
2369
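/*
 * Load the pool in the given state, tracking import progress and posting
 * an FMA ereport if the load fails.
 */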
2370static int
2371spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
2372{
2373	char *ereport = FM_EREPORT_ZFS_POOL;
2374	int error;
2375
2376	spa->spa_load_state = state;
2377	(void) spa_import_progress_set_state(spa, spa_load_state(spa));
2378
2379	gethrestime(&spa->spa_loaded_ts);
2380	error = spa_load_impl(spa, type, &ereport);
2381
2382	/*
2383	 * Don't count references from objsets that are already closed
2384	 * and are making their way through the eviction process.
2385	 */
2386	spa_evicting_os_wait(spa);
2387	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
2388	if (error) {
2389		if (error != EEXIST) {
2390			spa->spa_loaded_ts.tv_sec = 0;
2391			spa->spa_loaded_ts.tv_nsec = 0;
2392		}
2393		if (error != EBADF) {
2394			zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0);
2395		}
2396	}
2397	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2398	spa->spa_ena = 0;
2399
2400	(void) spa_import_progress_set_state(spa, spa_load_state(spa));
2401
2402	return (error);
2403}
2404
2405/*
2406 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
2407 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
2408 * spa's per-vdev ZAP list.
2409 */
2410static uint64_t
2411vdev_count_verify_zaps(vdev_t *vd)
2412{
2413	spa_t *spa = vd->vdev_spa;
2414	uint64_t total = 0;
2415	if (vd->vdev_top_zap != 0) {
2416		total++;
2417		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2418		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
2419	}
2420	if (vd->vdev_leaf_zap != 0) {
2421		total++;
2422		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2423		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
2424	}
2425
2426	for (uint64_t i = 0; i < vd->vdev_children; i++) {
2427		total += vdev_count_verify_zaps(vd->vdev_child[i]);
2428	}
2429
2430	return (total);
2431}
2432
2433/*
2434 * Determine whether the activity check is required.
2435 */
2436static boolean_t
2437spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
2438    nvlist_t *config)
2439{
2440	uint64_t state = 0;
2441	uint64_t hostid = 0;
2442	uint64_t tryconfig_txg = 0;
2443	uint64_t tryconfig_timestamp = 0;
2444	uint16_t tryconfig_mmp_seq = 0;
2445	nvlist_t *nvinfo;
2446
2447	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
2448		nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
2449		(void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
2450		    &tryconfig_txg);
2451		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2452		    &tryconfig_timestamp);
2453		(void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
2454		    &tryconfig_mmp_seq);
2455	}
2456
2457	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
2458
2459	/*
2460	 * Disable the MMP activity check.  This is used by zdb, which is
2461	 * intended to be used on potentially active pools.
2462	 */
2463	if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
2464		return (B_FALSE);
2465
2466	/*
2467	 * Skip the activity check when the MMP feature is disabled.
2468	 */
2469	if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
2470		return (B_FALSE);
2471
2472	/*
2473	 * If the tryconfig_ values are nonzero, they are the results of an
2474	 * earlier tryimport.  If they all match the uberblock we just found,
2475	 * then the pool has not changed and we return false so we do not test
2476	 * a second time.
2477	 */
2478	if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
2479	    tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
2480	    tryconfig_mmp_seq && tryconfig_mmp_seq ==
2481	    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
2482		return (B_FALSE);
2483
2484	/*
2485	 * Allow the activity check to be skipped when importing the pool
2486	 * on the same host which last imported it.  Since the hostid from the
2487	 * configuration may be stale, use the one read from the label.
2488	 */
2489	if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
2490		hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
2491
2492	if (hostid == spa_get_hostid())
2493		return (B_FALSE);
2494
2495	/*
2496	 * Skip the activity test when the pool was cleanly exported.
2497	 */
2498	if (state != POOL_STATE_ACTIVE)
2499		return (B_FALSE);
2500
2501	return (B_TRUE);
2502}
2503
2504/*
2505 * Nanoseconds for which the activity check must watch for changes on disk.
2506 */
2507static uint64_t
2508spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
2509{
2510	uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
2511	uint64_t multihost_interval = MSEC2NSEC(
2512	    MMP_INTERVAL_OK(zfs_multihost_interval));
2513	uint64_t import_delay = MAX(NANOSEC, import_intervals *
2514	    multihost_interval);
2515
2516	/*
2517	 * Local tunables determine a minimum duration except for the case
2518	 * where we know when the remote host will suspend the pool if MMP
2519	 * writes do not land.
2520	 *
2521	 * See Big Theory comment at the top of mmp.c for the reasoning behind
2522	 * these cases and times.
2523	 */
2524
2525	ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
2526
2527	if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
2528	    MMP_FAIL_INT(ub) > 0) {
2529
2530		/* MMP on remote host will suspend pool after failed writes */
2531		import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
2532		    MMP_IMPORT_SAFETY_FACTOR / 100;
2533
2534		zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
2535		    "mmp_fails=%llu ub_mmp mmp_interval=%llu "
2536		    "import_intervals=%u", import_delay, MMP_FAIL_INT(ub),
2537		    MMP_INTERVAL(ub), import_intervals);
2538
2539	} else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
2540	    MMP_FAIL_INT(ub) == 0) {
2541
2542		/* MMP on remote host will never suspend pool */
2543		import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
2544		    ub->ub_mmp_delay) * import_intervals);
2545
2546		zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
2547		    "mmp_interval=%llu ub_mmp_delay=%llu "
2548		    "import_intervals=%u", import_delay, MMP_INTERVAL(ub),
2549		    ub->ub_mmp_delay, import_intervals);
2550
2551	} else if (MMP_VALID(ub)) {
2552		/*
2553		 * zfs-0.7 compatibility case
2554		 */
2555
2556		import_delay = MAX(import_delay, (multihost_interval +
2557		    ub->ub_mmp_delay) * import_intervals);
2558
2559		zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
2560		    "import_intervals=%u leaves=%u", import_delay,
2561		    ub->ub_mmp_delay, import_intervals,
2562		    vdev_count_leaves(spa));
2563	} else {
2564		/* Using local tunings is the only reasonable option */
2565		zfs_dbgmsg("pool last imported on non-MMP aware "
2566		    "host using import_delay=%llu multihost_interval=%llu "
2567		    "import_intervals=%u", import_delay, multihost_interval,
2568		    import_intervals);
2569	}
2570
2571	return (import_delay);
2572}
2573
2574/*
2575 * Perform the import activity check.  If the user canceled the import or
2576 * we detected activity, then fail.
2577 */
2578static int
2579spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
2580{
2581	uint64_t txg = ub->ub_txg;
2582	uint64_t timestamp = ub->ub_timestamp;
2583	uint64_t mmp_config = ub->ub_mmp_config;
2584	uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
2585	uint64_t import_delay;
2586	hrtime_t import_expire;
2587	nvlist_t *mmp_label = NULL;
2588	vdev_t *rvd = spa->spa_root_vdev;
2589	kcondvar_t cv;
2590	kmutex_t mtx;
2591	int error = 0;
2592
2593	cv_init(&cv, NULL, CV_DEFAULT, NULL);
2594	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
2595	mutex_enter(&mtx);
2596
2597	/*
2598	 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
2599	 * during the earlier tryimport.  If the txg recorded there is 0 then
2600	 * the pool is known to be active on another host.
2601	 *
2602	 * Otherwise, the pool might be in use on another host.  Check for
2603	 * changes in the uberblocks on disk if necessary.
2604	 */
2605	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
2606		nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
2607		    ZPOOL_CONFIG_LOAD_INFO);
2608
2609		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
2610		    fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
2611			vdev_uberblock_load(rvd, ub, &mmp_label);
2612			error = SET_ERROR(EREMOTEIO);
2613			goto out;
2614		}
2615	}
2616
2617	import_delay = spa_activity_check_duration(spa, ub);
2618
2619	/* Add a small random factor in case of simultaneous imports (0-25%) */
2620	import_delay += import_delay * spa_get_random(250) / 1000;
2621
2622	import_expire = gethrtime() + import_delay;
2623
2624	while (gethrtime() < import_expire) {
2625		(void) spa_import_progress_set_mmp_check(spa,
2626		    NSEC2SEC(import_expire - gethrtime()));
2627
2628		vdev_uberblock_load(rvd, ub, &mmp_label);
2629
2630		if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
2631		    mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
2632			zfs_dbgmsg("multihost activity detected "
2633			    "txg %llu ub_txg  %llu "
2634			    "timestamp %llu ub_timestamp  %llu "
2635			    "mmp_config %#llx ub_mmp_config %#llx",
2636			    txg, ub->ub_txg, timestamp, ub->ub_timestamp,
2637			    mmp_config, ub->ub_mmp_config);
2638
2639			error = SET_ERROR(EREMOTEIO);
2640			break;
2641		}
2642
2643		if (mmp_label) {
2644			nvlist_free(mmp_label);
2645			mmp_label = NULL;
2646		}
2647
2648		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
2649		if (error != -1) {
2650			error = SET_ERROR(EINTR);
2651			break;
2652		}
2653		error = 0;
2654	}
2655
2656out:
2657	mutex_exit(&mtx);
2658	mutex_destroy(&mtx);
2659	cv_destroy(&cv);
2660
2661	/*
2662	 * If the pool is determined to be active, store the status in the
2663	 * spa->spa_load_info nvlist.  If the remote hostname or hostid are
2664	 * available from the configuration read from disk, store them as well.
2665	 * This allows 'zpool import' to generate a more useful message.
2666	 *
2667	 * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
2668	 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
2669	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
2670	 */
2671	if (error == EREMOTEIO) {
2672		char *hostname = "<unknown>";
2673		uint64_t hostid = 0;
2674
2675		if (mmp_label) {
2676			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
2677				hostname = fnvlist_lookup_string(mmp_label,
2678				    ZPOOL_CONFIG_HOSTNAME);
2679				fnvlist_add_string(spa->spa_load_info,
2680				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
2681			}
2682
2683			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
2684				hostid = fnvlist_lookup_uint64(mmp_label,
2685				    ZPOOL_CONFIG_HOSTID);
2686				fnvlist_add_uint64(spa->spa_load_info,
2687				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
2688			}
2689		}
2690
2691		fnvlist_add_uint64(spa->spa_load_info,
2692		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
2693		fnvlist_add_uint64(spa->spa_load_info,
2694		    ZPOOL_CONFIG_MMP_TXG, 0);
2695
2696		error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
2697	}
2698
2699	if (mmp_label)
2700		nvlist_free(mmp_label);
2701
2702	return (error);
2703}
2704
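/*
 * Refuse to load a pool whose MOS config indicates it was last accessed by
 * a different host (as determined by the stored hostid).
 */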
2705static int
2706spa_verify_host(spa_t *spa, nvlist_t *mos_config)
2707{
2708	uint64_t hostid;
2709	char *hostname;
2710	uint64_t myhostid = 0;
2711
2712	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
2713	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2714		hostname = fnvlist_lookup_string(mos_config,
2715		    ZPOOL_CONFIG_HOSTNAME);
2716
2717		myhostid = zone_get_hostid(NULL);
2718
2719		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
2720			cmn_err(CE_WARN, "pool '%s' could not be "
2721			    "loaded as it was last accessed by "
2722			    "another system (host: %s hostid: 0x%llx). "
2723			    "See: http://illumos.org/msg/ZFS-8000-EY",
2724			    spa_name(spa), hostname, (u_longlong_t)hostid);
2725			spa_load_failed(spa, "hostid verification failed: pool "
2726			    "last accessed by host: %s (hostid: 0x%llx)",
2727			    hostname, (u_longlong_t)hostid);
2728			return (SET_ERROR(EBADF));
2729		}
2730	}
2731
2732	return (0);
2733}
2734
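/*
 * Parse the (as yet untrusted) config provided for the pool: verify that
 * the required elements are present, make sure the pool is not already
 * imported, set up the "godfather" zios and build the in-core vdev tree.
 */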
2735static int
2736spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
2737{
2738	int error = 0;
2739	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
2740	int parse;
2741	vdev_t *rvd;
2742	uint64_t pool_guid;
2743	char *comment;
2744
2745	/*
2746	 * Versioning wasn't explicitly added to the label until later, so if
2747	 * it's not present, treat it as the initial version.
2748	 */
2749	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2750	    &spa->spa_ubsync.ub_version) != 0)
2751		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2752
2753	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
2754		spa_load_failed(spa, "invalid config provided: '%s' missing",
2755		    ZPOOL_CONFIG_POOL_GUID);
2756		return (SET_ERROR(EINVAL));
2757	}
2758
2759	/*
2760	 * If we are doing an import, ensure that the pool is not already
2761	 * imported by checking if its pool guid already exists in the
2762	 * spa namespace.
2763	 *
2764	 * The only case in which we allow an already imported pool to be
2765	 * imported again is when the pool is checkpointed and we want to
2766	 * look at its checkpointed state from userland tools like zdb.
2767	 */
2768#ifdef _KERNEL
2769	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2770	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2771	    spa_guid_exists(pool_guid, 0)) {
2772#else
2773	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2774	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2775	    spa_guid_exists(pool_guid, 0) &&
2776	    !spa_importing_readonly_checkpoint(spa)) {
2777#endif
2778		spa_load_failed(spa, "a pool with guid %llu is already open",
2779		    (u_longlong_t)pool_guid);
2780		return (SET_ERROR(EEXIST));
2781	}
2782
2783	spa->spa_config_guid = pool_guid;
2784
2785	nvlist_free(spa->spa_load_info);
2786	spa->spa_load_info = fnvlist_alloc();
2787
2788	ASSERT(spa->spa_comment == NULL);
2789	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2790		spa->spa_comment = spa_strdup(comment);
2791
2792	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2793	    &spa->spa_config_txg);
2794
2795	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
2796		spa->spa_config_splitting = fnvlist_dup(nvl);
2797
2798	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
2799		spa_load_failed(spa, "invalid config provided: '%s' missing",
2800		    ZPOOL_CONFIG_VDEV_TREE);
2801		return (SET_ERROR(EINVAL));
2802	}
2803
2804	/*
2805	 * Create "The Godfather" zio to hold all async IOs
2806	 */
2807	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
2808	    KM_SLEEP);
2809	for (int i = 0; i < max_ncpus; i++) {
2810		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
2811		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2812		    ZIO_FLAG_GODFATHER);
2813	}
2814
2815	/*
2816	 * Parse the configuration into a vdev tree.  We explicitly set the
2817	 * value that will be returned by spa_version() since parsing the
2818	 * configuration requires knowing the version number.
2819	 */
2820	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2821	parse = (type == SPA_IMPORT_EXISTING ?
2822	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2823	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
2824	spa_config_exit(spa, SCL_ALL, FTAG);
2825
2826	if (error != 0) {
2827		spa_load_failed(spa, "unable to parse config [error=%d]",
2828		    error);
2829		return (error);
2830	}
2831
2832	ASSERT(spa->spa_root_vdev == rvd);
2833	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
2834	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
2835
2836	if (type != SPA_IMPORT_ASSEMBLE) {
2837		ASSERT(spa_guid(spa) == pool_guid);
2838	}
2839
2840	return (0);
2841}
2842
2843/*
2844 * Recursively open all vdevs in the vdev tree. This function is called twice:
2845 * first with the untrusted config, then with the trusted config.
2846 */
2847static int
2848spa_ld_open_vdevs(spa_t *spa)
2849{
2850	int error = 0;
2851
2852	/*
2853	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
2854	 * missing/unopenable for the root vdev to still be considered openable.
2855	 */
2856	if (spa->spa_trust_config) {
2857		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
2858	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
2859		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
2860	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
2861		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
2862	} else {
2863		spa->spa_missing_tvds_allowed = 0;
2864	}
2865
2866	spa->spa_missing_tvds_allowed =
2867	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
2868
2869	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2870	error = vdev_open(spa->spa_root_vdev);
2871	spa_config_exit(spa, SCL_ALL, FTAG);
2872
2873	if (spa->spa_missing_tvds != 0) {
2874		spa_load_note(spa, "vdev tree has %lld missing top-level "
2875		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
2876		if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
2877			/*
2878			 * Although theoretically we could allow users to open
2879			 * incomplete pools in RW mode, we'd need to add a lot
2880			 * of extra logic (e.g. adjust pool space to account
2881			 * for missing vdevs).
2882			 * This limitation also prevents users from accidentally
2883			 * opening the pool in RW mode during data recovery and
2884			 * damaging it further.
2885			 */
2886			spa_load_note(spa, "pools with missing top-level "
2887			    "vdevs can only be opened in read-only mode.");
2888			error = SET_ERROR(ENXIO);
2889		} else {
2890			spa_load_note(spa, "current settings allow for maximum "
2891			    "%lld missing top-level vdevs at this stage.",
2892			    (u_longlong_t)spa->spa_missing_tvds_allowed);
2893		}
2894	}
2895	if (error != 0) {
2896		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
2897		    error);
2898	}
2899	if (spa->spa_missing_tvds != 0 || error != 0)
2900		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
2901
2902	return (error);
2903}
2904
2905/*
2906 * We need to validate the vdev labels against the configuration that
2907 * we have in hand. This function is called twice: first with an untrusted
2908 * config, then with a trusted config. The validation is more strict when the
2909 * config is trusted.
2910 */
2911static int
2912spa_ld_validate_vdevs(spa_t *spa)
2913{
2914	int error = 0;
2915	vdev_t *rvd = spa->spa_root_vdev;
2916
2917	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2918	error = vdev_validate(rvd);
2919	spa_config_exit(spa, SCL_ALL, FTAG);
2920
2921	if (error != 0) {
2922		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
2923		return (error);
2924	}
2925
2926	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
2927		spa_load_failed(spa, "cannot open vdev tree after invalidating "
2928		    "some vdevs");
2929		vdev_dbgmsg_print_tree(rvd, 2);
2930		return (SET_ERROR(ENXIO));
2931	}
2932
2933	return (0);
2934}
2935
2936static void
2937spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
2938{
2939	spa->spa_state = POOL_STATE_ACTIVE;
2940	spa->spa_ubsync = spa->spa_uberblock;
2941	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2942	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2943	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2944	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2945	spa->spa_claim_max_txg = spa->spa_first_txg;
2946	spa->spa_prev_software_version = ub->ub_software_version;
2947}
2948
2949static int
2950spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
2951{
2952	vdev_t *rvd = spa->spa_root_vdev;
2953	nvlist_t *label;
2954	uberblock_t *ub = &spa->spa_uberblock;
2955	boolean_t activity_check = B_FALSE;
2956
2957	/*
2958	 * If we are opening the checkpointed state of the pool by
2959	 * rewinding to it, at this point we will have written the
2960	 * checkpointed uberblock to the vdev labels, so searching
2961	 * the labels will find the right uberblock.  However, if
2962	 * we are opening the checkpointed state read-only, we have
2963	 * not modified the labels. Therefore, we must ignore the
2964	 * labels and continue using the spa_uberblock that was set
2965	 * by spa_ld_checkpoint_rewind.
2966	 *
2967	 * Note that it would be fine to ignore the labels when
2968	 * rewinding (opening writeable) as well. However, if we
2969	 * crash just after writing the labels, we will end up
2970	 * searching the labels. Doing so in the common case means
2971	 * that this code path gets exercised normally, rather than
2972	 * just in the edge case.
2973	 */
2974	if (ub->ub_checkpoint_txg != 0 &&
2975	    spa_importing_readonly_checkpoint(spa)) {
2976		spa_ld_select_uberblock_done(spa, ub);
2977		return (0);
2978	}
2979
2980	/*
2981	 * Find the best uberblock.
2982	 */
2983	vdev_uberblock_load(rvd, ub, &label);
2984
2985	/*
2986	 * If we weren't able to find a single valid uberblock, return failure.
2987	 */
2988	if (ub->ub_txg == 0) {
2989		nvlist_free(label);
2990		spa_load_failed(spa, "no valid uberblock found");
2991		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2992	}
2993
2994	if (spa->spa_load_max_txg != UINT64_MAX) {
2995		(void) spa_import_progress_set_max_txg(spa,
2996		    (u_longlong_t)spa->spa_load_max_txg);
2997	}
2998	spa_load_note(spa, "using uberblock with txg=%llu",
2999	    (u_longlong_t)ub->ub_txg);
3000
3001	/*
3002	 * For pools which have the multihost property on, determine if the
3003	 * pool is truly inactive and can be safely imported.  Prevent
3004	 * hosts which don't have a hostid set from importing the pool.
3005	 */
3006	activity_check = spa_activity_check_required(spa, ub, label,
3007	    spa->spa_config);
3008	if (activity_check) {
3009		if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
3010		    spa_get_hostid() == 0) {
3011			nvlist_free(label);
3012			fnvlist_add_uint64(spa->spa_load_info,
3013			    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
3014			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
3015		}
3016
3017		int error = spa_activity_check(spa, ub, spa->spa_config);
3018		if (error) {
3019			nvlist_free(label);
3020			return (error);
3021		}
3022
3023		fnvlist_add_uint64(spa->spa_load_info,
3024		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
3025		fnvlist_add_uint64(spa->spa_load_info,
3026		    ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
3027		fnvlist_add_uint16(spa->spa_load_info,
3028		    ZPOOL_CONFIG_MMP_SEQ,
3029		    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
3030	}
3031
3032	/*
3033	 * If the pool has an unsupported version we can't open it.
3034	 */
3035	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
3036		nvlist_free(label);
3037		spa_load_failed(spa, "version %llu is not supported",
3038		    (u_longlong_t)ub->ub_version);
3039		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
3040	}
3041
3042	if (ub->ub_version >= SPA_VERSION_FEATURES) {
3043		nvlist_t *features;
3044
3045		/*
3046		 * If we weren't able to find what's necessary for reading the
3047		 * MOS in the label, return failure.
3048		 */
3049		if (label == NULL) {
3050			spa_load_failed(spa, "label config unavailable");
3051			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3052			    ENXIO));
3053		}
3054
3055		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
3056		    &features) != 0) {
3057			nvlist_free(label);
3058			spa_load_failed(spa, "invalid label: '%s' missing",
3059			    ZPOOL_CONFIG_FEATURES_FOR_READ);
3060			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3061			    ENXIO));
3062		}
3063
3064		/*
3065		 * Update our in-core representation with the definitive values
3066		 * from the label.
3067		 */
3068		nvlist_free(spa->spa_label_features);
3069		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
3070	}
3071
3072	nvlist_free(label);
3073
3074	/*
3075	 * Look through entries in the label nvlist's features_for_read. If
3076	 * there is a feature listed there which we don't understand, then we
3077	 * cannot open the pool.
3078	 */
3079	if (ub->ub_version >= SPA_VERSION_FEATURES) {
3080		nvlist_t *unsup_feat;
3081
3082		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
3083		    0);
3084
3085		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
3086		    NULL); nvp != NULL;
3087		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
3088			if (!zfeature_is_supported(nvpair_name(nvp))) {
3089				VERIFY(nvlist_add_string(unsup_feat,
3090				    nvpair_name(nvp), "") == 0);
3091			}
3092		}
3093
3094		if (!nvlist_empty(unsup_feat)) {
3095			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
3096			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
3097			nvlist_free(unsup_feat);
3098			spa_load_failed(spa, "some features are unsupported");
3099			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
3100			    ENOTSUP));
3101		}
3102
3103		nvlist_free(unsup_feat);
3104	}
3105
3106	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
3107		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3108		spa_try_repair(spa, spa->spa_config);
3109		spa_config_exit(spa, SCL_ALL, FTAG);
3110		nvlist_free(spa->spa_config_splitting);
3111		spa->spa_config_splitting = NULL;
3112	}
3113
3114	/*
3115	 * Initialize internal SPA structures.
3116	 */
3117	spa_ld_select_uberblock_done(spa, ub);
3118
3119	return (0);
3120}
3121
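/*
 * Initialize the DSL pool from the root block pointer of the selected
 * uberblock, giving access to the MOS (spa_meta_objset).
 */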
3122static int
3123spa_ld_open_rootbp(spa_t *spa)
3124{
3125	int error = 0;
3126	vdev_t *rvd = spa->spa_root_vdev;
3127
3128	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
3129	if (error != 0) {
3130		spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
3131		    "[error=%d]", error);
3132		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3133	}
3134	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
3135
3136	return (0);
3137}
3138
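/*
 * Replace the untrusted vdev tree with one built from the config stored in
 * the MOS, update vdev paths from the provided config, and then re-open
 * and re-validate the resulting tree.
 */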
3139static int
3140spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
3141    boolean_t reloading)
3142{
3143	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
3144	nvlist_t *nv, *mos_config, *policy;
3145	int error = 0, copy_error;
3146	uint64_t healthy_tvds, healthy_tvds_mos;
3147	uint64_t mos_config_txg;
3148
3149	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
3150	    != 0)
3151		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3152
3153	/*
3154	 * If we're assembling a pool from a split, the config provided is
3155	 * already trusted so there is nothing to do.
3156	 */
3157	if (type == SPA_IMPORT_ASSEMBLE)
3158		return (0);
3159
3160	healthy_tvds = spa_healthy_core_tvds(spa);
3161
3162	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
3163	    != 0) {
3164		spa_load_failed(spa, "unable to retrieve MOS config");
3165		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3166	}
3167
3168	/*
3169	 * If we are doing an open, the pool owner wasn't verified yet, so
3170	 * do the verification here.
3171	 */
3172	if (spa->spa_load_state == SPA_LOAD_OPEN) {
3173		error = spa_verify_host(spa, mos_config);
3174		if (error != 0) {
3175			nvlist_free(mos_config);
3176			return (error);
3177		}
3178	}
3179
3180	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
3181
3182	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3183
3184	/*
3185	 * Build a new vdev tree from the trusted config
3186	 */
3187	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
3188
3189	/*
3190	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
3191	 * obtained by scanning /dev/dsk, then it will have the right vdev
3192	 * paths. We update the trusted MOS config with this information.
3193	 * We first try to copy the paths with vdev_copy_path_strict, which
3194	 * succeeds only when both configs have exactly the same vdev tree.
3195	 * If that fails, we fall back to a more flexible method that has a
3196	 * best effort policy.
3197	 */
3198	copy_error = vdev_copy_path_strict(rvd, mrvd);
3199	if (copy_error != 0 || spa_load_print_vdev_tree) {
3200		spa_load_note(spa, "provided vdev tree:");
3201		vdev_dbgmsg_print_tree(rvd, 2);
3202		spa_load_note(spa, "MOS vdev tree:");
3203		vdev_dbgmsg_print_tree(mrvd, 2);
3204	}
3205	if (copy_error != 0) {
3206		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
3207		    "back to vdev_copy_path_relaxed");
3208		vdev_copy_path_relaxed(rvd, mrvd);
3209	}
3210
3211	vdev_close(rvd);
3212	vdev_free(rvd);
3213	spa->spa_root_vdev = mrvd;
3214	rvd = mrvd;
3215	spa_config_exit(spa, SCL_ALL, FTAG);
3216
3217	/*
3218	 * We will use spa_config if we decide to reload the spa or if spa_load
3219	 * fails and we rewind. We must thus regenerate the config using the
3220	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
3221	 * pass settings on how to load the pool and is not stored in the MOS.
3222	 * We copy it over to our new, trusted config.
3223	 */
3224	mos_config_txg = fnvlist_lookup_uint64(mos_config,
3225	    ZPOOL_CONFIG_POOL_TXG);
3226	nvlist_free(mos_config);
3227	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
3228	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
3229	    &policy) == 0)
3230		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
3231	spa_config_set(spa, mos_config);
3232	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
3233
3234	/*
3235	 * Now that we have the config from the MOS, we should be more strict
3236	 * in checking blkptrs and can make assumptions about the consistency
3237	 * of the vdev tree. spa_trust_config must be set to true before opening
3238	 * vdevs in order for them to be writeable.
3239	 */
3240	spa->spa_trust_config = B_TRUE;
3241
3242	/*
3243	 * Open and validate the new vdev tree
3244	 */
3245	error = spa_ld_open_vdevs(spa);
3246	if (error != 0)
3247		return (error);
3248
3249	error = spa_ld_validate_vdevs(spa);
3250	if (error != 0)
3251		return (error);
3252
3253	if (copy_error != 0 || spa_load_print_vdev_tree) {
3254		spa_load_note(spa, "final vdev tree:");
3255		vdev_dbgmsg_print_tree(rvd, 2);
3256	}
3257
3258	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
3259	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
3260		/*
3261		 * Sanity check to make sure that we are indeed loading the
3262		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
3263		 * in the config provided and they happened to be the only ones
3264		 * to have the latest uberblock, we could involuntarily perform
3265		 * an extreme rewind.
3266		 */
3267		healthy_tvds_mos = spa_healthy_core_tvds(spa);
3268		if (healthy_tvds_mos - healthy_tvds >=
3269		    SPA_SYNC_MIN_VDEVS) {
3270			spa_load_note(spa, "config provided misses too many "
3271			    "top-level vdevs compared to MOS (%lld vs %lld). ",
3272			    (u_longlong_t)healthy_tvds,
3273			    (u_longlong_t)healthy_tvds_mos);
3274			spa_load_note(spa, "vdev tree:");
3275			vdev_dbgmsg_print_tree(rvd, 2);
3276			if (reloading) {
3277				spa_load_failed(spa, "config was already "
3278				    "provided from MOS. Aborting.");
3279				return (spa_vdev_err(rvd,
3280				    VDEV_AUX_CORRUPT_DATA, EIO));
3281			}
3282			spa_load_note(spa, "spa must be reloaded using MOS "
3283			    "config");
3284			return (SET_ERROR(EAGAIN));
3285		}
3286	}
3287
3288	error = spa_check_for_missing_logs(spa);
3289	if (error != 0)
3290		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
3291
3292	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
3293		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
3294		    "guid sum (%llu != %llu)",
3295		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
3296		    (u_longlong_t)rvd->vdev_guid_sum);
3297		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
3298		    ENXIO));
3299	}
3300
3301	return (0);
3302}
3303
3304static int
3305spa_ld_open_indirect_vdev_metadata(spa_t *spa)
3306{
3307	int error = 0;
3308	vdev_t *rvd = spa->spa_root_vdev;
3309
3310	/*
3311	 * Everything that we read before spa_remove_init() must be stored
3312	 * on concrete vdevs.  Therefore we do this as early as possible.
3313	 */
3314	error = spa_remove_init(spa);
3315	if (error != 0) {
3316		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
3317		    error);
3318		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3319	}
3320
3321	/*
3322	 * Retrieve information needed to condense indirect vdev mappings.
3323	 */
3324	error = spa_condense_init(spa);
3325	if (error != 0) {
3326		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
3327		    error);
3328		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3329	}
3330
3331	return (0);
3332}
3333
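/*
 * Check the feature flags recorded in the MOS against those this build of
 * ZFS supports, recording what is enabled or unsupported in spa_load_info
 * and failing the load if required features are missing.
 */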
3334static int
3335spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
3336{
3337	int error = 0;
3338	vdev_t *rvd = spa->spa_root_vdev;
3339
3340	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
3341		boolean_t missing_feat_read = B_FALSE;
3342		nvlist_t *unsup_feat, *enabled_feat;
3343
3344		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
3345		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
3346			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3347		}
3348
3349		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
3350		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
3351			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3352		}
3353
3354		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
3355		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
3356			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3357		}
3358
3359		enabled_feat = fnvlist_alloc();
3360		unsup_feat = fnvlist_alloc();
3361
3362		if (!spa_features_check(spa, B_FALSE,
3363		    unsup_feat, enabled_feat))
3364			missing_feat_read = B_TRUE;
3365
3366		if (spa_writeable(spa) ||
3367		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
3368			if (!spa_features_check(spa, B_TRUE,
3369			    unsup_feat, enabled_feat)) {
3370				*missing_feat_writep = B_TRUE;
3371			}
3372		}
3373
3374		fnvlist_add_nvlist(spa->spa_load_info,
3375		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
3376
3377		if (!nvlist_empty(unsup_feat)) {
3378			fnvlist_add_nvlist(spa->spa_load_info,
3379			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
3380		}
3381
3382		fnvlist_free(enabled_feat);
3383		fnvlist_free(unsup_feat);
3384
3385		if (!missing_feat_read) {
3386			fnvlist_add_boolean(spa->spa_load_info,
3387			    ZPOOL_CONFIG_CAN_RDONLY);
3388		}
3389
3390		/*
3391		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
3392		 * twofold: to determine whether the pool is available for
3393		 * import in read-write mode and (if it is not) whether the
3394		 * pool is available for import in read-only mode. If the pool
3395		 * is available for import in read-write mode, it is displayed
3396		 * as available in userland; if it is not available for import
3397		 * in read-only mode, it is displayed as unavailable in
3398		 * userland. If the pool is available for import in read-only
3399		 * mode but not read-write mode, it is displayed as unavailable
3400		 * in userland with a special note that the pool is actually
3401		 * available for open in read-only mode.
3402		 *
3403		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
3404		 * missing a feature for write, we must first determine whether
3405		 * the pool can be opened read-only before returning to
3406		 * userland in order to know whether to display the
3407		 * abovementioned note.
3408		 */
3409		if (missing_feat_read || (*missing_feat_writep &&
3410		    spa_writeable(spa))) {
3411			spa_load_failed(spa, "pool uses unsupported features");
3412			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
3413			    ENOTSUP));
3414		}
3415
3416		/*
3417		 * Load refcounts for ZFS features from disk into an in-memory
3418		 * cache during SPA initialization.
3419		 */
3420		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
3421			uint64_t refcount;
3422
3423			error = feature_get_refcount_from_disk(spa,
3424			    &spa_feature_table[i], &refcount);
3425			if (error == 0) {
3426				spa->spa_feat_refcount_cache[i] = refcount;
3427			} else if (error == ENOTSUP) {
3428				spa->spa_feat_refcount_cache[i] =
3429				    SPA_FEATURE_DISABLED;
3430			} else {
3431				spa_load_failed(spa, "error getting refcount "
3432				    "for feature %s [error=%d]",
3433				    spa_feature_table[i].fi_guid, error);
3434				return (spa_vdev_err(rvd,
3435				    VDEV_AUX_CORRUPT_DATA, EIO));
3436			}
3437		}
3438	}
3439
3440	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
3441		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
3442		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
3443			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3444	}
3445
3446	/*
3447	 * Encryption was added before bookmark_v2, even though bookmark_v2
3448	 * is now a dependency. If this pool has encryption enabled without
3449	 * bookmark_v2, trigger an errata message.
3450	 */
3451	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
3452	    !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
3453		spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
3454	}
3455
3456	return (0);
3457}
3458
3459static int
3460spa_ld_load_special_directories(spa_t *spa)
3461{
3462	int error = 0;
3463	vdev_t *rvd = spa->spa_root_vdev;
3464
3465	spa->spa_is_initializing = B_TRUE;
3466	error = dsl_pool_open(spa->spa_dsl_pool);
3467	spa->spa_is_initializing = B_FALSE;
3468	if (error != 0) {
3469		spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
3470		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3471	}
3472
3473	return (0);
3474}
3475
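/*
 * Retrieve miscellaneous pool-wide state from the MOS: the checksum salt,
 * the deferred-frees bpobj, the error logs, the history object, the
 * per-vdev ZAP map and the pool properties.
 */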
3476static int
3477spa_ld_get_props(spa_t *spa)
3478{
3479	int error = 0;
3480	uint64_t obj;
3481	vdev_t *rvd = spa->spa_root_vdev;
3482
3483	/* Grab the secret checksum salt from the MOS. */
3484	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3485	    DMU_POOL_CHECKSUM_SALT, 1,
3486	    sizeof (spa->spa_cksum_salt.zcs_bytes),
3487	    spa->spa_cksum_salt.zcs_bytes);
3488	if (error == ENOENT) {
3489		/* Generate a new salt for subsequent use */
3490		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
3491		    sizeof (spa->spa_cksum_salt.zcs_bytes));
3492	} else if (error != 0) {
3493		spa_load_failed(spa, "unable to retrieve checksum salt from "
3494		    "MOS [error=%d]", error);
3495		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3496	}
3497
3498	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
3499		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3500	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
3501	if (error != 0) {
3502		spa_load_failed(spa, "error opening deferred-frees bpobj "
3503		    "[error=%d]", error);
3504		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3505	}
3506
3507	/*
3508	 * Load the bit that tells us to use the new accounting function
3509	 * (raid-z deflation).  If we have an older pool, this will not
3510	 * be present.
3511	 */
3512	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
3513	if (error != 0 && error != ENOENT)
3514		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3515
3516	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
3517	    &spa->spa_creation_version, B_FALSE);
3518	if (error != 0 && error != ENOENT)
3519		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3520
3521	/*
3522	 * Load the persistent error log.  If we have an older pool, this will
3523	 * not be present.
3524	 */
3525	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
3526	    B_FALSE);
3527	if (error != 0 && error != ENOENT)
3528		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3529
3530	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
3531	    &spa->spa_errlog_scrub, B_FALSE);
3532	if (error != 0 && error != ENOENT)
3533		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3534
3535	/*
3536	 * Load the history object.  If we have an older pool, this
3537	 * will not be present.
3538	 */
3539	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
3540	if (error != 0 && error != ENOENT)
3541		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3542
3543	/*
3544	 * Load the per-vdev ZAP map. If we have an older pool, this will not
3545	 * be present; in this case, defer its creation to a later time to
3546	 * avoid dirtying the MOS this early (outside of sync context).  See
3547	 * spa_sync_config_object.
3548	 */
3549
3550	/* The sentinel is only available in the MOS config. */
3551	nvlist_t *mos_config;
3552	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
3553		spa_load_failed(spa, "unable to retrieve MOS config");
3554		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3555	}
3556
3557	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
3558	    &spa->spa_all_vdev_zaps, B_FALSE);
3559
3560	if (error == ENOENT) {
3561		VERIFY(!nvlist_exists(mos_config,
3562		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
3563		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
3564		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3565	} else if (error != 0) {
3566		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3567	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
3568		/*
3569		 * An older version of ZFS overwrote the sentinel value, so
3570		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
3571		 * destruction to later; see spa_sync_config_object.
3572		 */
3573		spa->spa_avz_action = AVZ_ACTION_DESTROY;
3574		/*
3575		 * We're assuming that no vdevs have had their ZAPs created
3576		 * before this. Better be sure of it.
3577		 */
3578		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3579	}
3580	nvlist_free(mos_config);
3581
3582	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3583
3584	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
3585	    B_FALSE);
3586	if (error && error != ENOENT)
3587		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3588
3589	if (error == 0) {
3590		uint64_t autoreplace;
3591
3592		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
3593		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
3594		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
3595		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
3596		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
3597		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
3598		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
3599		    &spa->spa_dedup_ditto);
3600		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
3601		spa->spa_autoreplace = (autoreplace != 0);
3602	}
3603
3604	/*
3605	 * If we are importing a pool with missing top-level vdevs,
3606	 * we enforce that the pool doesn't panic or get suspended on
3607	 * error since the likelihood of missing data is extremely high.
3608	 */
3609	if (spa->spa_missing_tvds > 0 &&
3610	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
3611	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3612		spa_load_note(spa, "forcing failmode to 'continue' "
3613		    "as some top level vdevs are missing");
3614		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
3615	}
3616
3617	return (0);
3618}
3619
3620static int
3621spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
3622{
3623	int error = 0;
3624	vdev_t *rvd = spa->spa_root_vdev;
3625
3626	/*
3627	 * If we're assembling the pool from the split-off vdevs of
3628	 * an existing pool, we don't want to attach the spares & cache
3629	 * devices.
3630	 */
3631
3632	/*
3633	 * Load any hot spares for this pool.
3634	 */
3635	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
3636	    B_FALSE);
3637	if (error != 0 && error != ENOENT)
3638		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3639	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3640		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
3641		if (load_nvlist(spa, spa->spa_spares.sav_object,
3642		    &spa->spa_spares.sav_config) != 0) {
3643			spa_load_failed(spa, "error loading spares nvlist");
3644			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3645		}
3646
3647		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3648		spa_load_spares(spa);
3649		spa_config_exit(spa, SCL_ALL, FTAG);
3650	} else if (error == 0) {
3651		spa->spa_spares.sav_sync = B_TRUE;
3652	}
3653
3654	/*
3655	 * Load any level 2 ARC devices for this pool.
3656	 */
3657	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
3658	    &spa->spa_l2cache.sav_object, B_FALSE);
3659	if (error != 0 && error != ENOENT)
3660		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3661	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3662		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
3663		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
3664		    &spa->spa_l2cache.sav_config) != 0) {
3665			spa_load_failed(spa, "error loading l2cache nvlist");
3666			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3667		}
3668
3669		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3670		spa_load_l2cache(spa);
3671		spa_config_exit(spa, SCL_ALL, FTAG);
3672	} else if (error == 0) {
3673		spa->spa_l2cache.sav_sync = B_TRUE;
3674	}
3675
3676	return (0);
3677}
3678
3679static int
3680spa_ld_load_vdev_metadata(spa_t *spa)
3681{
3682	int error = 0;
3683	vdev_t *rvd = spa->spa_root_vdev;
3684
3685	/*
3686	 * If the 'multihost' property is set, then never allow a pool to
3687	 * be imported when the system hostid is zero.  The exception to
3688	 * this rule is zdb, which is always allowed to access pools.
3689	 */
3690	if (spa_multihost(spa) && spa_get_hostid() == 0 &&
3691	    (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
3692		fnvlist_add_uint64(spa->spa_load_info,
3693		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
3694		return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
3695	}
3696
3697	/*
3698	 * If the 'autoreplace' property is set, then post a resource notifying
3699	 * the ZFS DE that it should not issue any faults for unopenable
3700	 * devices.  We also iterate over the vdevs, and post a sysevent for any
3701	 * unopenable vdevs so that the normal autoreplace handler can take
3702	 * over.
3703	 */
3704	if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3705		spa_check_removed(spa->spa_root_vdev);
3706		/*
3707		 * For the import case, this is done in spa_import(), because
3708		 * at this point we're using the spare definitions from
3709		 * the MOS config, not necessarily from the userland config.
3710		 */
3711		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
3712			spa_aux_check_removed(&spa->spa_spares);
3713			spa_aux_check_removed(&spa->spa_l2cache);
3714		}
3715	}
3716
3717	/*
3718	 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
3719	 */
3720	error = vdev_load(rvd);
3721	if (error != 0) {
3722		spa_load_failed(spa, "vdev_load failed [error=%d]", error);
3723		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3724	}
3725
3726	error = spa_ld_log_spacemaps(spa);
3727	if (error != 0) {
3728		spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
3729		    error);
3730		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3731	}
3732
3733	/*
3734	 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
3735	 */
3736	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3737	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
3738	spa_config_exit(spa, SCL_ALL, FTAG);
3739
3740	return (0);
3741}
3742
3743static int
3744spa_ld_load_dedup_tables(spa_t *spa)
3745{
3746	int error = 0;
3747	vdev_t *rvd = spa->spa_root_vdev;
3748
3749	error = ddt_load(spa);
3750	if (error != 0) {
3751		spa_load_failed(spa, "ddt_load failed [error=%d]", error);
3752		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3753	}
3754
3755	return (0);
3756}
3757
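/*
 * Check the intent logs.  If some top-level vdevs are missing, damaged logs
 * are tolerated and will simply be dropped; otherwise a failed log check is
 * a fatal load error.
 */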
3758static int
3759spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
3760{
3761	vdev_t *rvd = spa->spa_root_vdev;
3762
3763	if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
3764		boolean_t missing = spa_check_logs(spa);
3765		if (missing) {
3766			if (spa->spa_missing_tvds != 0) {
3767				spa_load_note(spa, "spa_check_logs failed "
3768				    "so dropping the logs");
3769			} else {
3770				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
3771				spa_load_failed(spa, "spa_check_logs failed");
3772				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
3773				    ENXIO));
3774			}
3775		}
3776	}
3777
3778	return (0);
3779}
3780
3781static int
3782spa_ld_verify_pool_data(spa_t *spa)
3783{
3784	int error = 0;
3785	vdev_t *rvd = spa->spa_root_vdev;
3786
3787	/*
3788	 * We've successfully opened the pool; verify that we're ready
3789	 * to start pushing transactions.
3790	 */
3791	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3792		error = spa_load_verify(spa);
3793		if (error != 0) {
3794			spa_load_failed(spa, "spa_load_verify failed "
3795			    "[error=%d]", error);
3796			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3797			    error));
3798		}
3799	}
3800
3801	return (0);
3802}
3803
3804static void
3805spa_ld_claim_log_blocks(spa_t *spa)
3806{
3807	dmu_tx_t *tx;
3808	dsl_pool_t *dp = spa_get_dsl(spa);
3809
3810	/*
3811	 * Claim log blocks that haven't been committed yet.
3812	 * This must all happen in a single txg.
3813	 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
3814	 * invoked from zil_claim_log_block()'s i/o done callback.
3815	 * Price of rollback is that we abandon the log.
3816	 */
3817	spa->spa_claiming = B_TRUE;
3818
3819	tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
3820	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
3821	    zil_claim, tx, DS_FIND_CHILDREN);
3822	dmu_tx_commit(tx);
3823
3824	spa->spa_claiming = B_FALSE;
3825
3826	spa_set_log_state(spa, SPA_LOG_GOOD);
3827}
3828
3829static void
3830spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
3831    boolean_t update_config_cache)
3832{
3833	vdev_t *rvd = spa->spa_root_vdev;
3834	int need_update = B_FALSE;
3835
3836	/*
3837	 * If the config cache is stale, or we have uninitialized
3838	 * metaslabs (see spa_vdev_add()), then update the config.
3839	 *
3840	 * If this is a verbatim import, trust the current
3841	 * in-core spa_config and update the disk labels.
3842	 */
3843	if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
3844	    spa->spa_load_state == SPA_LOAD_IMPORT ||
3845	    spa->spa_load_state == SPA_LOAD_RECOVER ||
3846	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
3847		need_update = B_TRUE;
3848
3849	for (int c = 0; c < rvd->vdev_children; c++)
3850		if (rvd->vdev_child[c]->vdev_ms_array == 0)
3851			need_update = B_TRUE;
3852
3853	/*
3854	 * Update the config cache asynchronously in case we're the
3855	 * root pool, in which case the config cache isn't writable yet.
3856	 */
3857	if (need_update)
3858		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3859}
3860
3861static void
3862spa_ld_prepare_for_reload(spa_t *spa)
3863{
3864	int mode = spa->spa_mode;
3865	int async_suspended = spa->spa_async_suspended;
3866
3867	spa_unload(spa);
3868	spa_deactivate(spa);
3869	spa_activate(spa, mode);
3870
3871	/*
3872	 * We save the value of spa_async_suspended as it gets reset to 0 by
3873	 * spa_unload(). We want to restore it to its original value before
3874	 * returning, as we might call spa_async_resume() later.
3875	 */
3876	spa->spa_async_suspended = async_suspended;
3877}
3878
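/*
 * Look up the checkpointed uberblock in the MOS, if one exists, and record
 * its txg and timestamp in the spa.  A missing entry simply means the pool
 * has no checkpoint.
 */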
3879static int
3880spa_ld_read_checkpoint_txg(spa_t *spa)
3881{
3882	uberblock_t checkpoint;
3883	int error = 0;
3884
3885	ASSERT0(spa->spa_checkpoint_txg);
3886	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3887
3888	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3889	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3890	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3891
3892	if (error == ENOENT)
3893		return (0);
3894
3895	if (error != 0)
3896		return (error);
3897
3898	ASSERT3U(checkpoint.ub_txg, !=, 0);
3899	ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
3900	ASSERT3U(checkpoint.ub_timestamp, !=, 0);
3901	spa->spa_checkpoint_txg = checkpoint.ub_txg;
3902	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
3903
3904	return (0);
3905}
3906
3907static int
3908spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
3909{
3910	int error = 0;
3911
3912	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3913	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3914
3915	/*
3916	 * Never trust the config that is provided unless we are assembling
3917	 * a pool following a split.
3918	 * This means don't trust blkptrs and the vdev tree in general. This
3919	 * also effectively puts the spa in read-only mode since
3920	 * spa_writeable() checks for spa_trust_config to be true.
3921	 * We will later load a trusted config from the MOS.
3922	 */
3923	if (type != SPA_IMPORT_ASSEMBLE)
3924		spa->spa_trust_config = B_FALSE;
3925
3926	/*
3927	 * Parse the config provided to create a vdev tree.
3928	 */
3929	error = spa_ld_parse_config(spa, type);
3930	if (error != 0)
3931		return (error);
3932
3933	spa_import_progress_add(spa);
3934
3935	/*
3936	 * Now that we have the vdev tree, try to open each vdev. This involves
3937	 * opening the underlying physical device, retrieving its geometry and
3938	 * probing the vdev with a dummy I/O. The state of each vdev will be set
3939	 * based on the success of those operations. After this we'll be ready
3940	 * to read from the vdevs.
3941	 */
3942	error = spa_ld_open_vdevs(spa);
3943	if (error != 0)
3944		return (error);
3945
3946	/*
3947	 * Read the label of each vdev and make sure that the GUIDs stored
3948	 * there match the GUIDs in the config provided.
3949	 * If we're assembling a new pool that's been split off from an
3950	 * existing pool, the labels haven't yet been updated so we skip
3951	 * validation for now.
3952	 */
3953	if (type != SPA_IMPORT_ASSEMBLE) {
3954		error = spa_ld_validate_vdevs(spa);
3955		if (error != 0)
3956			return (error);
3957	}
3958
3959	/*
3960	 * Read all vdev labels to find the best uberblock (i.e. latest,
3961	 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
3962	 * get the list of features required to read blkptrs in the MOS from
3963	 * the vdev label with the best uberblock and verify that our version
3964	 * of zfs supports them all.
3965	 */
3966	error = spa_ld_select_uberblock(spa, type);
3967	if (error != 0)
3968		return (error);
3969
3970	/*
3971	 * Pass that uberblock to the dsl_pool layer which will open the root
3972	 * blkptr. This blkptr points to the latest version of the MOS and will
3973	 * allow us to read its contents.
3974	 */
3975	error = spa_ld_open_rootbp(spa);
3976	if (error != 0)
3977		return (error);
3978
3979	return (0);
3980}
3981
3982static int
3983spa_ld_checkpoint_rewind(spa_t *spa)
3984{
3985	uberblock_t checkpoint;
3986	int error = 0;
3987
3988	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3989	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3990
3991	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3992	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3993	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3994
3995	if (error != 0) {
3996		spa_load_failed(spa, "unable to retrieve checkpointed "
3997		    "uberblock from the MOS config [error=%d]", error);
3998
3999		if (error == ENOENT)
4000			error = ZFS_ERR_NO_CHECKPOINT;
4001
4002		return (error);
4003	}
4004
4005	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
4006	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
4007
4008	/*
4009	 * We need to update the txg and timestamp of the checkpointed
4010	 * uberblock to be higher than the latest one. This ensures that
4011	 * the checkpointed uberblock is selected if we were to close and
4012	 * reopen the pool right after we've written it in the vdev labels.
4013	 * (also see block comment in vdev_uberblock_compare)
4014	 */
4015	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
4016	checkpoint.ub_timestamp = gethrestime_sec();
4017
4018	/*
4019	 * Set current uberblock to be the checkpointed uberblock.
4020	 */
4021	spa->spa_uberblock = checkpoint;
4022
4023	/*
4024	 * If we are doing a normal rewind, then the pool is open for
4025	 * writing and we sync the "updated" checkpointed uberblock to
4026	 * disk. Once this is done, we've basically rewound the whole
4027	 * pool and there is no way back.
4028	 *
4029	 * There are cases when we don't want to attempt to sync the
4030	 * checkpointed uberblock to disk because we are opening a
4031	 * pool as read-only. Specifically, verifying the checkpointed
4032	 * state with zdb, and importing the checkpointed state to get
4033	 * a "preview" of its content.
4034	 */
4035	if (spa_writeable(spa)) {
4036		vdev_t *rvd = spa->spa_root_vdev;
4037
4038		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4039		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
4040		int svdcount = 0;
4041		int children = rvd->vdev_children;
4042		int c0 = spa_get_random(children);
4043
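		/*
		 * Starting from a random child, pick up to SPA_SYNC_MIN_VDEVS
		 * concrete, non-log top-level vdevs (skipping any without a
		 * metaslab array) to receive the updated uberblock in their
		 * labels.
		 */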
4044		for (int c = 0; c < children; c++) {
4045			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
4046
4047			/* Stop when revisiting the first vdev */
4048			if (c > 0 && svd[0] == vd)
4049				break;
4050
4051			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
4052			    !vdev_is_concrete(vd))
4053				continue;
4054
4055			svd[svdcount++] = vd;
4056			if (svdcount == SPA_SYNC_MIN_VDEVS)
4057				break;
4058		}
4059		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
4060		if (error == 0)
4061			spa->spa_last_synced_guid = rvd->vdev_guid;
4062		spa_config_exit(spa, SCL_ALL, FTAG);
4063
4064		if (error != 0) {
4065			spa_load_failed(spa, "failed to write checkpointed "
4066			    "uberblock to the vdev labels [error=%d]", error);
4067			return (error);
4068		}
4069	}
4070
4071	return (0);
4072}
4073
4074static int
4075spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
4076    boolean_t *update_config_cache)
4077{
4078	int error;
4079
4080	/*
4081	 * Parse the config for the pool, open and validate vdevs,
4082	 * select an uberblock, and use that uberblock to open
4083	 * the MOS.
4084	 */
4085	error = spa_ld_mos_init(spa, type);
4086	if (error != 0)
4087		return (error);
4088
4089	/*
4090	 * Retrieve the trusted config stored in the MOS and use it to create
4091	 * a new, exact version of the vdev tree, then reopen all vdevs.
4092	 */
4093	error = spa_ld_trusted_config(spa, type, B_FALSE);
4094	if (error == EAGAIN) {
4095		if (update_config_cache != NULL)
4096			*update_config_cache = B_TRUE;
4097
4098		/*
4099		 * Redo the loading process with the trusted config if it is
4100		 * too different from the untrusted config.
4101		 */
4102		spa_ld_prepare_for_reload(spa);
4103		spa_load_note(spa, "RELOADING");
4104		error = spa_ld_mos_init(spa, type);
4105		if (error != 0)
4106			return (error);
4107
4108		error = spa_ld_trusted_config(spa, type, B_TRUE);
4109		if (error != 0)
4110			return (error);
4111
4112	} else if (error != 0) {
4113		return (error);
4114	}
4115
4116	return (0);
4117}
4118
4119/*
4120 * Load an existing storage pool, using the config provided. This config
4121 * describes which vdevs are part of the pool and is later validated against
4122 * partial configs present in each vdev's label and an entire copy of the
4123 * config stored in the MOS.
4124 */
4125static int
4126spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
4127{
4128	int error = 0;
4129	boolean_t missing_feat_write = B_FALSE;
4130	boolean_t checkpoint_rewind =
4131	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4132	boolean_t update_config_cache = B_FALSE;
4133
4134	ASSERT(MUTEX_HELD(&spa_namespace_lock));
4135	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
4136
4137	spa_load_note(spa, "LOADING");
4138
4139	error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
4140	if (error != 0)
4141		return (error);
4142
4143	/*
4144	 * If we are rewinding to the checkpoint then we need to repeat
4145	 * everything we've done so far in this function but this time
4146	 * selecting the checkpointed uberblock and using that to open
4147	 * the MOS.
4148	 */
4149	if (checkpoint_rewind) {
4150		/*
4151		 * If we are rewinding to the checkpoint, always update the
4152		 * config cache.
4153		 */
4154		update_config_cache = B_TRUE;
4155
4156		/*
4157		 * Extract the checkpointed uberblock from the current MOS
4158		 * and use this as the pool's uberblock from now on. If the
4159		 * pool is imported as writeable we also write the checkpoint
4160		 * uberblock to the labels, making the rewind permanent.
4161		 */
4162		error = spa_ld_checkpoint_rewind(spa);
4163		if (error != 0)
4164			return (error);
4165
4166		/*
4167		 * Redo the loading process with the
4168		 * checkpointed uberblock.
4169		 */
4170		spa_ld_prepare_for_reload(spa);
4171		spa_load_note(spa, "LOADING checkpointed uberblock");
4172		error = spa_ld_mos_with_trusted_config(spa, type, NULL);
4173		if (error != 0)
4174			return (error);
4175	}
4176
4177	/*
4178	 * Retrieve the checkpoint txg if the pool has a checkpoint.
4179	 */
4180	error = spa_ld_read_checkpoint_txg(spa);
4181	if (error != 0)
4182		return (error);
4183
4184	/*
4185	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
4186	 * from the pool and their contents were re-mapped to other vdevs. Note
4187	 * that everything that we read before this step must have been
4188	 * rewritten on concrete vdevs after the last device removal was
4189	 * initiated. Otherwise we could be reading from indirect vdevs before
4190	 * we have loaded their mappings.
4191	 */
4192	error = spa_ld_open_indirect_vdev_metadata(spa);
4193	if (error != 0)
4194		return (error);
4195
4196	/*
4197	 * Retrieve the full list of active features from the MOS and check if
4198	 * they are all supported.
4199	 */
4200	error = spa_ld_check_features(spa, &missing_feat_write);
4201	if (error != 0)
4202		return (error);
4203
4204	/*
4205	 * Load several special directories from the MOS needed by the dsl_pool
4206	 * layer.
4207	 */
4208	error = spa_ld_load_special_directories(spa);
4209	if (error != 0)
4210		return (error);
4211
4212	/*
4213	 * Retrieve pool properties from the MOS.
4214	 */
4215	error = spa_ld_get_props(spa);
4216	if (error != 0)
4217		return (error);
4218
4219	/*
4220	 * Retrieve the list of auxiliary devices - cache devices and spares -
4221	 * and open them.
4222	 */
4223	error = spa_ld_open_aux_vdevs(spa, type);
4224	if (error != 0)
4225		return (error);
4226
4227	/*
4228	 * Load the metadata for all vdevs. Also check if unopenable devices
4229	 * should be autoreplaced.
4230	 */
4231	error = spa_ld_load_vdev_metadata(spa);
4232	if (error != 0)
4233		return (error);
4234
4235	error = spa_ld_load_dedup_tables(spa);
4236	if (error != 0)
4237		return (error);
4238
4239	/*
4240	 * Verify the logs now to make sure we don't have any unexpected errors
4241	 * when we claim log blocks later.
4242	 */
4243	error = spa_ld_verify_logs(spa, type, ereport);
4244	if (error != 0)
4245		return (error);
4246
4247	if (missing_feat_write) {
4248		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
4249
4250		/*
4251		 * At this point, we know that we can open the pool in
4252		 * read-only mode but not read-write mode. We now have enough
4253		 * information and can return to userland.
4254		 */
4255		return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
4256		    ENOTSUP));
4257	}
4258
4259	/*
4260	 * Traverse the last txgs to make sure the pool was left off in a safe
4261	 * state. When performing an extreme rewind, we verify the whole pool,
4262	 * which can take a very long time.
4263	 */
4264	error = spa_ld_verify_pool_data(spa);
4265	if (error != 0)
4266		return (error);
4267
4268	/*
4269	 * Calculate the deflated space for the pool. This must be done before
4270	 * we write anything to the pool because we'd need to update the space
4271	 * accounting using the deflated sizes.
4272	 */
4273	spa_update_dspace(spa);
4274
4275	/*
4276	 * We have now retrieved all the information we needed to open the
4277	 * pool. If we are importing the pool in read-write mode, a few
4278	 * additional steps must be performed to finish the import.
4279	 */
4280	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
4281	    spa->spa_load_max_txg == UINT64_MAX)) {
4282		uint64_t config_cache_txg = spa->spa_config_txg;
4283
4284		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
4285
4286		/*
4287		 * In case of a checkpoint rewind, log the original txg
4288		 * of the checkpointed uberblock.
4289		 */
4290		if (checkpoint_rewind) {
4291			spa_history_log_internal(spa, "checkpoint rewind",
4292			    NULL, "rewound state to txg=%llu",
4293			    (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
4294		}
4295
4296		/*
4297		 * Traverse the ZIL and claim all blocks.
4298		 */
4299		spa_ld_claim_log_blocks(spa);
4300
4301		/*
4302		 * Kick-off the syncing thread.
4303		 */
4304		spa->spa_sync_on = B_TRUE;
4305		txg_sync_start(spa->spa_dsl_pool);
4306		mmp_thread_start(spa);
4307
4308		/*
4309		 * Wait for all claims to sync.  We sync up to the highest
4310		 * claimed log block birth time so that claimed log blocks
4311		 * don't appear to be from the future.  spa_claim_max_txg
4312		 * will have been set for us by ZIL traversal operations
4313		 * performed above.
4314		 */
4315		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
4316
4317		/*
4318		 * Check if we need to request an update of the config. On the
4319		 * next sync, we would update the config stored in vdev labels
4320		 * and the cachefile (by default /etc/zfs/zpool.cache).
4321		 */
4322		spa_ld_check_for_config_update(spa, config_cache_txg,
4323		    update_config_cache);
4324
4325		/*
4326		 * Check all DTLs to see if anything needs resilvering.
4327		 */
4328		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
4329		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
4330			spa_async_request(spa, SPA_ASYNC_RESILVER);
4331
4332		/*
4333		 * Log the fact that we booted up (so that we can detect if
4334		 * we rebooted in the middle of an operation).
4335		 */
4336		spa_history_log_version(spa, "open");
4337
4338		spa_restart_removal(spa);
4339		spa_spawn_aux_threads(spa);
4340
4341		/*
4342		 * Delete any inconsistent datasets.
4343		 *
4344		 * Note:
4345		 * Since we may be issuing deletes for clones here,
4346		 * we make sure to do so after we've spawned all the
4347		 * auxiliary threads above (one of which is the livelist
4348		 * deletion zthr).
4349		 */
4350		(void) dmu_objset_find(spa_name(spa),
4351		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
4352
4353		/*
4354		 * Clean up any stale temporary dataset userrefs.
4355		 */
4356		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
4357
4358		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4359		vdev_initialize_restart(spa->spa_root_vdev);
4360		vdev_trim_restart(spa->spa_root_vdev);
4361		vdev_autotrim_restart(spa);
4362		spa_config_exit(spa, SCL_CONFIG, FTAG);
4363	}
4364
4365	spa_import_progress_remove(spa);
4366	spa_load_note(spa, "LOADED");
4367
4368	return (0);
4369}
4370
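/*
 * Tear the pool down and retry the load, capping spa_load_max_txg just
 * below the previously selected uberblock so that an earlier txg is chosen
 * on the next attempt.
 */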
4371static int
4372spa_load_retry(spa_t *spa, spa_load_state_t state)
4373{
4374	int mode = spa->spa_mode;
4375
4376	spa_unload(spa);
4377	spa_deactivate(spa);
4378
4379	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
4380
4381	spa_activate(spa, mode);
4382	spa_async_suspend(spa);
4383
4384	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
4385	    (u_longlong_t)spa->spa_load_max_txg);
4386
4387	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
4388}
4389
4390/*
4391 * If spa_load() fails, this function will try loading prior txgs. If
4392 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds, the pool
4393 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER, this
4394 * function will not rewind the pool and will return the same error as
4395 * spa_load().
4396 */
4397static int
4398spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
4399    int rewind_flags)
4400{
4401	nvlist_t *loadinfo = NULL;
4402	nvlist_t *config = NULL;
4403	int load_error, rewind_error;
4404	uint64_t safe_rewind_txg;
4405	uint64_t min_txg;
4406
4407	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
4408		spa->spa_load_max_txg = spa->spa_load_txg;
4409		spa_set_log_state(spa, SPA_LOG_CLEAR);
4410	} else {
4411		spa->spa_load_max_txg = max_request;
4412		if (max_request != UINT64_MAX)
4413			spa->spa_extreme_rewind = B_TRUE;
4414	}
4415
4416	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
4417	if (load_error == 0)
4418		return (0);
4419	if (load_error == ZFS_ERR_NO_CHECKPOINT) {
4420		/*
4421		 * When attempting checkpoint-rewind on a pool with no
4422		 * checkpoint, we should not attempt to load uberblocks
4423		 * from previous txgs when spa_load fails.
4424		 */
4425		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4426		spa_import_progress_remove(spa);
4427		return (load_error);
4428	}
4429
4430	if (spa->spa_root_vdev != NULL)
4431		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4432
4433	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
4434	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
4435
4436	if (rewind_flags & ZPOOL_NEVER_REWIND) {
4437		nvlist_free(config);
4438		spa_import_progress_remove(spa);
4439		return (load_error);
4440	}
4441
4442	if (state == SPA_LOAD_RECOVER) {
4443		/* Price of rolling back is discarding txgs, including log */
4444		spa_set_log_state(spa, SPA_LOG_CLEAR);
4445	} else {
4446		/*
4447		 * If we aren't rolling back, save the load info from our first
4448		 * import attempt so that we can restore it after attempting
4449		 * to rewind.
4450		 */
4451		loadinfo = spa->spa_load_info;
4452		spa->spa_load_info = fnvlist_alloc();
4453	}
4454
4455	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
4456	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
4457	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
4458	    TXG_INITIAL : safe_rewind_txg;
4459
4460	/*
4461	 * Continue as long as we're finding errors, we're still within
4462	 * the acceptable rewind range, and we're still finding uberblocks.
4463	 */
4464	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
4465	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
4466		if (spa->spa_load_max_txg < safe_rewind_txg)
4467			spa->spa_extreme_rewind = B_TRUE;
4468		rewind_error = spa_load_retry(spa, state);
4469	}
4470
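	/*
	 * Done rewinding; reset the rewind state.  Reinstate the config from
	 * the original failed load unless a recovery rewind succeeded.
	 */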
4471	spa->spa_extreme_rewind = B_FALSE;
4472	spa->spa_load_max_txg = UINT64_MAX;
4473
4474	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
4475		spa_config_set(spa, config);
4476	else
4477		nvlist_free(config);
4478
4479	if (state == SPA_LOAD_RECOVER) {
4480		ASSERT3P(loadinfo, ==, NULL);
4481		spa_import_progress_remove(spa);
4482		return (rewind_error);
4483	} else {
4484		/* Store the rewind info as part of the initial load info */
4485		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
4486		    spa->spa_load_info);
4487
4488		/* Restore the initial load info */
4489		fnvlist_free(spa->spa_load_info);
4490		spa->spa_load_info = loadinfo;
4491
4492		spa_import_progress_remove(spa);
4493		return (load_error);
4494	}
4495}
4496
4497/*
4498 * Pool Open/Import
4499 *
4500 * The import case is identical to an open except that the configuration is sent
4501 * down from userland, instead of grabbed from the configuration cache.  For the
4502 * case of an open, the pool configuration will exist in the
4503 * POOL_STATE_UNINITIALIZED state.
4504 *
4505 * The stats information (gen/count/ustats) is used to gather vdev statistics at
4506 * the same time the pool is opened, without having to keep the spa_t around in
4507 * some ambiguous state.
4508 */
4509static int
4510spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
4511    nvlist_t **config)
4512{
4513	spa_t *spa;
4514	spa_load_state_t state = SPA_LOAD_OPEN;
4515	int error;
4516	int locked = B_FALSE;
4517
4518	*spapp = NULL;
4519
4520	/*
4521	 * As disgusting as this is, we need to support recursive calls to this
4522	 * function because dsl_dir_open() is called during spa_load(), and ends
4523	 * up calling spa_open() again.  The real fix is to figure out how to
4524	 * avoid dsl_dir_open() calling this in the first place.
4525	 */
4526	if (mutex_owner(&spa_namespace_lock) != curthread) {
4527		mutex_enter(&spa_namespace_lock);
4528		locked = B_TRUE;
4529	}
4530
4531	if ((spa = spa_lookup(pool)) == NULL) {
4532		if (locked)
4533			mutex_exit(&spa_namespace_lock);
4534		return (SET_ERROR(ENOENT));
4535	}
4536
4537	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
4538		zpool_load_policy_t policy;
4539
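		/*
		 * Determine the load policy (rewind behavior and target txg)
		 * from the caller-supplied policy nvlist, or from the cached
		 * config if no policy was provided.
		 */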
4540		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
4541		    &policy);
4542		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
4543			state = SPA_LOAD_RECOVER;
4544
4545		spa_activate(spa, spa_mode_global);
4546
4547		if (state != SPA_LOAD_RECOVER)
4548			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4549		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
4550
4551		zfs_dbgmsg("spa_open_common: opening %s", pool);
4552		error = spa_load_best(spa, state, policy.zlp_txg,
4553		    policy.zlp_rewind);
4554
4555		if (error == EBADF) {
4556			/*
4557			 * If vdev_validate() returns failure (indicated by
4558			 * EBADF), one of the vdev labels indicates that the
4559			 * pool has been exported or destroyed.  If
4560			 * this is the case, the config cache is out of sync and
4561			 * we should remove the pool from the namespace.
4562			 */
4563			spa_unload(spa);
4564			spa_deactivate(spa);
4565			spa_write_cachefile(spa, B_TRUE, B_TRUE);
4566			spa_remove(spa);
4567			if (locked)
4568				mutex_exit(&spa_namespace_lock);
4569			return (SET_ERROR(ENOENT));
4570		}
4571
4572		if (error) {
4573			/*
4574			 * We can't open the pool, but we still have useful
4575			 * information: the state of each vdev after the
4576			 * attempted vdev_open().  Return this to the user.
4577			 */
4578			if (config != NULL && spa->spa_config) {
4579				VERIFY(nvlist_dup(spa->spa_config, config,
4580				    KM_SLEEP) == 0);
4581				VERIFY(nvlist_add_nvlist(*config,
4582				    ZPOOL_CONFIG_LOAD_INFO,
4583				    spa->spa_load_info) == 0);
4584			}
4585			spa_unload(spa);
4586			spa_deactivate(spa);
4587			spa->spa_last_open_failed = error;
4588			if (locked)
4589				mutex_exit(&spa_namespace_lock);
4590			*spapp = NULL;
4591			return (error);
4592		}
4593	}
4594
4595	spa_open_ref(spa, tag);
4596
4597	if (config != NULL)
4598		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4599
4600	/*
4601	 * If we've recovered the pool, pass back any information we
4602	 * gathered while doing the load.
4603	 */
4604	if (state == SPA_LOAD_RECOVER) {
4605		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
4606		    spa->spa_load_info) == 0);
4607	}
4608
4609	if (locked) {
4610		spa->spa_last_open_failed = 0;
4611		spa->spa_last_ubsync_txg = 0;
4612		spa->spa_load_txg = 0;
4613		mutex_exit(&spa_namespace_lock);
4614	}
4615
4616	*spapp = spa;
4617
4618	return (0);
4619}
4620
4621int
4622spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
4623    nvlist_t **config)
4624{
4625	return (spa_open_common(name, spapp, tag, policy, config));
4626}
4627
4628int
4629spa_open(const char *name, spa_t **spapp, void *tag)
4630{
4631	return (spa_open_common(name, spapp, tag, NULL, NULL));
4632}
4633
4634/*
4635 * Look up the given spa_t, incrementing the inject count in the process,
4636 * preventing it from being exported or destroyed.
4637 */
4638spa_t *
4639spa_inject_addref(char *name)
4640{
4641	spa_t *spa;
4642
4643	mutex_enter(&spa_namespace_lock);
4644	if ((spa = spa_lookup(name)) == NULL) {
4645		mutex_exit(&spa_namespace_lock);
4646		return (NULL);
4647	}
4648	spa->spa_inject_ref++;
4649	mutex_exit(&spa_namespace_lock);
4650
4651	return (spa);
4652}
4653
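/*
 * Drop the reference taken by spa_inject_addref().
 */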
4654void
4655spa_inject_delref(spa_t *spa)
4656{
4657	mutex_enter(&spa_namespace_lock);
4658	spa->spa_inject_ref--;
4659	mutex_exit(&spa_namespace_lock);
4660}
4661
4662/*
4663 * Add spare device information to the nvlist.
4664 */
4665static void
4666spa_add_spares(spa_t *spa, nvlist_t *config)
4667{
4668	nvlist_t **spares;
4669	uint_t i, nspares;
4670	nvlist_t *nvroot;
4671	uint64_t guid;
4672	vdev_stat_t *vs;
4673	uint_t vsc;
4674	uint64_t pool;
4675
4676	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4677
4678	if (spa->spa_spares.sav_count == 0)
4679		return;
4680
4681	VERIFY(nvlist_lookup_nvlist(config,
4682	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4683	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4684	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4685	if (nspares != 0) {
4686		VERIFY(nvlist_add_nvlist_array(nvroot,
4687		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4688		VERIFY(nvlist_lookup_nvlist_array(nvroot,
4689		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4690
4691		/*
4692		 * Go through and find any spares which have since been
4693		 * repurposed as active spares.  If this is the case, update
4694		 * their status appropriately.
4695		 */
4696		for (i = 0; i < nspares; i++) {
4697			VERIFY(nvlist_lookup_uint64(spares[i],
4698			    ZPOOL_CONFIG_GUID, &guid) == 0);
4699			if (spa_spare_exists(guid, &pool, NULL) &&
4700			    pool != 0ULL) {
4701				VERIFY(nvlist_lookup_uint64_array(
4702				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
4703				    (uint64_t **)&vs, &vsc) == 0);
4704				vs->vs_state = VDEV_STATE_CANT_OPEN;
4705				vs->vs_aux = VDEV_AUX_SPARED;
4706			}
4707		}
4708	}
4709}
4710
4711/*
4712 * Add l2cache device information to the nvlist, including vdev stats.
4713 */
4714static void
4715spa_add_l2cache(spa_t *spa, nvlist_t *config)
4716{
4717	nvlist_t **l2cache;
4718	uint_t i, j, nl2cache;
4719	nvlist_t *nvroot;
4720	uint64_t guid;
4721	vdev_t *vd;
4722	vdev_stat_t *vs;
4723	uint_t vsc;
4724
4725	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4726
4727	if (spa->spa_l2cache.sav_count == 0)
4728		return;
4729
4730	VERIFY(nvlist_lookup_nvlist(config,
4731	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4732	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4733	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4734	if (nl2cache != 0) {
4735		VERIFY(nvlist_add_nvlist_array(nvroot,
4736		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4737		VERIFY(nvlist_lookup_nvlist_array(nvroot,
4738		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4739
4740		/*
4741		 * Update level 2 cache device stats.
4742		 */
4743
4744		for (i = 0; i < nl2cache; i++) {
4745			VERIFY(nvlist_lookup_uint64(l2cache[i],
4746			    ZPOOL_CONFIG_GUID, &guid) == 0);
4747
4748			vd = NULL;
4749			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
4750				if (guid ==
4751				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
4752					vd = spa->spa_l2cache.sav_vdevs[j];
4753					break;
4754				}
4755			}
4756			ASSERT(vd != NULL);
4757
4758			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
4759			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
4760			    == 0);
4761			vdev_get_stats(vd, vs);
4762			vdev_config_generate_stats(vd, l2cache[i]);
4763
4764		}
4765	}
4766}
4767
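/*
 * Add the pool's feature reference counts (from both the for-read and
 * for-write feature objects) to the config nvlist as
 * ZPOOL_CONFIG_FEATURE_STATS.
 */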
4768static void
4769spa_add_feature_stats(spa_t *spa, nvlist_t *config)
4770{
4771	nvlist_t *features;
4772	zap_cursor_t zc;
4773	zap_attribute_t za;
4774
4775	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4776	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4777
4778	if (spa->spa_feat_for_read_obj != 0) {
4779		for (zap_cursor_init(&zc, spa->spa_meta_objset,
4780		    spa->spa_feat_for_read_obj);
4781		    zap_cursor_retrieve(&zc, &za) == 0;
4782		    zap_cursor_advance(&zc)) {
4783			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4784			    za.za_num_integers == 1);
4785			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
4786			    za.za_first_integer));
4787		}
4788		zap_cursor_fini(&zc);
4789	}
4790
4791	if (spa->spa_feat_for_write_obj != 0) {
4792		for (zap_cursor_init(&zc, spa->spa_meta_objset,
4793		    spa->spa_feat_for_write_obj);
4794		    zap_cursor_retrieve(&zc, &za) == 0;
4795		    zap_cursor_advance(&zc)) {
4796			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4797			    za.za_num_integers == 1);
4798			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
4799			    za.za_first_integer));
4800		}
4801		zap_cursor_fini(&zc);
4802	}
4803
4804	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
4805	    features) == 0);
4806	nvlist_free(features);
4807}
4808
4809int
4810spa_get_stats(const char *name, nvlist_t **config,
4811    char *altroot, size_t buflen)
4812{
4813	int error;
4814	spa_t *spa;
4815
4816	*config = NULL;
4817	error = spa_open_common(name, &spa, FTAG, NULL, config);
4818
4819	if (spa != NULL) {
4820		/*
4821		 * This still leaves a window of inconsistency where the spares
4822		 * or l2cache devices could change and the config would be
4823		 * self-inconsistent.
4824		 */
4825		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4826
4827		if (*config != NULL) {
4828			uint64_t loadtimes[2];
4829
4830			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
4831			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
4832			VERIFY(nvlist_add_uint64_array(*config,
4833			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
4834
4835			VERIFY(nvlist_add_uint64(*config,
4836			    ZPOOL_CONFIG_ERRCOUNT,
4837			    spa_get_errlog_size(spa)) == 0);
4838
4839			if (spa_suspended(spa)) {
4840				VERIFY(nvlist_add_uint64(*config,
4841				    ZPOOL_CONFIG_SUSPENDED,
4842				    spa->spa_failmode) == 0);
4843				VERIFY(nvlist_add_uint64(*config,
4844				    ZPOOL_CONFIG_SUSPENDED_REASON,
4845				    spa->spa_suspended) == 0);
4846			}
4847
4848			spa_add_spares(spa, *config);
4849			spa_add_l2cache(spa, *config);
4850			spa_add_feature_stats(spa, *config);
4851		}
4852	}
4853
4854	/*
4855	 * We want to get the alternate root even for faulted pools, so we cheat
4856	 * and call spa_lookup() directly.
4857	 */
4858	if (altroot) {
4859		if (spa == NULL) {
4860			mutex_enter(&spa_namespace_lock);
4861			spa = spa_lookup(name);
4862			if (spa)
4863				spa_altroot(spa, altroot, buflen);
4864			else
4865				altroot[0] = '\0';
4866			spa = NULL;
4867			mutex_exit(&spa_namespace_lock);
4868		} else {
4869			spa_altroot(spa, altroot, buflen);
4870		}
4871	}
4872
4873	if (spa != NULL) {
4874		spa_config_exit(spa, SCL_CONFIG, FTAG);
4875		spa_close(spa, FTAG);
4876	}
4877
4878	return (error);
4879}
4880
4881/*
4882 * Validate that the auxiliary device array is well formed.  We must have an
4883 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
4884 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
4885 * specified, as long as they are well-formed.
4886 */
4887static int
4888spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
4889    spa_aux_vdev_t *sav, const char *config, uint64_t version,
4890    vdev_labeltype_t label)
4891{
4892	nvlist_t **dev;
4893	uint_t i, ndev;
4894	vdev_t *vd;
4895	int error;
4896
4897	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4898
4899	/*
4900	 * It's acceptable to have no devs specified.
4901	 */
4902	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
4903		return (0);
4904
4905	if (ndev == 0)
4906		return (SET_ERROR(EINVAL));
4907
4908	/*
4909	 * Make sure the pool is formatted with a version that supports this
4910	 * device type.
4911	 */
4912	if (spa_version(spa) < version)
4913		return (SET_ERROR(ENOTSUP));
4914
4915	/*
4916	 * Set the pending device list so we correctly handle device in-use
4917	 * checking.
4918	 */
4919	sav->sav_pending = dev;
4920	sav->sav_npending = ndev;
4921
4922	for (i = 0; i < ndev; i++) {
4923		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
4924		    mode)) != 0)
4925			goto out;
4926
4927		if (!vd->vdev_ops->vdev_op_leaf) {
4928			vdev_free(vd);
4929			error = SET_ERROR(EINVAL);
4930			goto out;
4931		}
4932
4933		vd->vdev_top = vd;
4934
4935		if ((error = vdev_open(vd)) == 0 &&
4936		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
4937			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
4938			    vd->vdev_guid) == 0);
4939		}
4940
4941		vdev_free(vd);
4942
4943		if (error &&
4944		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
4945			goto out;
4946		else
4947			error = 0;
4948	}
4949
4950out:
4951	sav->sav_pending = NULL;
4952	sav->sav_npending = 0;
4953	return (error);
4954}
4955
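/*
 * Validate both the spare and l2cache device lists supplied in the pool
 * config.
 */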
4956static int
4957spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
4958{
4959	int error;
4960
4961	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4962
4963	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4964	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
4965	    VDEV_LABEL_SPARE)) != 0) {
4966		return (error);
4967	}
4968
4969	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4970	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
4971	    VDEV_LABEL_L2CACHE));
4972}
4973
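/*
 * Merge the given list of aux devices into the existing sav config nvlist,
 * or create a new one if the sav config doesn't exist yet.
 */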
4974static void
4975spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
4976    const char *config)
4977{
4978	int i;
4979
4980	if (sav->sav_config != NULL) {
4981		nvlist_t **olddevs;
4982		uint_t oldndevs;
4983		nvlist_t **newdevs;
4984
4985		/*
4986		 * Generate a new dev list by concatenating with the
4987		 * current dev list.
4988		 */
4989		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
4990		    &olddevs, &oldndevs) == 0);
4991
4992		newdevs = kmem_alloc(sizeof (void *) *
4993		    (ndevs + oldndevs), KM_SLEEP);
4994		for (i = 0; i < oldndevs; i++)
4995			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
4996			    KM_SLEEP) == 0);
4997		for (i = 0; i < ndevs; i++)
4998			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
4999			    KM_SLEEP) == 0);
5000
5001		VERIFY(nvlist_remove(sav->sav_config, config,
5002		    DATA_TYPE_NVLIST_ARRAY) == 0);
5003
5004		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
5005		    config, newdevs, ndevs + oldndevs) == 0);
5006		for (i = 0; i < oldndevs + ndevs; i++)
5007			nvlist_free(newdevs[i]);
5008		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
5009	} else {
5010		/*
5011		 * Generate a new dev list.
5012		 */
5013		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
5014		    KM_SLEEP) == 0);
5015		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
5016		    devs, ndevs) == 0);
5017	}
5018}
5019
5020/*
5021 * Stop and drop level 2 ARC devices
5022 */
5023void
5024spa_l2cache_drop(spa_t *spa)
5025{
5026	vdev_t *vd;
5027	int i;
5028	spa_aux_vdev_t *sav = &spa->spa_l2cache;
5029
5030	for (i = 0; i < sav->sav_count; i++) {
5031		uint64_t pool;
5032
5033		vd = sav->sav_vdevs[i];
5034		ASSERT(vd != NULL);
5035
5036		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
5037		    pool != 0ULL && l2arc_vdev_present(vd))
5038			l2arc_remove_vdev(vd);
5039	}
5040}
5041
5042/*
5043 * Verify encryption parameters for spa creation. If we are encrypting, we must
5044 * have the encryption feature flag enabled.
5045 */
5046static int
5047spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
5048    boolean_t has_encryption)
5049{
5050	if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
5051	    dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
5052	    !has_encryption)
5053		return (SET_ERROR(ENOTSUP));
5054
5055	return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
5056}
5057
5058/*
5059 * Pool Creation
5060 */
5061int
5062spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
5063    nvlist_t *zplprops, dsl_crypto_params_t *dcp)
5064{
5065	spa_t *spa;
5066	char *altroot = NULL;
5067	vdev_t *rvd;
5068	dsl_pool_t *dp;
5069	dmu_tx_t *tx;
5070	int error = 0;
5071	uint64_t txg = TXG_INITIAL;
5072	nvlist_t **spares, **l2cache;
5073	uint_t nspares, nl2cache;
5074	uint64_t version, obj;
5075	boolean_t has_features;
5076	char *poolname;
5077	nvlist_t *nvl;
5078	boolean_t has_encryption;
5079	spa_feature_t feat;
5080	char *feat_name;
5081
5082	if (props == NULL ||
5083	    nvlist_lookup_string(props,
5084	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
5085		poolname = (char *)pool;
5086
5087	/*
5088	 * If this pool already exists, return failure.
5089	 */
5090	mutex_enter(&spa_namespace_lock);
5091	if (spa_lookup(poolname) != NULL) {
5092		mutex_exit(&spa_namespace_lock);
5093		return (SET_ERROR(EEXIST));
5094	}
5095
5096	/*
5097	 * Allocate a new spa_t structure.
5098	 */
5099	nvl = fnvlist_alloc();
5100	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
5101	(void) nvlist_lookup_string(props,
5102	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5103	spa = spa_add(poolname, nvl, altroot);
5104	fnvlist_free(nvl);
5105	spa_activate(spa, spa_mode_global);
5106
5107	if (props && (error = spa_prop_validate(spa, props))) {
5108		spa_deactivate(spa);
5109		spa_remove(spa);
5110		mutex_exit(&spa_namespace_lock);
5111		return (error);
5112	}
5113
5114	/*
5115	 * Temporary pool names should never be written to disk.
5116	 */
5117	if (poolname != pool)
5118		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
5119
5120	has_features = B_FALSE;
5121	has_encryption = B_FALSE;
5122	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
5123	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
5124		if (zpool_prop_feature(nvpair_name(elem))) {
5125			has_features = B_TRUE;
5126			feat_name = strchr(nvpair_name(elem), '@') + 1;
5127			VERIFY0(zfeature_lookup_name(feat_name, &feat));
5128			if (feat == SPA_FEATURE_ENCRYPTION)
5129				has_encryption = B_TRUE;
5130		}
5131	}
5132
5133	/* verify encryption params, if they were provided */
5134	if (dcp != NULL) {
5135		error = spa_create_check_encryption_params(dcp, has_encryption);
5136		if (error != 0) {
5137			spa_deactivate(spa);
5138			spa_remove(spa);
5139			mutex_exit(&spa_namespace_lock);
5140			return (error);
5141		}
5142	}
5143
5144	if (has_features || nvlist_lookup_uint64(props,
5145	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
5146		version = SPA_VERSION;
5147	}
5148	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
5149
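	/*
	 * Initialize the in-core state for the new pool: the first txg, the
	 * initial uberblock, and empty device-removal state.
	 */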
5150	spa->spa_first_txg = txg;
5151	spa->spa_uberblock.ub_txg = txg - 1;
5152	spa->spa_uberblock.ub_version = version;
5153	spa->spa_ubsync = spa->spa_uberblock;
5154	spa->spa_load_state = SPA_LOAD_CREATE;
5155	spa->spa_removing_phys.sr_state = DSS_NONE;
5156	spa->spa_removing_phys.sr_removing_vdev = -1;
5157	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
5158	spa->spa_indirect_vdevs_loaded = B_TRUE;
5159
5160	/*
5161	 * Create "The Godfather" zio to hold all async IOs
5162	 */
5163	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
5164	    KM_SLEEP);
5165	for (int i = 0; i < max_ncpus; i++) {
5166		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
5167		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
5168		    ZIO_FLAG_GODFATHER);
5169	}
5170
5171	/*
5172	 * Create the root vdev.
5173	 */
5174	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5175
5176	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
5177
5178	ASSERT(error != 0 || rvd != NULL);
5179	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
5180
5181	if (error == 0 && !zfs_allocatable_devs(nvroot))
5182		error = SET_ERROR(EINVAL);
5183
5184	if (error == 0 &&
5185	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
5186	    (error = spa_validate_aux(spa, nvroot, txg,
5187	    VDEV_ALLOC_ADD)) == 0) {
5188		/*
5189		 * Instantiate the metaslab groups (this will dirty the vdevs);
5190		 * we can no longer error exit past this point.
5191		 */
5192		for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
5193			vdev_t *vd = rvd->vdev_child[c];
5194
5195			vdev_metaslab_set_size(vd);
5196			vdev_expand(vd, txg);
5197		}
5198	}
5199
5200	spa_config_exit(spa, SCL_ALL, FTAG);
5201
5202	if (error != 0) {
5203		spa_unload(spa);
5204		spa_deactivate(spa);
5205		spa_remove(spa);
5206		mutex_exit(&spa_namespace_lock);
5207		return (error);
5208	}
5209
5210	/*
5211	 * Get the list of spares, if specified.
5212	 */
5213	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5214	    &spares, &nspares) == 0) {
5215		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
5216		    KM_SLEEP) == 0);
5217		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
5218		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
5219		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5220		spa_load_spares(spa);
5221		spa_config_exit(spa, SCL_ALL, FTAG);
5222		spa->spa_spares.sav_sync = B_TRUE;
5223	}
5224
5225	/*
5226	 * Get the list of level 2 cache devices, if specified.
5227	 */
5228	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5229	    &l2cache, &nl2cache) == 0) {
5230		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
5231		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5232		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
5233		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
5234		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5235		spa_load_l2cache(spa);
5236		spa_config_exit(spa, SCL_ALL, FTAG);
5237		spa->spa_l2cache.sav_sync = B_TRUE;
5238	}
5239
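	/*
	 * Create the DSL pool, which in turn creates the MOS and the root
	 * dataset.  spa_is_initializing marks the window in which the pool
	 * is still being constructed.
	 */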
5240	spa->spa_is_initializing = B_TRUE;
5241	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
5242	spa->spa_is_initializing = B_FALSE;
5243
5244	/*
5245	 * Create DDTs (dedup tables).
5246	 */
5247	ddt_create(spa);
5248
5249	spa_update_dspace(spa);
5250
5251	tx = dmu_tx_create_assigned(dp, txg);
5252
5253	/*
5254	 * Create the pool config object.
5255	 */
5256	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
5257	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
5258	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
5259
5260	if (zap_add(spa->spa_meta_objset,
5261	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
5262	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
5263		cmn_err(CE_PANIC, "failed to add pool config");
5264	}
5265
5266	if (zap_add(spa->spa_meta_objset,
5267	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
5268	    sizeof (uint64_t), 1, &version, tx) != 0) {
5269		cmn_err(CE_PANIC, "failed to add pool version");
5270	}
5271
5272	/* Newly created pools with the right version are always deflated. */
5273	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
5274		spa->spa_deflate = TRUE;
5275		if (zap_add(spa->spa_meta_objset,
5276		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
5277		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
5278			cmn_err(CE_PANIC, "failed to add deflate");
5279		}
5280	}
5281
5282	/*
5283	 * Create the deferred-free bpobj.  Turn off compression
5284	 * because sync-to-convergence takes longer if the blocksize
5285	 * keeps changing.
5286	 */
5287	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
5288	dmu_object_set_compress(spa->spa_meta_objset, obj,
5289	    ZIO_COMPRESS_OFF, tx);
5290	if (zap_add(spa->spa_meta_objset,
5291	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
5292	    sizeof (uint64_t), 1, &obj, tx) != 0) {
5293		cmn_err(CE_PANIC, "failed to add bpobj");
5294	}
5295	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
5296	    spa->spa_meta_objset, obj));
5297
5298	/*
5299	 * Create the pool's history object.
5300	 */
5301	if (version >= SPA_VERSION_ZPOOL_HISTORY)
5302		spa_history_create_obj(spa, tx);
5303
5304	/*
5305	 * Generate some random noise for salted checksums to operate on.
5306	 */
5307	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
5308	    sizeof (spa->spa_cksum_salt.zcs_bytes));
5309
5310	/*
5311	 * Set pool properties.
5312	 */
5313	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
5314	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
5315	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
5316	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
5317	spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
5318	spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
5319
5320	if (props != NULL) {
5321		spa_configfile_set(spa, props, B_FALSE);
5322		spa_sync_props(props, tx);
5323	}
5324
5325	dmu_tx_commit(tx);
5326
5327	spa->spa_sync_on = B_TRUE;
5328	txg_sync_start(spa->spa_dsl_pool);
5329	mmp_thread_start(spa);
5330
5331	/*
5332	 * We explicitly wait for the first transaction to complete so that our
5333	 * bean counters are appropriately updated.
5334	 */
5335	txg_wait_synced(spa->spa_dsl_pool, txg);
5336
5337	spa_spawn_aux_threads(spa);
5338
5339	spa_write_cachefile(spa, B_FALSE, B_TRUE);
5340	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
5341
5342	spa_history_log_version(spa, "create");
5343
5344	/*
5345	 * Don't count references from objsets that are already closed
5346	 * and are making their way through the eviction process.
5347	 */
5348	spa_evicting_os_wait(spa);
5349	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
5350	spa->spa_load_state = SPA_LOAD_NONE;
5351
5352	mutex_exit(&spa_namespace_lock);
5353
5354	return (0);
5355}
5356
5357#ifdef _KERNEL
5358/*
5359 * Get the root pool information from the root disk, then import the root pool
5360 * at system boot time.
5361 */
5362static nvlist_t *
5363spa_generate_rootconf(const char *devpath, const char *devid, uint64_t *guid,
5364    uint64_t pool_guid)
5365{
5366	nvlist_t *config;
5367	nvlist_t *nvtop, *nvroot;
5368	uint64_t pgid;
5369
5370	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
5371		return (NULL);
5372
5373	/*
5374	 * Add this top-level vdev to the child array.
5375	 */
5376	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5377	    &nvtop) == 0);
5378	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5379	    &pgid) == 0);
5380	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
5381
5382	if (pool_guid != 0 && pool_guid != pgid) {
5383		/*
5384		 * The boot loader provided a pool GUID, but it does not match
5385		 * the one we found in the label.  Return failure so that we
5386		 * can fall back to the full device scan.
5387		 */
5388		zfs_dbgmsg("spa_generate_rootconf: loader pool guid %llu != "
5389		    "label pool guid %llu", (u_longlong_t)pool_guid,
5390		    (u_longlong_t)pgid);
5391		nvlist_free(config);
5392		return (NULL);
5393	}
5394
5395	/*
5396	 * Put this pool's top-level vdevs into a root vdev.
5397	 */
5398	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5399	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
5400	    VDEV_TYPE_ROOT) == 0);
5401	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
5402	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
5403	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
5404	    &nvtop, 1) == 0);
5405
5406	/*
5407	 * Replace the existing vdev_tree with the new root vdev in
5408	 * this pool's configuration (remove the old, add the new).
5409	 */
5410	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
5411	nvlist_free(nvroot);
5412	return (config);
5413}
5414
5415/*
5416 * Walk the vdev tree and see if we can find a device with "better"
5417 * configuration. A configuration is "better" if the label on that
5418 * device has a more recent txg.
5419 */
5420static void
5421spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
5422{
5423	for (int c = 0; c < vd->vdev_children; c++)
5424		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
5425
5426	if (vd->vdev_ops->vdev_op_leaf) {
5427		nvlist_t *label;
5428		uint64_t label_txg;
5429
5430		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
5431		    &label) != 0)
5432			return;
5433
5434		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
5435		    &label_txg) == 0);
5436
5437		/*
5438		 * Do we have a better boot device?
5439		 */
5440		if (label_txg > *txg) {
5441			*txg = label_txg;
5442			*avd = vd;
5443		}
5444		nvlist_free(label);
5445	}
5446}
5447
/*
 * Import a root pool.
 *
 * For x86, devpath_list will consist of the devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot from.
 *
 * For SPARC, devpath_list consists of the physpath name of the booting
 * device, regardless of whether the root pool is a single-device pool or
 * a mirrored pool, e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 */
5460int
5461spa_import_rootpool(char *devpath, char *devid, uint64_t pool_guid,
5462    uint64_t vdev_guid)
5463{
5464	spa_t *spa;
5465	vdev_t *rvd, *bvd, *avd = NULL;
5466	nvlist_t *config, *nvtop;
5467	uint64_t guid, txg;
5468	char *pname;
5469	int error;
5470	const char *altdevpath = NULL;
5471
5472	/*
5473	 * Read the label from the boot device and generate a configuration.
5474	 */
5475	config = spa_generate_rootconf(devpath, devid, &guid, pool_guid);
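	/*
	 * If the label could not be read and this looks like an iSCSI boot
	 * device, fix up the boot path and retry.
	 */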
5476#if defined(_OBP) && defined(_KERNEL)
5477	if (config == NULL) {
5478		if (strstr(devpath, "/iscsi/ssd") != NULL) {
5479			/* iscsi boot */
5480			get_iscsi_bootpath_phy(devpath);
5481			config = spa_generate_rootconf(devpath, devid, &guid,
5482			    pool_guid);
5483		}
5484	}
5485#endif
5486
5487	/*
5488	 * We were unable to import the pool using the /devices path or devid
5489	 * provided by the boot loader.  This may be the case if the boot
5490	 * device has been connected to a different location in the system, or
5491	 * if a new boot environment has changed the driver used to access the
5492	 * boot device.
5493	 *
5494	 * Attempt an exhaustive scan of all visible block devices to see if we
5495	 * can locate an alternative /devices path with a label that matches
5496	 * the expected pool and vdev GUID.
5497	 */
5498	if (config == NULL && (altdevpath =
5499	    vdev_disk_preroot_lookup(pool_guid, vdev_guid)) != NULL) {
5500		cmn_err(CE_NOTE, "Original /devices path (%s) not available; "
5501		    "ZFS is trying an alternate path (%s)", devpath,
5502		    altdevpath);
5503		config = spa_generate_rootconf(altdevpath, NULL, &guid,
5504		    pool_guid);
5505	}
5506
5507	if (config == NULL) {
5508		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
5509		    devpath);
5510		return (SET_ERROR(EIO));
5511	}
5512
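	/*
	 * Extract the pool name and most recent txg from the label config.
	 */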
5513	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
5514	    &pname) == 0);
5515	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
5516
5517	mutex_enter(&spa_namespace_lock);
5518	if ((spa = spa_lookup(pname)) != NULL) {
5519		/*
5520		 * Remove the existing root pool from the namespace so that we
5521		 * can replace it with the correct config we just read in.
5522		 */
5523		spa_remove(spa);
5524	}
5525
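	/*
	 * Add the root pool to the namespace and flag it for verbatim
	 * import so that the config generated from the label is used as-is.
	 */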
5526	spa = spa_add(pname, config, NULL);
5527	spa->spa_is_root = B_TRUE;
5528	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
5529	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
5530	    &spa->spa_ubsync.ub_version) != 0)
5531		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
5532
5533	/*
5534	 * Build up a vdev tree based on the boot device's label config.
5535	 */
5536	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5537	    &nvtop) == 0);
5538	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5539	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
5540	    VDEV_ALLOC_ROOTPOOL);
5541	spa_config_exit(spa, SCL_ALL, FTAG);
5542	if (error) {
5543		mutex_exit(&spa_namespace_lock);
5544		nvlist_free(config);
5545		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
5546		    pname);
5547		return (error);
5548	}
5549
5550	/*
5551	 * Get the boot vdev.
5552	 */
5553	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
5554		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
5555		    (u_longlong_t)guid);
5556		error = SET_ERROR(ENOENT);
5557		goto out;
5558	}
5559
5560	/*
5561	 * Determine if there is a better boot device.
5562	 */
5563	avd = bvd;
5564	spa_alt_rootvdev(rvd, &avd, &txg);
5565	if (avd != bvd) {
5566		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
5567		    "try booting from '%s'", avd->vdev_path);
5568		error = SET_ERROR(EINVAL);
5569		goto out;
5570	}
5571
5572	/*
5573	 * If the boot device is part of a spare vdev then ensure that
5574	 * we're booting off the active spare.
5575	 */
5576	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
5577	    !bvd->vdev_isspare) {
5578		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
5579		    "try booting from '%s'",
5580		    bvd->vdev_parent->
5581		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
5582		error = SET_ERROR(EINVAL);
5583		goto out;
5584	}
5585
5586	error = 0;
5587out:
5588	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5589	vdev_free(rvd);
5590	spa_config_exit(spa, SCL_ALL, FTAG);
5591	mutex_exit(&spa_namespace_lock);
5592
5593	nvlist_free(config);
5594	return (error);
5595}
5596
5597#endif
5598
5599/*
5600 * Import a non-root pool into the system.
5601 */
5602int
5603spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
5604{
5605	spa_t *spa;
5606	char *altroot = NULL;
5607	spa_load_state_t state = SPA_LOAD_IMPORT;
5608	zpool_load_policy_t policy;
5609	uint64_t mode = spa_mode_global;
5610	uint64_t readonly = B_FALSE;
5611	int error;
5612	nvlist_t *nvroot;
5613	nvlist_t **spares, **l2cache;
5614	uint_t nspares, nl2cache;
5615
5616	/*
5617	 * If a pool with this name exists, return failure.
5618	 */
5619	mutex_enter(&spa_namespace_lock);
5620	if (spa_lookup(pool) != NULL) {
5621		mutex_exit(&spa_namespace_lock);
5622		return (SET_ERROR(EEXIST));
5623	}
5624
5625	/*
5626	 * Create and initialize the spa structure.
5627	 */
5628	(void) nvlist_lookup_string(props,
5629	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5630	(void) nvlist_lookup_uint64(props,
5631	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
5632	if (readonly)
5633		mode = FREAD;
5634	spa = spa_add(pool, config, altroot);
5635	spa->spa_import_flags = flags;
5636
5637	/*
5638	 * Verbatim import - Take a pool and insert it into the namespace
5639	 * as if it had been loaded at boot.
5640	 */
5641	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
5642		if (props != NULL)
5643			spa_configfile_set(spa, props, B_FALSE);
5644
5645		spa_write_cachefile(spa, B_FALSE, B_TRUE);
5646		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5647		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
5648		mutex_exit(&spa_namespace_lock);
5649		return (0);
5650	}
5651
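	/*
	 * Activate the pool in the requested mode (read-only if the
	 * readonly property was specified).
	 */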
5652	spa_activate(spa, mode);
5653
5654	/*
5655	 * Don't start async tasks until we know everything is healthy.
5656	 */
5657	spa_async_suspend(spa);
5658
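	/*
	 * Honor any rewind policy supplied with the config.
	 */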
5659	zpool_get_load_policy(config, &policy);
5660	if (policy.zlp_rewind & ZPOOL_DO_REWIND)
5661		state = SPA_LOAD_RECOVER;
5662
5663	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
5664
5665	if (state != SPA_LOAD_RECOVER) {
5666		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
5667		zfs_dbgmsg("spa_import: importing %s", pool);
5668	} else {
5669		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
5670		    "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
5671	}
5672	error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
5673
	/*
	 * Propagate anything learned while loading the pool back to the
	 * caller (e.g. rewind info, missing devices, etc.).
	 */
5678	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5679	    spa->spa_load_info) == 0);
5680
5681	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	 * Toss any existing spare list, as it is no longer valid and
	 * conflicts with spa_has_spare().
	 */
5686	if (spa->spa_spares.sav_config) {
5687		nvlist_free(spa->spa_spares.sav_config);
5688		spa->spa_spares.sav_config = NULL;
5689		spa_load_spares(spa);
5690	}
5691	if (spa->spa_l2cache.sav_config) {
5692		nvlist_free(spa->spa_l2cache.sav_config);
5693		spa->spa_l2cache.sav_config = NULL;
5694		spa_load_l2cache(spa);
5695	}
5696
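	/*
	 * Validate any hot spares and level 2 cache devices described in
	 * the config's vdev tree.
	 */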
5697	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5698	    &nvroot) == 0);
5699	if (error == 0)
5700		error = spa_validate_aux(spa, nvroot, -1ULL,
5701		    VDEV_ALLOC_SPARE);
5702	if (error == 0)
5703		error = spa_validate_aux(spa, nvroot, -1ULL,
5704		    VDEV_ALLOC_L2CACHE);
5705	spa_config_exit(spa, SCL_ALL, FTAG);
5706
5707	if (props != NULL)
5708		spa_configfile_set(spa, props, B_FALSE);
5709
5710	if (error != 0 || (props && spa_writeable(spa) &&
5711	    (error = spa_prop_set(spa, props)))) {
5712		spa_unload(spa);
5713		spa_deactivate(spa);
5714		spa_remove(spa);
5715		mutex_exit(&spa_namespace_lock);
5716		return (error);
5717	}
5718
5719	spa_async_resume(spa);
5720
5721	/*
5722	 * Override any spares and level 2 cache devices as specified by
5723	 * the user, as these may have correct device names/devids, etc.
5724	 */
5725	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5726	    &spares, &nspares) == 0) {
5727		if (spa->spa_spares.sav_config)
5728			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
5729			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
5730		else
5731			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
5732			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5733		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
5734		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
5735		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5736		spa_load_spares(spa);
5737		spa_config_exit(spa, SCL_ALL, FTAG);
5738		spa->spa_spares.sav_sync = B_TRUE;
5739	}
5740	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5741	    &l2cache, &nl2cache) == 0) {
5742		if (spa->spa_l2cache.sav_config)
5743			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
5744			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
5745		else
5746			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
5747			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5748		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
5749		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
5750		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5751		spa_load_l2cache(spa);
5752		spa_config_exit(spa, SCL_ALL, FTAG);
5753		spa->spa_l2cache.sav_sync = B_TRUE;
5754	}
5755
5756	/*
5757	 * Check for any removed devices.
5758	 */
5759	if (spa->spa_autoreplace) {
5760		spa_aux_check_removed(&spa->spa_spares);
5761		spa_aux_check_removed(&spa->spa_l2cache);
5762	}
5763
5764	if (spa_writeable(spa)) {
5765		/*
5766		 * Update the config cache to include the newly-imported pool.
5767		 */
5768		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5769	}
5770
5771	/*
5772	 * It's possible that the pool was expanded while it was exported.
5773	 * We kick off an async task to handle this for us.
5774	 */
5775	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
5776
5777	spa_history_log_version(spa, "import");
5778
5779	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5780
5781	mutex_exit(&spa_namespace_lock);
5782
5783	return (0);
5784}
5785
5786nvlist_t *
5787spa_tryimport(nvlist_t *tryconfig)
5788{
5789	nvlist_t *config = NULL;
5790	char *poolname, *cachefile;
5791	spa_t *spa;
5792	uint64_t state;
5793	int error;
5794	zpool_load_policy_t policy;
5795
5796	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
5797		return (NULL);
5798
5799	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
5800		return (NULL);
5801
5802	/*
5803	 * Create and initialize the spa structure.
5804	 */
5805	mutex_enter(&spa_namespace_lock);
5806	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
5807	spa_activate(spa, FREAD);
5808
5809	/*
5810	 * Rewind pool if a max txg was provided.
5811	 */
5812	zpool_get_load_policy(spa->spa_config, &policy);
5813	if (policy.zlp_txg != UINT64_MAX) {
5814		spa->spa_load_max_txg = policy.zlp_txg;
5815		spa->spa_extreme_rewind = B_TRUE;
5816		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
5817		    poolname, (longlong_t)policy.zlp_txg);
5818	} else {
5819		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
5820	}
5821
5822	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
5823	    == 0) {
5824		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
5825		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
5826	} else {
5827		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
5828	}
5829
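	/*
	 * Attempt a read-only trial load of the pool using the supplied
	 * config.
	 */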
5830	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
5831
5832	/*
5833	 * If 'tryconfig' was at least parsable, return the current config.
5834	 */
5835	if (spa->spa_root_vdev != NULL) {
5836		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5837		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
5838		    poolname) == 0);
5839		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5840		    state) == 0);
5841		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
5842		    spa->spa_uberblock.ub_timestamp) == 0);
5843		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5844		    spa->spa_load_info) == 0);
5845
5846		/*
5847		 * If the bootfs property exists on this pool then we
5848		 * copy it out so that external consumers can tell which
5849		 * pools are bootable.
5850		 */
5851		if ((!error || error == EEXIST) && spa->spa_bootfs) {
5852			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5853
			/*
			 * We have to rewrite the dataset name since the
			 * pool was opened as TRYIMPORT_NAME rather than
			 * its real name.
			 */
5858			if (dsl_dsobj_to_dsname(spa_name(spa),
5859			    spa->spa_bootfs, tmpname) == 0) {
5860				char *cp;
5861				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5862
5863				cp = strchr(tmpname, '/');
5864				if (cp == NULL) {
5865					(void) strlcpy(dsname, tmpname,
5866					    MAXPATHLEN);
5867				} else {
5868					(void) snprintf(dsname, MAXPATHLEN,
5869					    "%s/%s", poolname, ++cp);
5870				}
5871				VERIFY(nvlist_add_string(config,
5872				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
5873				kmem_free(dsname, MAXPATHLEN);
5874			}
5875			kmem_free(tmpname, MAXPATHLEN);
5876		}
5877
5878		/*
5879		 * Add the list of hot spares and level 2 cache devices.
5880		 */
5881		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5882		spa_add_spares(spa, config);
5883		spa_add_l2cache(spa, config);
5884		spa_config_exit(spa, SCL_CONFIG, FTAG);
5885	}
5886
5887	spa_unload(spa);
5888	spa_deactivate(spa);
5889	spa_remove(spa);
5890	mutex_exit(&spa_namespace_lock);
5891
5892	return (config);
5893}
5894
/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.  If the 'hardforce' flag is
 * set, then we don't sync the labels or remove the configuration cache.
 */
5904static int
5905spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
5906    boolean_t force, boolean_t hardforce)
5907{
5908	spa_t *spa;
5909
5910	if (oldconfig)
5911		*oldconfig = NULL;
5912
5913	if (!(spa_mode_global & FWRITE))
5914		return (SET_ERROR(EROFS));
5915
5916	mutex_enter(&spa_namespace_lock);
5917	if ((spa = spa_lookup(pool)) == NULL) {
5918		mutex_exit(&spa_namespace_lock);
5919		return (SET_ERROR(ENOENT));
5920	}
5921
5922	/*
5923	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
5924	 * reacquire the namespace lock, and see if we can export.
5925	 */
5926	spa_open_ref(spa, FTAG);
5927	mutex_exit(&spa_namespace_lock);
5928	spa_async_suspend(spa);
5929	mutex_enter(&spa_namespace_lock);
5930	spa_close(spa, FTAG);
5931
5932	/*
5933	 * The pool will be in core if it's openable,
5934	 * in which case we can modify its state.
5935	 */
5936	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
5937
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force the pool to sync before checking the
		 * reference count (spa_refcount).
		 */
5942		txg_wait_synced(spa->spa_dsl_pool, 0);
5943		spa_evicting_os_wait(spa);
5944
5945		/*
5946		 * A pool cannot be exported or destroyed if there are active
5947		 * references.  If we are resetting a pool, allow references by
5948		 * fault injection handlers.
5949		 */
5950		if (!spa_refcount_zero(spa) ||
5951		    (spa->spa_inject_ref != 0 &&
5952		    new_state != POOL_STATE_UNINITIALIZED)) {
5953			spa_async_resume(spa);
5954			mutex_exit(&spa_namespace_lock);
5955			return (SET_ERROR(EBUSY));
5956		}
5957
		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools from stealing the active
		 * spare from an exported pool.  The user can still force the
		 * export by setting the 'force' flag.
		 */
5964		if (!force && new_state == POOL_STATE_EXPORTED &&
5965		    spa_has_active_shared_spare(spa)) {
5966			spa_async_resume(spa);
5967			mutex_exit(&spa_namespace_lock);
5968			return (SET_ERROR(EXDEV));
5969		}
5970
		/*
		 * We're about to export or destroy this pool.  Make sure
		 * we stop all initialization and trim activity here before
		 * we set the spa_final_txg.  This will ensure that all
		 * dirty data resulting from the initialization and trim
		 * activity is committed to disk before we unload the pool.
		 */
5978		if (spa->spa_root_vdev != NULL) {
5979			vdev_t *rvd = spa->spa_root_vdev;
5980			vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
5981			vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
5982			vdev_autotrim_stop_all(spa);
5983		}
5984
5985		/*
5986		 * We want this to be reflected on every label,
5987		 * so mark them all dirty.  spa_unload() will do the
5988		 * final sync that pushes these changes out.
5989		 */
5990		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
5991			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5992			spa->spa_state = new_state;
5993			spa->spa_final_txg = spa_last_synced_txg(spa) +
5994			    TXG_DEFER_SIZE + 1;
5995			vdev_config_dirty(spa->spa_root_vdev);
5996			spa_config_exit(spa, SCL_ALL, FTAG);
5997		}
5998	}
5999
6000	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
6001
6002	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6003		spa_unload(spa);
6004		spa_deactivate(spa);
6005	}
6006
6007	if (oldconfig && spa->spa_config)
6008		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
6009
6010	if (new_state != POOL_STATE_UNINITIALIZED) {
6011		if (!hardforce)
6012			spa_write_cachefile(spa, B_TRUE, B_TRUE);
6013		spa_remove(spa);
6014	}
6015	mutex_exit(&spa_namespace_lock);
6016
6017	return (0);
6018}
6019
6020/*
6021 * Destroy a storage pool.
6022 */
6023int
6024spa_destroy(char *pool)
6025{
6026	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
6027	    B_FALSE, B_FALSE));
6028}
6029
6030/*
6031 * Export a storage pool.
6032 */
6033int
6034spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
6035    boolean_t hardforce)
6036{
6037	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
6038	    force, hardforce));
6039}
6040
6041/*
6042 * Similar to spa_export(), this unloads the spa_t without actually removing it
6043 * from the namespace in any way.
6044 */
6045int
6046spa_reset(char *pool)
6047{
6048	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
6049	    B_FALSE, B_FALSE));
6050}
6051
6052/*
6053 * ==========================================================================
6054 * Device manipulation
6055 * ==========================================================================
6056 */
6057
6058/*
6059 * Add a device to a storage pool.
6060 */
6061int
6062spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
6063{
6064	uint64_t txg;
6065	int error;
6066	vdev_t *rvd = spa->spa_root_vdev;
6067	vdev_t *vd, *tvd;
6068	nvlist_t **spares, **l2cache;
6069	uint_t nspares, nl2cache;
6070
6071	ASSERT(spa_writeable(spa));
6072
6073	txg = spa_vdev_enter(spa);
6074
6075	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
6076	    VDEV_ALLOC_ADD)) != 0)
6077		return (spa_vdev_exit(spa, NULL, txg, error));
6078
6079	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
6080
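	/*
	 * Pull out any hot spare and level 2 cache lists from the
	 * requested configuration.
	 */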
6081	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
6082	    &nspares) != 0)
6083		nspares = 0;
6084
6085	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
6086	    &nl2cache) != 0)
6087		nl2cache = 0;
6088
6089	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
6090		return (spa_vdev_exit(spa, vd, txg, EINVAL));
6091
6092	if (vd->vdev_children != 0 &&
6093	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
6094		return (spa_vdev_exit(spa, vd, txg, error));
6095
6096	/*
6097	 * We must validate the spares and l2cache devices after checking the
6098	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
6099	 */
6100	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
6101		return (spa_vdev_exit(spa, vd, txg, error));
6102
	/*
	 * If we are in the middle of a device removal, we can only add
	 * devices that match the existing devices in the pool.  If we are
	 * in the middle of a removal, or have some indirect vdevs, we
	 * cannot add raidz top-level vdevs.
	 */
6109	if (spa->spa_vdev_removal != NULL ||
6110	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
6111		for (int c = 0; c < vd->vdev_children; c++) {
6112			tvd = vd->vdev_child[c];
6113			if (spa->spa_vdev_removal != NULL &&
6114			    tvd->vdev_ashift != spa->spa_max_ashift) {
6115				return (spa_vdev_exit(spa, vd, txg, EINVAL));
6116			}
6117			/* Fail if top level vdev is raidz */
6118			if (tvd->vdev_ops == &vdev_raidz_ops) {
6119				return (spa_vdev_exit(spa, vd, txg, EINVAL));
6120			}
			/*
			 * The top-level mirror must consist of leaf
			 * vdevs only.
			 */
6125			if (tvd->vdev_ops == &vdev_mirror_ops) {
6126				for (uint64_t cid = 0;
6127				    cid < tvd->vdev_children; cid++) {
6128					vdev_t *cvd = tvd->vdev_child[cid];
6129					if (!cvd->vdev_ops->vdev_op_leaf) {
6130						return (spa_vdev_exit(spa, vd,
6131						    txg, EINVAL));
6132					}
6133				}
6134			}
6135		}
6136	}
6137
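	/*
	 * Transfer each new top-level vdev from the temporary parent to
	 * the pool's root vdev and mark its config dirty.
	 */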
6138	for (int c = 0; c < vd->vdev_children; c++) {
6139		tvd = vd->vdev_child[c];
6140		vdev_remove_child(vd, tvd);
6141		tvd->vdev_id = rvd->vdev_children;
6142		vdev_add_child(rvd, tvd);
6143		vdev_config_dirty(tvd);
6144	}
6145
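	/*
	 * Register any new hot spares and level 2 cache devices with the
	 * pool and mark them for syncing.
	 */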
6146	if (nspares != 0) {
6147		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
6148		    ZPOOL_CONFIG_SPARES);
6149		spa_load_spares(spa);
6150		spa->spa_spares.sav_sync = B_TRUE;
6151	}
6152
6153	if (nl2cache != 0) {
6154		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
6155		    ZPOOL_CONFIG_L2CACHE);
6156		spa_load_l2cache(spa);
6157		spa->spa_l2cache.sav_sync = B_TRUE;
6158	}
6159
6160	/*
6161	 * We have to be careful when adding new vdevs to an existing pool.
6162	 * If other threads start allocating from these vdevs before we
6163	 * sync the config cache, and we lose power, then upon reboot we may
6164	 * fail to open the pool because there are DVAs that the config cache
6165	 * can't translate.  Therefore, we first add the vdevs without
6166	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
6167	 * and then let spa_config_update() initialize the new metaslabs.
6168	 *
6169	 * spa_load() checks for added-but-not-initialized vdevs, so that
6170	 * if we lose power at any point in this sequence, the remaining
6171	 * steps will be completed the next time we load the pool.
6172	 */
6173	(void) spa_vdev_exit(spa, vd, txg, 0);
6174
6175	mutex_enter(&spa_namespace_lock);
6176	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
6177	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
6178	mutex_exit(&