1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
25 * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright 2013 Saso Kiselkov. All rights reserved.
28 * Copyright (c) 2014 Integros [integros.com]
29 * Copyright 2016 Toomas Soome <tsoome@me.com>
30 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
31 * Copyright 2019 Joyent, Inc.
32 * Copyright (c) 2017, Intel Corporation.
33 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
34 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
35 */
36
37/*
38 * SPA: Storage Pool Allocator
39 *
40 * This file contains all the routines used when modifying on-disk SPA state.
41 * This includes opening, importing, destroying, exporting a pool, and syncing a
42 * pool.
43 */
44
45#include <sys/zfs_context.h>
46#include <sys/fm/fs/zfs.h>
47#include <sys/spa_impl.h>
48#include <sys/zio.h>
49#include <sys/zio_checksum.h>
50#include <sys/dmu.h>
51#include <sys/dmu_tx.h>
52#include <sys/zap.h>
53#include <sys/zil.h>
54#include <sys/ddt.h>
55#include <sys/vdev_impl.h>
56#include <sys/vdev_removal.h>
57#include <sys/vdev_indirect_mapping.h>
58#include <sys/vdev_indirect_births.h>
59#include <sys/vdev_initialize.h>
60#include <sys/vdev_trim.h>
61#include <sys/metaslab.h>
62#include <sys/metaslab_impl.h>
63#include <sys/mmp.h>
64#include <sys/uberblock_impl.h>
65#include <sys/txg.h>
66#include <sys/avl.h>
67#include <sys/bpobj.h>
68#include <sys/dmu_traverse.h>
69#include <sys/dmu_objset.h>
70#include <sys/unique.h>
71#include <sys/dsl_pool.h>
72#include <sys/dsl_dataset.h>
73#include <sys/dsl_dir.h>
74#include <sys/dsl_prop.h>
75#include <sys/dsl_synctask.h>
76#include <sys/fs/zfs.h>
77#include <sys/arc.h>
78#include <sys/callb.h>
79#include <sys/systeminfo.h>
80#include <sys/spa_boot.h>
81#include <sys/zfs_ioctl.h>
82#include <sys/dsl_scan.h>
83#include <sys/zfeature.h>
84#include <sys/dsl_destroy.h>
85#include <sys/abd.h>
86
87#ifdef	_KERNEL
88#include <sys/bootprops.h>
89#include <sys/callb.h>
90#include <sys/cpupart.h>
91#include <sys/pool.h>
92#include <sys/sysdc.h>
93#include <sys/zone.h>
94#endif	/* _KERNEL */
95
96#include "zfs_prop.h"
97#include "zfs_comutil.h"
98
99/*
100 * The interval, in seconds, at which failed configuration cache file writes
101 * should be retried.
102 */
103int zfs_ccw_retry_interval = 300;
104
105typedef enum zti_modes {
106	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
107	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
108	ZTI_MODE_NULL,			/* don't create a taskq */
109	ZTI_NMODES
110} zti_modes_t;
111
112#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
113#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
114#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
115
116#define	ZTI_N(n)	ZTI_P(n, 1)
117#define	ZTI_ONE		ZTI_N(1)
118
119typedef struct zio_taskq_info {
120	zti_modes_t zti_mode;
121	uint_t zti_value;
122	uint_t zti_count;
123} zio_taskq_info_t;
124
125static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
126	"issue", "issue_high", "intr", "intr_high"
127};
128
129/*
130 * This table defines the taskq settings for each ZFS I/O type. When
131 * initializing a pool, we use this table to create an appropriately sized
132 * taskq. Some operations are low volume and therefore have a small, static
133 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
134 * macros. Other operations process a large amount of data; the ZTI_BATCH
135 * macro causes us to create a taskq oriented for throughput. Some operations
136 * are so high frequency and short-lived that the taskq itself can become a
137 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
138 * additional degree of parallelism specified by the number of threads per-
139 * taskq and the number of taskqs; when dispatching an event in this case, the
140 * particular taskq is chosen at random.
141 *
142 * The different taskq priorities are to handle the different contexts (issue
143 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
144 * need to be handled with minimum delay.
145 */
146const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
147	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
148	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
149	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
150	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
151	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
152	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
153	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
154	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
155};
156
157static void spa_sync_version(void *arg, dmu_tx_t *tx);
158static void spa_sync_props(void *arg, dmu_tx_t *tx);
159static boolean_t spa_has_active_shared_spare(spa_t *spa);
160static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
161static void spa_vdev_resilver_done(spa_t *spa);
162
163uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
164id_t		zio_taskq_psrset_bind = PS_NONE;
165boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
166uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
167
168boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
169extern int	zfs_sync_pass_deferred_free;
170
171/*
172 * Report any spa_load_verify errors found, but do not fail spa_load.
173 * This is used by zdb to analyze non-idle pools.
174 */
175boolean_t	spa_load_verify_dryrun = B_FALSE;
176
177/*
178 * This (illegal) pool name is used when temporarily importing a spa_t in order
179 * to get the vdev stats associated with the imported devices.
180 */
181#define	TRYIMPORT_NAME	"$import"
182
183/*
184 * For debugging purposes: print out vdev tree during pool import.
185 */
186boolean_t	spa_load_print_vdev_tree = B_FALSE;
187
188/*
189 * A non-zero value for zfs_max_missing_tvds means that we allow importing
190 * pools with missing top-level vdevs. This is strictly intended for advanced
191 * pool recovery cases since missing data is almost inevitable. Pools with
192 * missing devices can only be imported read-only for safety reasons, and their
193 * fail-mode will be automatically set to "continue".
194 *
195 * With 1 missing vdev we should be able to import the pool and mount all
196 * datasets. User data that was not modified after the missing device has been
197 * added should be recoverable. This means that snapshots created prior to the
198 * addition of that device should be completely intact.
199 *
200 * With 2 missing vdevs, some datasets may fail to mount since there are
201 * dataset statistics that are stored as regular metadata. Some data might be
202 * recoverable if those vdevs were added recently.
203 *
204 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
205 * may be missing entirely. Chances of data recovery are very low. Note that
206 * there are also risks of performing an inadvertent rewind as we might be
207 * missing all the vdevs with the latest uberblocks.
208 */
209uint64_t	zfs_max_missing_tvds = 0;
210
211/*
212 * The parameters below are similar to zfs_max_missing_tvds but are only
213 * intended for a preliminary open of the pool with an untrusted config which
214 * might be incomplete or out-dated.
215 *
216 * We are more tolerant for pools opened from a cachefile since we could have
217 * an out-dated cachefile where a device removal was not registered.
218 * We could have set the limit arbitrarily high but in the case where devices
219 * are really missing we would want to return the proper error codes; we chose
220 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
221 * and we get a chance to retrieve the trusted config.
222 */
223uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
224
225/*
226 * In the case where config was assembled by scanning device paths (/dev/dsks
227 * by default) we are less tolerant since all the existing devices should have
228 * been detected and we want spa_load to return the right error codes.
229 */
230uint64_t	zfs_max_missing_tvds_scan = 0;
231
232/*
233 * Interval in seconds at which to poll spare vdevs for health.
234 * Setting this to zero disables spare polling.
235 * Set to three hours by default.
236 */
237uint_t		spa_spare_poll_interval_seconds = 60 * 60 * 3;
238
239/*
240 * Debugging aid that pauses spa_sync() towards the end.
241 */
242boolean_t	zfs_pause_spa_sync = B_FALSE;
243
244/*
245 * ==========================================================================
246 * SPA properties routines
247 * ==========================================================================
248 */
249
250/*
251 * Add a (source=src, propname=propval) list to an nvlist.
252 */
253static void
254spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
255    uint64_t intval, zprop_source_t src)
256{
257	const char *propname = zpool_prop_to_name(prop);
258	nvlist_t *propval;
259
260	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
261	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
262
263	if (strval != NULL)
264		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
265	else
266		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
267
268	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
269	nvlist_free(propval);
270}
271
272/*
273 * Get property values from the spa configuration.
274 */
275static void
276spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
277{
278	vdev_t *rvd = spa->spa_root_vdev;
279	dsl_pool_t *pool = spa->spa_dsl_pool;
280	uint64_t size, alloc, cap, version;
281	zprop_source_t src = ZPROP_SRC_NONE;
282	spa_config_dirent_t *dp;
283	metaslab_class_t *mc = spa_normal_class(spa);
284
285	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
286
287	if (rvd != NULL) {
288		alloc = metaslab_class_get_alloc(mc);
289		alloc += metaslab_class_get_alloc(spa_special_class(spa));
290		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
291
292		size = metaslab_class_get_space(mc);
293		size += metaslab_class_get_space(spa_special_class(spa));
294		size += metaslab_class_get_space(spa_dedup_class(spa));
295
296		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
297		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
298		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
299		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
300		    size - alloc, src);
301		spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
302		    spa->spa_checkpoint_info.sci_dspace, src);
303
304		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
305		    metaslab_class_fragmentation(mc), src);
306		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
307		    metaslab_class_expandable_space(mc), src);
308		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
309		    (spa_mode(spa) == FREAD), src);
310
311		cap = (size == 0) ? 0 : (alloc * 100 / size);
312		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
313
314		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
315		    ddt_get_pool_dedup_ratio(spa), src);
316
317		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
318		    rvd->vdev_state, src);
319
320		version = spa_version(spa);
321		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
322			src = ZPROP_SRC_DEFAULT;
323		else
324			src = ZPROP_SRC_LOCAL;
325		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
326	}
327
328	if (pool != NULL) {
329		/*
330		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
331		 * when opening pools before this version freedir will be NULL.
332		 */
333		if (pool->dp_free_dir != NULL) {
334			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
335			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
336			    src);
337		} else {
338			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
339			    NULL, 0, src);
340		}
341
342		if (pool->dp_leak_dir != NULL) {
343			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
344			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
345			    src);
346		} else {
347			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
348			    NULL, 0, src);
349		}
350	}
351
352	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
353
354	if (spa->spa_comment != NULL) {
355		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
356		    0, ZPROP_SRC_LOCAL);
357	}
358
359	if (spa->spa_root != NULL)
360		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
361		    0, ZPROP_SRC_LOCAL);
362
363	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
364		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
365		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
366	} else {
367		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
368		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
369	}
370
371	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
372		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
373		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
374	} else {
375		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
376		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
377	}
378
379	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
380		if (dp->scd_path == NULL) {
381			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
382			    "none", 0, ZPROP_SRC_LOCAL);
383		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
384			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
385			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
386		}
387	}
388}
389
390/*
391 * Get zpool property values.
392 */
393int
394spa_prop_get(spa_t *spa, nvlist_t **nvp)
395{
396	objset_t *mos = spa->spa_meta_objset;
397	zap_cursor_t zc;
398	zap_attribute_t za;
399	int err;
400
401	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
402
403	mutex_enter(&spa->spa_props_lock);
404
405	/*
406	 * Get properties from the spa config.
407	 */
408	spa_prop_get_config(spa, nvp);
409
410	/* If no pool property object, no more prop to get. */
411	if (mos == NULL || spa->spa_pool_props_object == 0) {
412		mutex_exit(&spa->spa_props_lock);
413		return (0);
414	}
415
416	/*
417	 * Get properties from the MOS pool property object.
418	 */
419	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
420	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
421	    zap_cursor_advance(&zc)) {
422		uint64_t intval = 0;
423		char *strval = NULL;
424		zprop_source_t src = ZPROP_SRC_DEFAULT;
425		zpool_prop_t prop;
426
427		if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
428			continue;
429
430		switch (za.za_integer_length) {
431		case 8:
432			/* integer property */
433			if (za.za_first_integer !=
434			    zpool_prop_default_numeric(prop))
435				src = ZPROP_SRC_LOCAL;
436
437			if (prop == ZPOOL_PROP_BOOTFS) {
438				dsl_pool_t *dp;
439				dsl_dataset_t *ds = NULL;
440
441				dp = spa_get_dsl(spa);
442				dsl_pool_config_enter(dp, FTAG);
443				err = dsl_dataset_hold_obj(dp,
444				    za.za_first_integer, FTAG, &ds);
445				if (err != 0) {
446					dsl_pool_config_exit(dp, FTAG);
447					break;
448				}
449
450				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
451				    KM_SLEEP);
452				dsl_dataset_name(ds, strval);
453				dsl_dataset_rele(ds, FTAG);
454				dsl_pool_config_exit(dp, FTAG);
455			} else {
456				strval = NULL;
457				intval = za.za_first_integer;
458			}
459
460			spa_prop_add_list(*nvp, prop, strval, intval, src);
461
462			if (strval != NULL)
463				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
464
465			break;
466
467		case 1:
468			/* string property */
469			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
470			err = zap_lookup(mos, spa->spa_pool_props_object,
471			    za.za_name, 1, za.za_num_integers, strval);
472			if (err) {
473				kmem_free(strval, za.za_num_integers);
474				break;
475			}
476			spa_prop_add_list(*nvp, prop, strval, 0, src);
477			kmem_free(strval, za.za_num_integers);
478			break;
479
480		default:
481			break;
482		}
483	}
484	zap_cursor_fini(&zc);
485	mutex_exit(&spa->spa_props_lock);
486out:
487	if (err && err != ENOENT) {
488		nvlist_free(*nvp);
489		*nvp = NULL;
490		return (err);
491	}
492
493	return (0);
494}
495
496/*
497 * Validate the given pool properties nvlist and modify the list
498 * for the property values to be set.
499 */
500static int
501spa_prop_validate(spa_t *spa, nvlist_t *props)
502{
503	nvpair_t *elem;
504	int error = 0, reset_bootfs = 0;
505	uint64_t objnum = 0;
506	boolean_t has_feature = B_FALSE;
507
508	elem = NULL;
509	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
510		uint64_t intval;
511		char *strval, *slash, *check, *fname;
512		const char *propname = nvpair_name(elem);
513		zpool_prop_t prop = zpool_name_to_prop(propname);
514
515		switch (prop) {
516		case ZPOOL_PROP_INVAL:
517			if (!zpool_prop_feature(propname)) {
518				error = SET_ERROR(EINVAL);
519				break;
520			}
521
522			/*
523			 * Sanitize the input.
524			 */
525			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
526				error = SET_ERROR(EINVAL);
527				break;
528			}
529
530			if (nvpair_value_uint64(elem, &intval) != 0) {
531				error = SET_ERROR(EINVAL);
532				break;
533			}
534
535			if (intval != 0) {
536				error = SET_ERROR(EINVAL);
537				break;
538			}
539
540			fname = strchr(propname, '@') + 1;
541			if (zfeature_lookup_name(fname, NULL) != 0) {
542				error = SET_ERROR(EINVAL);
543				break;
544			}
545
546			has_feature = B_TRUE;
547			break;
548
549		case ZPOOL_PROP_VERSION:
550			error = nvpair_value_uint64(elem, &intval);
551			if (!error &&
552			    (intval < spa_version(spa) ||
553			    intval > SPA_VERSION_BEFORE_FEATURES ||
554			    has_feature))
555				error = SET_ERROR(EINVAL);
556			break;
557
558		case ZPOOL_PROP_DELEGATION:
559		case ZPOOL_PROP_AUTOREPLACE:
560		case ZPOOL_PROP_LISTSNAPS:
561		case ZPOOL_PROP_AUTOEXPAND:
562		case ZPOOL_PROP_AUTOTRIM:
563			error = nvpair_value_uint64(elem, &intval);
564			if (!error && intval > 1)
565				error = SET_ERROR(EINVAL);
566			break;
567
568		case ZPOOL_PROP_MULTIHOST:
569			error = nvpair_value_uint64(elem, &intval);
570			if (!error && intval > 1)
571				error = SET_ERROR(EINVAL);
572
573			if (!error && !spa_get_hostid())
574				error = SET_ERROR(ENOTSUP);
575
576			break;
577
578		case ZPOOL_PROP_BOOTFS:
579			/*
580			 * If the pool version is less than SPA_VERSION_BOOTFS,
581			 * or the pool is still being created (version == 0),
582			 * the bootfs property cannot be set.
583			 */
584			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
585				error = SET_ERROR(ENOTSUP);
586				break;
587			}
588
589			/*
590			 * Make sure the vdev config is bootable
591			 */
592			if (!vdev_is_bootable(spa->spa_root_vdev)) {
593				error = SET_ERROR(ENOTSUP);
594				break;
595			}
596
597			reset_bootfs = 1;
598
599			error = nvpair_value_string(elem, &strval);
600
601			if (!error) {
602				objset_t *os;
603				uint64_t propval;
604
605				if (strval == NULL || strval[0] == '\0') {
606					objnum = zpool_prop_default_numeric(
607					    ZPOOL_PROP_BOOTFS);
608					break;
609				}
610
611				error = dmu_objset_hold(strval, FTAG, &os);
612				if (error != 0)
613					break;
614
615				/*
616				 * Must be ZPL, and its property settings
617				 * must be supported.
618				 */
619
620				if (dmu_objset_type(os) != DMU_OST_ZFS) {
621					error = SET_ERROR(ENOTSUP);
622				} else if ((error =
623				    dsl_prop_get_int_ds(dmu_objset_ds(os),
624				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
625				    &propval)) == 0 &&
626				    !BOOTFS_COMPRESS_VALID(propval)) {
627					error = SET_ERROR(ENOTSUP);
628				} else {
629					objnum = dmu_objset_id(os);
630				}
631				dmu_objset_rele(os, FTAG);
632			}
633			break;
634
635		case ZPOOL_PROP_FAILUREMODE:
636			error = nvpair_value_uint64(elem, &intval);
637			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
638			    intval > ZIO_FAILURE_MODE_PANIC))
639				error = SET_ERROR(EINVAL);
640
641			/*
642			 * This is a special case which only occurs when
643			 * the pool has completely failed. This allows
644			 * the user to change the in-core failmode property
645			 * without syncing it out to disk (I/Os might
646			 * currently be blocked). We do this by returning
647			 * EIO to the caller (spa_prop_set) to trick it
648			 * into thinking we encountered a property validation
649			 * error.
650			 */
651			if (!error && spa_suspended(spa)) {
652				spa->spa_failmode = intval;
653				error = SET_ERROR(EIO);
654			}
655			break;
656
657		case ZPOOL_PROP_CACHEFILE:
658			if ((error = nvpair_value_string(elem, &strval)) != 0)
659				break;
660
661			if (strval[0] == '\0')
662				break;
663
664			if (strcmp(strval, "none") == 0)
665				break;
666
667			if (strval[0] != '/') {
668				error = SET_ERROR(EINVAL);
669				break;
670			}
671
672			slash = strrchr(strval, '/');
673			ASSERT(slash != NULL);
674
675			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
676			    strcmp(slash, "/..") == 0)
677				error = SET_ERROR(EINVAL);
678			break;
679
680		case ZPOOL_PROP_COMMENT:
681			if ((error = nvpair_value_string(elem, &strval)) != 0)
682				break;
683			for (check = strval; *check != '\0'; check++) {
684				/*
685				 * The kernel doesn't have an easy isprint()
686				 * check.  For this kernel check, we merely
687				 * check ASCII apart from DEL.  Fix this if
688				 * there is an easy-to-use kernel isprint().
689				 */
690				if (*check >= 0x7f) {
691					error = SET_ERROR(EINVAL);
692					break;
693				}
694			}
695			if (strlen(strval) > ZPROP_MAX_COMMENT)
696				error = E2BIG;
697			break;
698
699		case ZPOOL_PROP_DEDUPDITTO:
700			if (spa_version(spa) < SPA_VERSION_DEDUP)
701				error = SET_ERROR(ENOTSUP);
702			else
703				error = nvpair_value_uint64(elem, &intval);
704			if (error == 0 &&
705			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
706				error = SET_ERROR(EINVAL);
707			break;
708		}
709
710		if (error)
711			break;
712	}
713
714	if (!error && reset_bootfs) {
715		error = nvlist_remove(props,
716		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
717
718		if (!error) {
719			error = nvlist_add_uint64(props,
720			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
721		}
722	}
723
724	return (error);
725}
726
727void
728spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
729{
730	char *cachefile;
731	spa_config_dirent_t *dp;
732
733	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
734	    &cachefile) != 0)
735		return;
736
737	dp = kmem_alloc(sizeof (spa_config_dirent_t),
738	    KM_SLEEP);
739
740	if (cachefile[0] == '\0')
741		dp->scd_path = spa_strdup(spa_config_path);
742	else if (strcmp(cachefile, "none") == 0)
743		dp->scd_path = NULL;
744	else
745		dp->scd_path = spa_strdup(cachefile);
746
747	list_insert_head(&spa->spa_config_list, dp);
748	if (need_sync)
749		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
750}
751
752int
753spa_prop_set(spa_t *spa, nvlist_t *nvp)
754{
755	int error;
756	nvpair_t *elem = NULL;
757	boolean_t need_sync = B_FALSE;
758
759	if ((error = spa_prop_validate(spa, nvp)) != 0)
760		return (error);
761
762	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
763		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
764
765		if (prop == ZPOOL_PROP_CACHEFILE ||
766		    prop == ZPOOL_PROP_ALTROOT ||
767		    prop == ZPOOL_PROP_READONLY)
768			continue;
769
770		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
771			uint64_t ver;
772
773			if (prop == ZPOOL_PROP_VERSION) {
774				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
775			} else {
776				ASSERT(zpool_prop_feature(nvpair_name(elem)));
777				ver = SPA_VERSION_FEATURES;
778				need_sync = B_TRUE;
779			}
780
781			/* Save time if the version is already set. */
782			if (ver == spa_version(spa))
783				continue;
784
785			/*
786			 * In addition to the pool directory object, we might
787			 * create the pool properties object, the features for
788			 * read object, the features for write object, or the
789			 * feature descriptions object.
790			 */
791			error = dsl_sync_task(spa->spa_name, NULL,
792			    spa_sync_version, &ver,
793			    6, ZFS_SPACE_CHECK_RESERVED);
794			if (error)
795				return (error);
796			continue;
797		}
798
799		need_sync = B_TRUE;
800		break;
801	}
802
803	if (need_sync) {
804		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
805		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
806	}
807
808	return (0);
809}
810
811/*
812 * If the bootfs property value is dsobj, clear it.
813 */
814void
815spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
816{
817	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
818		VERIFY(zap_remove(spa->spa_meta_objset,
819		    spa->spa_pool_props_object,
820		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
821		spa->spa_bootfs = 0;
822	}
823}
824
825/*ARGSUSED*/
826static int
827spa_change_guid_check(void *arg, dmu_tx_t *tx)
828{
829	uint64_t *newguid = arg;
830	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
831	vdev_t *rvd = spa->spa_root_vdev;
832	uint64_t vdev_state;
833
834	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
835		int error = (spa_has_checkpoint(spa)) ?
836		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
837		return (SET_ERROR(error));
838	}
839
840	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
841	vdev_state = rvd->vdev_state;
842	spa_config_exit(spa, SCL_STATE, FTAG);
843
844	if (vdev_state != VDEV_STATE_HEALTHY)
845		return (SET_ERROR(ENXIO));
846
847	ASSERT3U(spa_guid(spa), !=, *newguid);
848
849	return (0);
850}
851
852static void
853spa_change_guid_sync(void *arg, dmu_tx_t *tx)
854{
855	uint64_t *newguid = arg;
856	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
857	uint64_t oldguid;
858	vdev_t *rvd = spa->spa_root_vdev;
859
860	oldguid = spa_guid(spa);
861
862	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
863	rvd->vdev_guid = *newguid;
864	rvd->vdev_guid_sum += (*newguid - oldguid);
865	vdev_config_dirty(rvd);
866	spa_config_exit(spa, SCL_STATE, FTAG);
867
868	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
869	    oldguid, *newguid);
870}
871
872/*
873 * Change the GUID for the pool.  This is done so that we can later
874 * re-import a pool built from a clone of our own vdevs.  We will modify
875 * the root vdev's guid, our own pool guid, and then mark all of our
876 * vdevs dirty.  Note that we must make sure that all our vdevs are
877 * online when we do this, or else any vdevs that weren't present
878 * would be orphaned from our pool.  We are also going to issue a
879 * sysevent to update any watchers.
880 */
881int
882spa_change_guid(spa_t *spa)
883{
884	int error;
885	uint64_t guid;
886
887	mutex_enter(&spa->spa_vdev_top_lock);
888	mutex_enter(&spa_namespace_lock);
889	guid = spa_generate_guid(NULL);
890
891	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
892	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
893
894	if (error == 0) {
895		spa_write_cachefile(spa, B_FALSE, B_TRUE);
896		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
897	}
898
899	mutex_exit(&spa_namespace_lock);
900	mutex_exit(&spa->spa_vdev_top_lock);
901
902	return (error);
903}
904
905/*
906 * ==========================================================================
907 * SPA state manipulation (open/create/destroy/import/export)
908 * ==========================================================================
909 */
910
911static int
912spa_error_entry_compare(const void *a, const void *b)
913{
914	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
915	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
916	int ret;
917
918	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
919	    sizeof (zbookmark_phys_t));
920
921	return (TREE_ISIGN(ret));
922}
923
924/*
925 * Utility function which retrieves copies of the current logs and
926 * re-initializes them in the process.
927 */
928void
929spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
930{
931	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
932
933	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
934	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
935
936	avl_create(&spa->spa_errlist_scrub,
937	    spa_error_entry_compare, sizeof (spa_error_entry_t),
938	    offsetof(spa_error_entry_t, se_avl));
939	avl_create(&spa->spa_errlist_last,
940	    spa_error_entry_compare, sizeof (spa_error_entry_t),
941	    offsetof(spa_error_entry_t, se_avl));
942}
943
944static void
945spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
946{
947	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
948	enum zti_modes mode = ztip->zti_mode;
949	uint_t value = ztip->zti_value;
950	uint_t count = ztip->zti_count;
951	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
952	char name[32];
953	uint_t flags = 0;
954	boolean_t batch = B_FALSE;
955
956	if (mode == ZTI_MODE_NULL) {
957		tqs->stqs_count = 0;
958		tqs->stqs_taskq = NULL;
959		return;
960	}
961
962	ASSERT3U(count, >, 0);
963
964	tqs->stqs_count = count;
965	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
966
967	switch (mode) {
968	case ZTI_MODE_FIXED:
969		ASSERT3U(value, >=, 1);
970		value = MAX(value, 1);
971		break;
972
973	case ZTI_MODE_BATCH:
974		batch = B_TRUE;
975		flags |= TASKQ_THREADS_CPU_PCT;
976		value = zio_taskq_batch_pct;
977		break;
978
979	default:
980		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
981		    "spa_activate()",
982		    zio_type_name[t], zio_taskq_types[q], mode, value);
983		break;
984	}
985
986	for (uint_t i = 0; i < count; i++) {
987		taskq_t *tq;
988
989		if (count > 1) {
990			(void) snprintf(name, sizeof (name), "%s_%s_%u",
991			    zio_type_name[t], zio_taskq_types[q], i);
992		} else {
993			(void) snprintf(name, sizeof (name), "%s_%s",
994			    zio_type_name[t], zio_taskq_types[q]);
995		}
996
997		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
998			if (batch)
999				flags |= TASKQ_DC_BATCH;
1000
1001			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
1002			    spa->spa_proc, zio_taskq_basedc, flags);
1003		} else {
1004			pri_t pri = maxclsyspri;
1005			/*
1006			 * The write issue taskq can be extremely CPU
1007			 * intensive.  Run it at slightly lower priority
1008			 * than the other taskqs.
1009			 */
1010			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
1011				pri--;
1012
1013			tq = taskq_create_proc(name, value, pri, 50,
1014			    INT_MAX, spa->spa_proc, flags);
1015		}
1016
1017		tqs->stqs_taskq[i] = tq;
1018	}
1019}
1020
1021static void
1022spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
1023{
1024	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1025
1026	if (tqs->stqs_taskq == NULL) {
1027		ASSERT0(tqs->stqs_count);
1028		return;
1029	}
1030
1031	for (uint_t i = 0; i < tqs->stqs_count; i++) {
1032		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
1033		taskq_destroy(tqs->stqs_taskq[i]);
1034	}
1035
1036	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
1037	tqs->stqs_taskq = NULL;
1038}
1039
1040/*
1041 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
1042 * Note that a type may have multiple discrete taskqs to avoid lock contention
1043 * on the taskq itself. In that case we choose which taskq at random by using
1044 * the low bits of gethrtime().
1045 */
1046void
1047spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1048    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
1049{
1050	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1051	taskq_t *tq;
1052
1053	ASSERT3P(tqs->stqs_taskq, !=, NULL);
1054	ASSERT3U(tqs->stqs_count, !=, 0);
1055
1056	if (tqs->stqs_count == 1) {
1057		tq = tqs->stqs_taskq[0];
1058	} else {
1059		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
1060	}
1061
1062	taskq_dispatch_ent(tq, func, arg, flags, ent);
1063}
1064
1065static void
1066spa_create_zio_taskqs(spa_t *spa)
1067{
1068	for (int t = 0; t < ZIO_TYPES; t++) {
1069		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1070			spa_taskqs_init(spa, t, q);
1071		}
1072	}
1073}
1074
1075#ifdef _KERNEL
1076static void
1077spa_thread(void *arg)
1078{
1079	callb_cpr_t cprinfo;
1080
1081	spa_t *spa = arg;
1082	user_t *pu = PTOU(curproc);
1083
1084	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1085	    spa->spa_name);
1086
1087	ASSERT(curproc != &p0);
1088	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1089	    "zpool-%s", spa->spa_name);
1090	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1091
1092	/* bind this thread to the requested psrset */
1093	if (zio_taskq_psrset_bind != PS_NONE) {
1094		pool_lock();
1095		mutex_enter(&cpu_lock);
1096		mutex_enter(&pidlock);
1097		mutex_enter(&curproc->p_lock);
1098
1099		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1100		    0, NULL, NULL) == 0)  {
1101			curthread->t_bind_pset = zio_taskq_psrset_bind;
1102		} else {
1103			cmn_err(CE_WARN,
1104			    "Couldn't bind process for zfs pool \"%s\" to "
1105			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1106		}
1107
1108		mutex_exit(&curproc->p_lock);
1109		mutex_exit(&pidlock);
1110		mutex_exit(&cpu_lock);
1111		pool_unlock();
1112	}
1113
1114	if (zio_taskq_sysdc) {
1115		sysdc_thread_enter(curthread, 100, 0);
1116	}
1117
1118	spa->spa_proc = curproc;
1119	spa->spa_did = curthread->t_did;
1120
1121	spa_create_zio_taskqs(spa);
1122
1123	mutex_enter(&spa->spa_proc_lock);
1124	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1125
1126	spa->spa_proc_state = SPA_PROC_ACTIVE;
1127	cv_broadcast(&spa->spa_proc_cv);
1128
1129	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1130	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1131		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1132	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1133
1134	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1135	spa->spa_proc_state = SPA_PROC_GONE;
1136	spa->spa_proc = &p0;
1137	cv_broadcast(&spa->spa_proc_cv);
1138	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
1139
1140	mutex_enter(&curproc->p_lock);
1141	lwp_exit();
1142}
1143#endif
1144
1145/*
1146 * Activate an uninitialized pool.
1147 */
1148static void
1149spa_activate(spa_t *spa, int mode)
1150{
1151	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1152
1153	spa->spa_state = POOL_STATE_ACTIVE;
1154	spa->spa_mode = mode;
1155
1156	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1157	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1158	spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
1159	spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
1160
1161	/* Try to create a covering process */
1162	mutex_enter(&spa->spa_proc_lock);
1163	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1164	ASSERT(spa->spa_proc == &p0);
1165	spa->spa_did = 0;
1166
1167	/* Only create a process if we're going to be around a while. */
1168	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1169		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1170		    NULL, 0) == 0) {
1171			spa->spa_proc_state = SPA_PROC_CREATED;
1172			while (spa->spa_proc_state == SPA_PROC_CREATED) {
1173				cv_wait(&spa->spa_proc_cv,
1174				    &spa->spa_proc_lock);
1175			}
1176			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1177			ASSERT(spa->spa_proc != &p0);
1178			ASSERT(spa->spa_did != 0);
1179		} else {
1180#ifdef _KERNEL
1181			cmn_err(CE_WARN,
1182			    "Couldn't create process for zfs pool \"%s\"\n",
1183			    spa->spa_name);
1184#endif
1185		}
1186	}
1187	mutex_exit(&spa->spa_proc_lock);
1188
1189	/* If we didn't create a process, we need to create our taskqs. */
1190	if (spa->spa_proc == &p0) {
1191		spa_create_zio_taskqs(spa);
1192	}
1193
1194	for (size_t i = 0; i < TXG_SIZE; i++) {
1195		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
1196		    ZIO_FLAG_CANFAIL);
1197	}
1198
1199	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1200	    offsetof(vdev_t, vdev_config_dirty_node));
1201	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1202	    offsetof(objset_t, os_evicting_node));
1203	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1204	    offsetof(vdev_t, vdev_state_dirty_node));
1205
1206	txg_list_create(&spa->spa_vdev_txg_list, spa,
1207	    offsetof(struct vdev, vdev_txg_node));
1208
1209	avl_create(&spa->spa_errlist_scrub,
1210	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1211	    offsetof(spa_error_entry_t, se_avl));
1212	avl_create(&spa->spa_errlist_last,
1213	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1214	    offsetof(spa_error_entry_t, se_avl));
1215
1216	spa_keystore_init(&spa->spa_keystore);
1217
1218	/*
1219	 * The taskq to upgrade datasets in this pool. Currently used by
1220	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
1221	 */
1222	spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus,
1223	    minclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
1224}
1225
1226/*
1227 * Opposite of spa_activate().
1228 */
1229static void
1230spa_deactivate(spa_t *spa)
1231{
1232	ASSERT(spa->spa_sync_on == B_FALSE);
1233	ASSERT(spa->spa_dsl_pool == NULL);
1234	ASSERT(spa->spa_root_vdev == NULL);
1235	ASSERT(spa->spa_async_zio_root == NULL);
1236	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1237
1238	spa_evicting_os_wait(spa);
1239
1240	if (spa->spa_upgrade_taskq) {
1241		taskq_destroy(spa->spa_upgrade_taskq);
1242		spa->spa_upgrade_taskq = NULL;
1243	}
1244
1245	txg_list_destroy(&spa->spa_vdev_txg_list);
1246
1247	list_destroy(&spa->spa_config_dirty_list);
1248	list_destroy(&spa->spa_evicting_os_list);
1249	list_destroy(&spa->spa_state_dirty_list);
1250
1251	for (int t = 0; t < ZIO_TYPES; t++) {
1252		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1253			spa_taskqs_fini(spa, t, q);
1254		}
1255	}
1256
1257	for (size_t i = 0; i < TXG_SIZE; i++) {
1258		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
1259		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
1260		spa->spa_txg_zio[i] = NULL;
1261	}
1262
1263	metaslab_class_destroy(spa->spa_normal_class);
1264	spa->spa_normal_class = NULL;
1265
1266	metaslab_class_destroy(spa->spa_log_class);
1267	spa->spa_log_class = NULL;
1268
1269	metaslab_class_destroy(spa->spa_special_class);
1270	spa->spa_special_class = NULL;
1271
1272	metaslab_class_destroy(spa->spa_dedup_class);
1273	spa->spa_dedup_class = NULL;
1274
1275	/*
1276	 * If this was part of an import or the open otherwise failed, we may
1277	 * still have errors left in the queues.  Empty them just in case.
1278	 */
1279	spa_errlog_drain(spa);
1280	avl_destroy(&spa->spa_errlist_scrub);
1281	avl_destroy(&spa->spa_errlist_last);
1282
1283	spa_keystore_fini(&spa->spa_keystore);
1284
1285	spa->spa_state = POOL_STATE_UNINITIALIZED;
1286
1287	mutex_enter(&spa->spa_proc_lock);
1288	if (spa->spa_proc_state != SPA_PROC_NONE) {
1289		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1290		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1291		cv_broadcast(&spa->spa_proc_cv);
1292		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1293			ASSERT(spa->spa_proc != &p0);
1294			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1295		}
1296		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1297		spa->spa_proc_state = SPA_PROC_NONE;
1298	}
1299	ASSERT(spa->spa_proc == &p0);
1300	mutex_exit(&spa->spa_proc_lock);
1301
1302	/*
1303	 * We want to make sure spa_thread() has actually exited the ZFS
1304	 * module, so that the module can't be unloaded out from underneath
1305	 * it.
1306	 */
1307	if (spa->spa_did != 0) {
1308		thread_join(spa->spa_did);
1309		spa->spa_did = 0;
1310	}
1311}
1312
1313/*
1314 * Verify a pool configuration, and construct the vdev tree appropriately.  This
1315 * will create all the necessary vdevs in the appropriate layout, with each vdev
1316 * in the CLOSED state.  This will prep the pool before open/creation/import.
1317 * All vdev validation is done by the vdev_alloc() routine.
1318 */
1319static int
1320spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1321    uint_t id, int atype)
1322{
1323	nvlist_t **child;
1324	uint_t children;
1325	int error;
1326
1327	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1328		return (error);
1329
1330	if ((*vdp)->vdev_ops->vdev_op_leaf)
1331		return (0);
1332
1333	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1334	    &child, &children);
1335
1336	if (error == ENOENT)
1337		return (0);
1338
1339	if (error) {
1340		vdev_free(*vdp);
1341		*vdp = NULL;
1342		return (SET_ERROR(EINVAL));
1343	}
1344
1345	for (int c = 0; c < children; c++) {
1346		vdev_t *vd;
1347		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1348		    atype)) != 0) {
1349			vdev_free(*vdp);
1350			*vdp = NULL;
1351			return (error);
1352		}
1353	}
1354
1355	ASSERT(*vdp != NULL);
1356
1357	return (0);
1358}
1359
1360static boolean_t
1361spa_should_flush_logs_on_unload(spa_t *spa)
1362{
1363	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
1364		return (B_FALSE);
1365
1366	if (!spa_writeable(spa))
1367		return (B_FALSE);
1368
1369	if (!spa->spa_sync_on)
1370		return (B_FALSE);
1371
1372	if (spa_state(spa) != POOL_STATE_EXPORTED)
1373		return (B_FALSE);
1374
1375	if (zfs_keep_log_spacemaps_at_export)
1376		return (B_FALSE);
1377
1378	return (B_TRUE);
1379}
1380
1381/*
1382 * Opens a transaction that will set the flag that will instruct
1383 * spa_sync to attempt to flush all the metaslabs for that txg.
1384 */
1385static void
1386spa_unload_log_sm_flush_all(spa_t *spa)
1387{
1388	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
1389
1390	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
1391
1392	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
1393	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
1394
1395	dmu_tx_commit(tx);
1396	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
1397}
1398
1399static void
1400spa_unload_log_sm_metadata(spa_t *spa)
1401{
1402	void *cookie = NULL;
1403	spa_log_sm_t *sls;
1404
1405	while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
1406	    &cookie)) != NULL) {
1407		VERIFY0(sls->sls_mscount);
1408		kmem_free(sls, sizeof (spa_log_sm_t));
1409	}
1410
1411	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
1412	    e != NULL; e = list_head(&spa->spa_log_summary)) {
1413		VERIFY0(e->lse_mscount);
1414		list_remove(&spa->spa_log_summary, e);
1415		kmem_free(e, sizeof (log_summary_entry_t));
1416	}
1417
1418	spa->spa_unflushed_stats.sus_nblocks = 0;
1419	spa->spa_unflushed_stats.sus_memused = 0;
1420	spa->spa_unflushed_stats.sus_blocklimit = 0;
1421}
1422
1423/*
1424 * Opposite of spa_load().
1425 */
1426static void
1427spa_unload(spa_t *spa)
1428{
1429	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1430	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
1431
1432	spa_import_progress_remove(spa);
1433	spa_load_note(spa, "UNLOADING");
1434
1435	/*
1436	 * If the log space map feature is enabled and the pool is getting
1437	 * exported (but not destroyed), we want to spend some time flushing
1438	 * as many metaslabs as we can in an attempt to destroy log space
1439	 * maps and save import time.
1440	 */
1441	if (spa_should_flush_logs_on_unload(spa))
1442		spa_unload_log_sm_flush_all(spa);
1443
1444	/*
1445	 * Stop async tasks.
1446	 */
1447	spa_async_suspend(spa);
1448
1449	if (spa->spa_root_vdev) {
1450		vdev_t *root_vdev = spa->spa_root_vdev;
1451		vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
1452		vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
1453		vdev_autotrim_stop_all(spa);
1454	}
1455
1456	/*
1457	 * Stop syncing.
1458	 */
1459	if (spa->spa_sync_on) {
1460		txg_sync_stop(spa->spa_dsl_pool);
1461		spa->spa_sync_on = B_FALSE;
1462	}
1463
1464	/*
1465	 * This ensures that there is no async metaslab prefetching
1466	 * while we attempt to unload the spa.
1467	 */
1468	if (spa->spa_root_vdev != NULL) {
1469		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
1470			vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
1471			if (vc->vdev_mg != NULL)
1472				taskq_wait(vc->vdev_mg->mg_taskq);
1473		}
1474	}
1475
1476	if (spa->spa_mmp.mmp_thread)
1477		mmp_thread_stop(spa);
1478
1479	/*
1480	 * Wait for any outstanding async I/O to complete.
1481	 */
1482	if (spa->spa_async_zio_root != NULL) {
1483		for (int i = 0; i < max_ncpus; i++)
1484			(void) zio_wait(spa->spa_async_zio_root[i]);
1485		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
1486		spa->spa_async_zio_root = NULL;
1487	}
1488
1489	if (spa->spa_vdev_removal != NULL) {
1490		spa_vdev_removal_destroy(spa->spa_vdev_removal);
1491		spa->spa_vdev_removal = NULL;
1492	}
1493
1494	if (spa->spa_condense_zthr != NULL) {
1495		zthr_destroy(spa->spa_condense_zthr);
1496		spa->spa_condense_zthr = NULL;
1497	}
1498
1499	if (spa->spa_checkpoint_discard_zthr != NULL) {
1500		zthr_destroy(spa->spa_checkpoint_discard_zthr);
1501		spa->spa_checkpoint_discard_zthr = NULL;
1502	}
1503
1504	spa_condense_fini(spa);
1505
1506	bpobj_close(&spa->spa_deferred_bpobj);
1507
1508	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1509
1510	/*
1511	 * Close all vdevs.
1512	 */
1513	if (spa->spa_root_vdev)
1514		vdev_free(spa->spa_root_vdev);
1515	ASSERT(spa->spa_root_vdev == NULL);
1516
1517	/*
1518	 * Close the dsl pool.
1519	 */
1520	if (spa->spa_dsl_pool) {
1521		dsl_pool_close(spa->spa_dsl_pool);
1522		spa->spa_dsl_pool = NULL;
1523		spa->spa_meta_objset = NULL;
1524	}
1525
1526	ddt_unload(spa);
1527	spa_unload_log_sm_metadata(spa);
1528
1529	/*
1530	 * Drop and purge level 2 cache
1531	 */
1532	spa_l2cache_drop(spa);
1533
1534	for (int i = 0; i < spa->spa_spares.sav_count; i++)
1535		vdev_free(spa->spa_spares.sav_vdevs[i]);
1536	if (spa->spa_spares.sav_vdevs) {
1537		kmem_free(spa->spa_spares.sav_vdevs,
1538		    spa->spa_spares.sav_count * sizeof (void *));
1539		spa->spa_spares.sav_vdevs = NULL;
1540	}
1541	if (spa->spa_spares.sav_config) {
1542		nvlist_free(spa->spa_spares.sav_config);
1543		spa->spa_spares.sav_config = NULL;
1544	}
1545	spa->spa_spares.sav_count = 0;
1546
1547	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
1548		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1549		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1550	}
1551	if (spa->spa_l2cache.sav_vdevs) {
1552		kmem_free(spa->spa_l2cache.sav_vdevs,
1553		    spa->spa_l2cache.sav_count * sizeof (void *));
1554		spa->spa_l2cache.sav_vdevs = NULL;
1555	}
1556	if (spa->spa_l2cache.sav_config) {
1557		nvlist_free(spa->spa_l2cache.sav_config);
1558		spa->spa_l2cache.sav_config = NULL;
1559	}
1560	spa->spa_l2cache.sav_count = 0;
1561
1562	spa->spa_async_suspended = 0;
1563
1564	spa->spa_indirect_vdevs_loaded = B_FALSE;
1565
1566	if (spa->spa_comment != NULL) {
1567		spa_strfree(spa->spa_comment);
1568		spa->spa_comment = NULL;
1569	}
1570
1571	spa_config_exit(spa, SCL_ALL, spa);
1572}
1573
1574/*
1575 * Load (or re-load) the current list of vdevs describing the active spares for
1576 * this pool.  When this is called, we have some form of basic information in
1577 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
1578 * then re-generate a more complete list including status information.
1579 */
1580void
1581spa_load_spares(spa_t *spa)
1582{
1583	nvlist_t **spares;
1584	uint_t nspares;
1585	int i;
1586	vdev_t *vd, *tvd;
1587
1588#ifndef _KERNEL
1589	/*
1590	 * zdb opens both the current state of the pool and the
1591	 * checkpointed state (if present), with a different spa_t.
1592	 *
1593	 * As spare vdevs are shared among open pools, we skip loading
1594	 * them when we load the checkpointed state of the pool.
1595	 */
1596	if (!spa_writeable(spa))
1597		return;
1598#endif
1599
1600	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1601
1602	/*
1603	 * First, close and free any existing spare vdevs.
1604	 */
1605	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1606		vd = spa->spa_spares.sav_vdevs[i];
1607
1608		/* Undo the call to spa_activate() below */
1609		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1610		    B_FALSE)) != NULL && tvd->vdev_isspare)
1611			spa_spare_remove(tvd);
1612		vdev_close(vd);
1613		vdev_free(vd);
1614	}
1615
1616	if (spa->spa_spares.sav_vdevs)
1617		kmem_free(spa->spa_spares.sav_vdevs,
1618		    spa->spa_spares.sav_count * sizeof (void *));
1619
1620	if (spa->spa_spares.sav_config == NULL)
1621		nspares = 0;
1622	else
1623		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1624		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1625
1626	spa->spa_spares.sav_count = (int)nspares;
1627	spa->spa_spares.sav_vdevs = NULL;
1628
1629	if (nspares == 0)
1630		return;
1631
1632	/*
1633	 * Construct the array of vdevs, opening them to get status in the
1634	 * process.   For each spare, there is potentially two different vdev_t
1635	 * structures associated with it: one in the list of spares (used only
1636	 * for basic validation purposes) and one in the active vdev
1637	 * configuration (if it's spared in).  During this phase we open and
1638	 * validate each vdev on the spare list.  If the vdev also exists in the
1639	 * active configuration, then we also mark this vdev as an active spare.
1640	 */
1641	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1642	    KM_SLEEP);
1643	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1644		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1645		    VDEV_ALLOC_SPARE) == 0);
1646		ASSERT(vd != NULL);
1647
1648		spa->spa_spares.sav_vdevs[i] = vd;
1649
1650		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1651		    B_FALSE)) != NULL) {
1652			if (!tvd->vdev_isspare)
1653				spa_spare_add(tvd);
1654
1655			/*
1656			 * We only mark the spare active if we were successfully
1657			 * able to load the vdev.  Otherwise, importing a pool
1658			 * with a bad active spare would result in strange
1659			 * behavior, because multiple pool would think the spare
1660			 * is actively in use.
1661			 *
1662			 * There is a vulnerability here to an equally bizarre
1663			 * circumstance, where a dead active spare is later
1664			 * brought back to life (onlined or otherwise).  Given
1665			 * the rarity of this scenario, and the extra complexity
1666			 * it adds, we ignore the possibility.
1667			 */
1668			if (!vdev_is_dead(tvd))
1669				spa_spare_activate(tvd);
1670		}
1671
1672		vd->vdev_top = vd;
1673		vd->vdev_aux = &spa->spa_spares;
1674
1675		if (vdev_open(vd) != 0)
1676			continue;
1677
1678		if (vdev_validate_aux(vd) == 0)
1679			spa_spare_add(vd);
1680	}
1681
1682	/*
1683	 * Recompute the stashed list of spares, with status information
1684	 * this time.
1685	 */
1686	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1687	    DATA_TYPE_NVLIST_ARRAY) == 0);
1688
1689	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1690	    KM_SLEEP);
1691	for (i = 0; i < spa->spa_spares.sav_count; i++)
1692		spares[i] = vdev_config_generate(spa,
1693		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1694	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1695	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1696	for (i = 0; i < spa->spa_spares.sav_count; i++)
1697		nvlist_free(spares[i]);
1698	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1699}
1700
1701/*
1702 * Load (or re-load) the current list of vdevs describing the active l2cache for
1703 * this pool.  When this is called, we have some form of basic information in
1704 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1705 * then re-generate a more complete list including status information.
1706 * Devices which are already active have their details maintained, and are
1707 * not re-opened.
1708 */
1709void
1710spa_load_l2cache(spa_t *spa)
1711{
1712	nvlist_t **l2cache;
1713	uint_t nl2cache;
1714	int i, j, oldnvdevs;
1715	uint64_t guid;
1716	vdev_t *vd, **oldvdevs, **newvdevs;
1717	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1718
1719#ifndef _KERNEL
1720	/*
1721	 * zdb opens both the current state of the pool and the
1722	 * checkpointed state (if present), with a different spa_t.
1723	 *
1724	 * As L2 caches are part of the ARC which is shared among open
1725	 * pools, we skip loading them when we load the checkpointed
1726	 * state of the pool.
1727	 */
1728	if (!spa_writeable(spa))
1729		return;
1730#endif
1731
1732	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1733
1734	if (sav->sav_config != NULL) {
1735		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1736		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1737		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1738	} else {
1739		nl2cache = 0;
1740		newvdevs = NULL;
1741	}
1742
1743	oldvdevs = sav->sav_vdevs;
1744	oldnvdevs = sav->sav_count;
1745	sav->sav_vdevs = NULL;
1746	sav->sav_count = 0;
1747
1748	/*
1749	 * Process new nvlist of vdevs.
1750	 */
1751	for (i = 0; i < nl2cache; i++) {
1752		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1753		    &guid) == 0);
1754
1755		newvdevs[i] = NULL;
1756		for (j = 0; j < oldnvdevs; j++) {
1757			vd = oldvdevs[j];
1758			if (vd != NULL && guid == vd->vdev_guid) {
1759				/*
1760				 * Retain previous vdev for add/remove ops.
1761				 */
1762				newvdevs[i] = vd;
1763				oldvdevs[j] = NULL;
1764				break;
1765			}
1766		}
1767
1768		if (newvdevs[i] == NULL) {
1769			/*
1770			 * Create new vdev
1771			 */
1772			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1773			    VDEV_ALLOC_L2CACHE) == 0);
1774			ASSERT(vd != NULL);
1775			newvdevs[i] = vd;
1776
1777			/*
1778			 * Commit this vdev as an l2cache device,
1779			 * even if it fails to open.
1780			 */
1781			spa_l2cache_add(vd);
1782
1783			vd->vdev_top = vd;
1784			vd->vdev_aux = sav;
1785
1786			spa_l2cache_activate(vd);
1787
1788			if (vdev_open(vd) != 0)
1789				continue;
1790
1791			(void) vdev_validate_aux(vd);
1792
1793			if (!vdev_is_dead(vd))
1794				l2arc_add_vdev(spa, vd);
1795		}
1796	}
1797
1798	/*
1799	 * Purge vdevs that were dropped
1800	 */
1801	for (i = 0; i < oldnvdevs; i++) {
1802		uint64_t pool;
1803
1804		vd = oldvdevs[i];
1805		if (vd != NULL) {
1806			ASSERT(vd->vdev_isl2cache);
1807
1808			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1809			    pool != 0ULL && l2arc_vdev_present(vd))
1810				l2arc_remove_vdev(vd);
1811			vdev_clear_stats(vd);
1812			vdev_free(vd);
1813		}
1814	}
1815
1816	if (oldvdevs)
1817		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1818
1819	if (sav->sav_config == NULL)
1820		goto out;
1821
1822	sav->sav_vdevs = newvdevs;
1823	sav->sav_count = (int)nl2cache;
1824
1825	/*
1826	 * Recompute the stashed list of l2cache devices, with status
1827	 * information this time.
1828	 */
1829	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1830	    DATA_TYPE_NVLIST_ARRAY) == 0);
1831
1832	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1833	for (i = 0; i < sav->sav_count; i++)
1834		l2cache[i] = vdev_config_generate(spa,
1835		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1836	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1837	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1838out:
1839	for (i = 0; i < sav->sav_count; i++)
1840		nvlist_free(l2cache[i]);
1841	if (sav->sav_count)
1842		kmem_free(l2cache, sav->sav_count * sizeof (void *));
1843}
1844
1845static int
1846load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1847{
1848	dmu_buf_t *db;
1849	char *packed = NULL;
1850	size_t nvsize = 0;
1851	int error;
1852	*value = NULL;
1853
1854	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
1855	if (error != 0)
1856		return (error);
1857
1858	nvsize = *(uint64_t *)db->db_data;
1859	dmu_buf_rele(db, FTAG);
1860
1861	packed = kmem_alloc(nvsize, KM_SLEEP);
1862	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1863	    DMU_READ_PREFETCH);
1864	if (error == 0)
1865		error = nvlist_unpack(packed, nvsize, value, 0);
1866	kmem_free(packed, nvsize);
1867
1868	return (error);
1869}
1870
1871/*
1872 * Count the concrete top-level vdevs that are not missing and are not logs.
1873 * At every spa_sync we write uberblocks to at least SPA_SYNC_MIN_VDEVS of them.
1874 */
1875static uint64_t
1876spa_healthy_core_tvds(spa_t *spa)
1877{
1878	vdev_t *rvd = spa->spa_root_vdev;
1879	uint64_t tvds = 0;
1880
1881	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
1882		vdev_t *vd = rvd->vdev_child[i];
1883		if (vd->vdev_islog)
1884			continue;
1885		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
1886			tvds++;
1887	}
1888
1889	return (tvds);
1890}
1891
1892/*
1893 * Checks to see if the given vdev could not be opened, in which case we post a
1894 * sysevent to notify the autoreplace code that the device has been removed.
1895 */
1896static void
1897spa_check_removed(vdev_t *vd)
1898{
1899	for (uint64_t c = 0; c < vd->vdev_children; c++)
1900		spa_check_removed(vd->vdev_child[c]);
1901
1902	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1903	    vdev_is_concrete(vd)) {
1904		zfs_post_autoreplace(vd->vdev_spa, vd);
1905		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
1906	}
1907}
1908
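/*
 * Check the top-level vdevs for log devices that failed to open.  On a
 * normal import this builds a ZPOOL_CONFIG_MISSING_DEVICES list for the
 * user and fails with ENXIO; when ZFS_IMPORT_MISSING_LOG is set the ZIL
 * is instead marked to be cleared.
 */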
1909static int
1910spa_check_for_missing_logs(spa_t *spa)
1911{
1912	vdev_t *rvd = spa->spa_root_vdev;
1913
1914	/*
1915	 * If we're doing a normal import, then build up any additional
1916	 * diagnostic information about missing log devices.
1917	 * We'll pass this up to the user for further processing.
1918	 */
1919	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1920		nvlist_t **child, *nv;
1921		uint64_t idx = 0;
1922
1923		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1924		    KM_SLEEP);
1925		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1926
1927		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1928			vdev_t *tvd = rvd->vdev_child[c];
1929
1930			/*
1931			 * We consider a device as missing only if it failed
1932			 * to open (i.e. offline or faulted is not considered
1933			 * as missing).
1934			 */
1935			if (tvd->vdev_islog &&
1936			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1937				child[idx++] = vdev_config_generate(spa, tvd,
1938				    B_FALSE, VDEV_CONFIG_MISSING);
1939			}
1940		}
1941
1942		if (idx > 0) {
1943			fnvlist_add_nvlist_array(nv,
1944			    ZPOOL_CONFIG_CHILDREN, child, idx);
1945			fnvlist_add_nvlist(spa->spa_load_info,
1946			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
1947
1948			for (uint64_t i = 0; i < idx; i++)
1949				nvlist_free(child[i]);
1950		}
1951		nvlist_free(nv);
1952		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
1953
1954		if (idx > 0) {
1955			spa_load_failed(spa, "some log devices are missing");
1956			vdev_dbgmsg_print_tree(rvd, 2);
1957			return (SET_ERROR(ENXIO));
1958		}
1959	} else {
1960		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1961			vdev_t *tvd = rvd->vdev_child[c];
1962
1963			if (tvd->vdev_islog &&
1964			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1965				spa_set_log_state(spa, SPA_LOG_CLEAR);
1966				spa_load_note(spa, "some log devices are "
1967				    "missing, ZIL is dropped.");
1968				vdev_dbgmsg_print_tree(rvd, 2);
1969				break;
1970			}
1971		}
1972	}
1973
1974	return (0);
1975}
1976
1977/*
1978 * Check for missing log devices
1979 */
1980static boolean_t
1981spa_check_logs(spa_t *spa)
1982{
1983	boolean_t rv = B_FALSE;
1984	dsl_pool_t *dp = spa_get_dsl(spa);
1985
1986	switch (spa->spa_log_state) {
1987	case SPA_LOG_MISSING:
1988		/* need to recheck in case slog has been restored */
1989	case SPA_LOG_UNKNOWN:
1990		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1991		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
1992		if (rv)
1993			spa_set_log_state(spa, SPA_LOG_MISSING);
1994		break;
1995	}
1996	return (rv);
1997}
1998
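/*
 * Passivate the metaslab groups of all top-level log vdevs so that no new
 * allocations land on them.  Returns B_TRUE if any slog device was found.
 */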
1999static boolean_t
2000spa_passivate_log(spa_t *spa)
2001{
2002	vdev_t *rvd = spa->spa_root_vdev;
2003	boolean_t slog_found = B_FALSE;
2004
2005	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2006
2007	if (!spa_has_slogs(spa))
2008		return (B_FALSE);
2009
2010	for (int c = 0; c < rvd->vdev_children; c++) {
2011		vdev_t *tvd = rvd->vdev_child[c];
2012		metaslab_group_t *mg = tvd->vdev_mg;
2013
2014		if (tvd->vdev_islog) {
2015			metaslab_group_passivate(mg);
2016			slog_found = B_TRUE;
2017		}
2018	}
2019
2020	return (slog_found);
2021}
2022
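/*
 * Reactivate the metaslab groups of all top-level log vdevs; the inverse of
 * spa_passivate_log().
 */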
2023static void
2024spa_activate_log(spa_t *spa)
2025{
2026	vdev_t *rvd = spa->spa_root_vdev;
2027
2028	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2029
2030	for (int c = 0; c < rvd->vdev_children; c++) {
2031		vdev_t *tvd = rvd->vdev_child[c];
2032		metaslab_group_t *mg = tvd->vdev_mg;
2033
2034		if (tvd->vdev_islog)
2035			metaslab_group_activate(mg);
2036	}
2037}
2038
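/*
 * Reset the ZIL of every dataset in the pool, then wait for the current txg
 * to sync so that the resulting stubby ZIL blocks can be removed by
 * zil_sync().
 */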
2039int
2040spa_reset_logs(spa_t *spa)
2041{
2042	int error;
2043
2044	error = dmu_objset_find(spa_name(spa), zil_reset,
2045	    NULL, DS_FIND_CHILDREN);
2046	if (error == 0) {
2047		/*
2048		 * We successfully offlined the log device, sync out the
2049		 * current txg so that the "stubby" block can be removed
2050		 * by zil_sync().
2051		 */
2052		txg_wait_synced(spa->spa_dsl_pool, 0);
2053	}
2054	return (error);
2055}
2056
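/*
 * Run spa_check_removed() over each auxiliary (spare or l2cache) vdev.
 */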
2057static void
2058spa_aux_check_removed(spa_aux_vdev_t *sav)
2059{
2060	for (int i = 0; i < sav->sav_count; i++)
2061		spa_check_removed(sav->sav_vdevs[i]);
2062}
2063
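/*
 * Track the largest block birth txg seen among claimed ZIL blocks in
 * spa_claim_max_txg.
 */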
2064void
2065spa_claim_notify(zio_t *zio)
2066{
2067	spa_t *spa = zio->io_spa;
2068
2069	if (zio->io_error)
2070		return;
2071
2072	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
2073	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
2074		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
2075	mutex_exit(&spa->spa_props_lock);
2076}
2077
2078typedef struct spa_load_error {
2079	uint64_t	sle_meta_count;
2080	uint64_t	sle_data_count;
2081} spa_load_error_t;
2082
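/*
 * Completion callback for the verification reads issued by
 * spa_load_verify_cb(): classify any error as a metadata or data error and
 * release an in-flight I/O slot.
 */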
2083static void
2084spa_load_verify_done(zio_t *zio)
2085{
2086	blkptr_t *bp = zio->io_bp;
2087	spa_load_error_t *sle = zio->io_private;
2088	dmu_object_type_t type = BP_GET_TYPE(bp);
2089	int error = zio->io_error;
2090	spa_t *spa = zio->io_spa;
2091
2092	abd_free(zio->io_abd);
2093	if (error) {
2094		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
2095		    type != DMU_OT_INTENT_LOG)
2096			atomic_inc_64(&sle->sle_meta_count);
2097		else
2098			atomic_inc_64(&sle->sle_data_count);
2099	}
2100
2101	mutex_enter(&spa->spa_scrub_lock);
2102	spa->spa_load_verify_ios--;
2103	cv_broadcast(&spa->spa_scrub_io_cv);
2104	mutex_exit(&spa->spa_scrub_lock);
2105}
2106
2107/*
2108 * Maximum number of concurrent scrub I/Os to create while verifying
2109 * a pool during import.
2110 */
2111int spa_load_verify_maxinflight = 10000;
2112boolean_t spa_load_verify_metadata = B_TRUE;
2113boolean_t spa_load_verify_data = B_TRUE;
2114
2115/*ARGSUSED*/
2116static int
2117spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
2118    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
2119{
2120	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2121		return (0);
2122	/*
2123	 * Note: normally this routine will not be called if
2124	 * spa_load_verify_metadata is not set.  However, checking it here
2125	 * allows the flag to be cleared manually after the traversal has begun.
2126	 */
2127	if (!spa_load_verify_metadata)
2128		return (0);
2129	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
2130		return (0);
2131
2132	zio_t *rio = arg;
2133	size_t size = BP_GET_PSIZE(bp);
2134
2135	mutex_enter(&spa->spa_scrub_lock);
2136	while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
2137		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2138	spa->spa_load_verify_ios++;
2139	mutex_exit(&spa->spa_scrub_lock);
2140
2141	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
2142	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2143	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2144	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2145	return (0);
2146}
2147
2148/* ARGSUSED */
2149int
2150verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2151{
2152	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2153		return (SET_ERROR(ENAMETOOLONG));
2154
2155	return (0);
2156}
2157
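/*
 * Verify the pool as dictated by the load policy: validate dataset name
 * lengths and, when metadata verification is enabled, traverse the pool
 * counting unreadable metadata and data blocks against the policy's limits.
 * Skipped entirely when the policy forbids rewind.
 */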
2158static int
2159spa_load_verify(spa_t *spa)
2160{
2161	zio_t *rio;
2162	spa_load_error_t sle = { 0 };
2163	zpool_load_policy_t policy;
2164	boolean_t verify_ok = B_FALSE;
2165	int error = 0;
2166
2167	zpool_get_load_policy(spa->spa_config, &policy);
2168
2169	if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
2170		return (0);
2171
2172	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2173	error = dmu_objset_find_dp(spa->spa_dsl_pool,
2174	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2175	    DS_FIND_CHILDREN);
2176	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2177	if (error != 0)
2178		return (error);
2179
2180	rio = zio_root(spa, NULL, &sle,
2181	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2182
2183	if (spa_load_verify_metadata) {
2184		if (spa->spa_extreme_rewind) {
2185			spa_load_note(spa, "performing a complete scan of the "
2186			    "pool since extreme rewind is on. This may take "
2187			    "a very long time.\n  (spa_load_verify_data=%u, "
2188			    "spa_load_verify_metadata=%u)",
2189			    spa_load_verify_data, spa_load_verify_metadata);
2190		}
2191		error = traverse_pool(spa, spa->spa_verify_min_txg,
2192		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
2193		    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
2194	}
2195
2196	(void) zio_wait(rio);
2197
2198	spa->spa_load_meta_errors = sle.sle_meta_count;
2199	spa->spa_load_data_errors = sle.sle_data_count;
2200
2201	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2202		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2203		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2204		    (u_longlong_t)sle.sle_data_count);
2205	}
2206
2207	if (spa_load_verify_dryrun ||
2208	    (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
2209	    sle.sle_data_count <= policy.zlp_maxdata)) {
2210		int64_t loss = 0;
2211
2212		verify_ok = B_TRUE;
2213		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2214		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2215
2216		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2217		VERIFY(nvlist_add_uint64(spa->spa_load_info,
2218		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
2219		VERIFY(nvlist_add_int64(spa->spa_load_info,
2220		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
2221		VERIFY(nvlist_add_uint64(spa->spa_load_info,
2222		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
2223	} else {
2224		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2225	}
2226
2227	if (spa_load_verify_dryrun)
2228		return (0);
2229
2230	if (error) {
2231		if (error != ENXIO && error != EIO)
2232			error = SET_ERROR(EIO);
2233		return (error);
2234	}
2235
2236	return (verify_ok ? 0 : EIO);
2237}
2238
2239/*
2240 * Find a value in the pool props object.
2241 */
2242static void
2243spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2244{
2245	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2246	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2247}
2248
2249/*
2250 * Find a value in the pool directory object.
2251 */
2252static int
2253spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
2254{
2255	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2256	    name, sizeof (uint64_t), 1, val);
2257
2258	if (error != 0 && (error != ENOENT || log_enoent)) {
2259		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2260		    "[error=%d]", name, error);
2261	}
2262
2263	return (error);
2264}
2265
2266static int
2267spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2268{
2269	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2270	return (SET_ERROR(err));
2271}
2272
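/*
 * Start the auxiliary threads required by a writeable pool: the indirect
 * vdev condensing thread and the checkpoint discard zthr.
 */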
2273static void
2274spa_spawn_aux_threads(spa_t *spa)
2275{
2276	ASSERT(spa_writeable(spa));
2277
2278	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2279
2280	spa_start_indirect_condensing_thread(spa);
2281
2282	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
2283	spa->spa_checkpoint_discard_zthr =
2284	    zthr_create(spa_checkpoint_discard_thread_check,
2285	    spa_checkpoint_discard_thread, spa);
2286}
2287
2288/*
2289 * Fix up config after a partly-completed split.  This is done with the
2290 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
2291 * pool have that entry in their config, but only the splitting one contains
2292 * a list of all the guids of the vdevs that are being split off.
2293 *
2294 * This function determines what to do with that list: either rejoin
2295 * all the disks to the pool, or complete the splitting process.  To attempt
2296 * the rejoin, each disk that is offlined is marked online again, and
2297 * we do a reopen() call.  If the vdev label for every disk that was
2298 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2299 * then we call vdev_split() on each disk, and complete the split.
2300 *
2301 * Otherwise we leave the config alone, with all the vdevs in place in
2302 * the original pool.
2303 */
2304static void
2305spa_try_repair(spa_t *spa, nvlist_t *config)
2306{
2307	uint_t extracted;
2308	uint64_t *glist;
2309	uint_t i, gcount;
2310	nvlist_t *nvl;
2311	vdev_t **vd;
2312	boolean_t attempt_reopen;
2313
2314	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2315		return;
2316
2317	/* check that the config is complete */
2318	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2319	    &glist, &gcount) != 0)
2320		return;
2321
2322	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
2323
2324	/* attempt to online all the vdevs & validate */
2325	attempt_reopen = B_TRUE;
2326	for (i = 0; i < gcount; i++) {
2327		if (glist[i] == 0)	/* vdev is hole */
2328			continue;
2329
2330		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2331		if (vd[i] == NULL) {
2332			/*
2333			 * Don't bother attempting to reopen the disks;
2334			 * just do the split.
2335			 */
2336			attempt_reopen = B_FALSE;
2337		} else {
2338			/* attempt to re-online it */
2339			vd[i]->vdev_offline = B_FALSE;
2340		}
2341	}
2342
2343	if (attempt_reopen) {
2344		vdev_reopen(spa->spa_root_vdev);
2345
2346		/* check each device to see what state it's in */
2347		for (extracted = 0, i = 0; i < gcount; i++) {
2348			if (vd[i] != NULL &&
2349			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2350				break;
2351			++extracted;
2352		}
2353	}
2354
2355	/*
2356	 * If every disk has been moved to the new pool, or if we never
2357	 * even attempted to look at them, then we split them off for
2358	 * good.
2359	 */
2360	if (!attempt_reopen || gcount == extracted) {
2361		for (i = 0; i < gcount; i++)
2362			if (vd[i] != NULL)
2363				vdev_split(vd[i]);
2364		vdev_reopen(spa->spa_root_vdev);
2365	}
2366
2367	kmem_free(vd, gcount * sizeof (vdev_t *));
2368}
2369
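/*
 * Load an existing pool via spa_load_impl(), recording the load state and
 * timestamp and posting an ereport if the load fails.
 */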
2370static int
2371spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
2372{
2373	char *ereport = FM_EREPORT_ZFS_POOL;
2374	int error;
2375
2376	spa->spa_load_state = state;
2377	(void) spa_import_progress_set_state(spa, spa_load_state(spa));
2378
2379	gethrestime(&spa->spa_loaded_ts);
2380	error = spa_load_impl(spa, type, &ereport);
2381
2382	/*
2383	 * Don't count references from objsets that are already closed
2384	 * and are making their way through the eviction process.
2385	 */
2386	spa_evicting_os_wait(spa);
2387	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
2388	if (error) {
2389		if (error != EEXIST) {
2390			spa->spa_loaded_ts.tv_sec = 0;
2391			spa->spa_loaded_ts.tv_nsec = 0;
2392		}
2393		if (error != EBADF) {
2394			(void) zfs_ereport_post(ereport, spa,
2395			    NULL, NULL, NULL, 0, 0);
2396		}
2397	}
2398	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2399	spa->spa_ena = 0;
2400
2401	(void) spa_import_progress_set_state(spa, spa_load_state(spa));
2402
2403	return (error);
2404}
2405
2406/*
2407 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
2408 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
2409 * spa's per-vdev ZAP list.
2410 */
2411static uint64_t
2412vdev_count_verify_zaps(vdev_t *vd)
2413{
2414	spa_t *spa = vd->vdev_spa;
2415	uint64_t total = 0;
2416	if (vd->vdev_top_zap != 0) {
2417		total++;
2418		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2419		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
2420	}
2421	if (vd->vdev_leaf_zap != 0) {
2422		total++;
2423		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2424		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
2425	}
2426
2427	for (uint64_t i = 0; i < vd->vdev_children; i++) {
2428		total += vdev_count_verify_zaps(vd->vdev_child[i]);
2429	}
2430
2431	return (total);
2432}
2433
2434/*
2435 * Determine whether the activity check is required.
2436 */
2437static boolean_t
2438spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
2439    nvlist_t *config)
2440{
2441	uint64_t state = 0;
2442	uint64_t hostid = 0;
2443	uint64_t tryconfig_txg = 0;
2444	uint64_t tryconfig_timestamp = 0;
2445	uint16_t tryconfig_mmp_seq = 0;
2446	nvlist_t *nvinfo;
2447
2448	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
2449		nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
2450		(void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
2451		    &tryconfig_txg);
2452		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2453		    &tryconfig_timestamp);
2454		(void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
2455		    &tryconfig_mmp_seq);
2456	}
2457
2458	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
2459
2460	/*
2461	 * Skip the MMP activity check when ZFS_IMPORT_SKIP_MMP is set; this
2462	 * is used by zdb, which is intended to run on potentially active pools.
2463	 */
2464	if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
2465		return (B_FALSE);
2466
2467	/*
2468	 * Skip the activity check when the MMP feature is disabled.
2469	 */
2470	if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
2471		return (B_FALSE);
2472
2473	/*
2474	 * If the tryconfig_ values are nonzero, they are the results of an
2475	 * earlier tryimport.  If they all match the uberblock we just found,
2476	 * then the pool has not changed and we return false so we do not test
2477	 * a second time.
2478	 */
2479	if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
2480	    tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
2481	    tryconfig_mmp_seq && tryconfig_mmp_seq ==
2482	    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
2483		return (B_FALSE);
2484
2485	/*
2486	 * Allow the activity check to be skipped when importing the pool
2487	 * on the same host which last imported it.  Since the hostid from
2488	 * configuration may be stale use the one read from the label.
2489	 */
2490	if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
2491		hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
2492
2493	if (hostid == spa_get_hostid())
2494		return (B_FALSE);
2495
2496	/*
2497	 * Skip the activity test when the pool was cleanly exported.
2498	 */
2499	if (state != POOL_STATE_ACTIVE)
2500		return (B_FALSE);
2501
2502	return (B_TRUE);
2503}
2504
2505/*
2506 * Nanoseconds the activity check must watch for changes on-disk.
2507 */
2508static uint64_t
2509spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
2510{
2511	uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
2512	uint64_t multihost_interval = MSEC2NSEC(
2513	    MMP_INTERVAL_OK(zfs_multihost_interval));
2514	uint64_t import_delay = MAX(NANOSEC, import_intervals *
2515	    multihost_interval);
2516
2517	/*
2518	 * Local tunables determine a minimum duration except for the case
2519	 * where we know when the remote host will suspend the pool if MMP
2520	 * writes do not land.
2521	 *
2522	 * See Big Theory comment at the top of mmp.c for the reasoning behind
2523	 * these cases and times.
2524	 */
2525
2526	ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
2527
2528	if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
2529	    MMP_FAIL_INT(ub) > 0) {
2530
2531		/* MMP on remote host will suspend pool after failed writes */
2532		import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
2533		    MMP_IMPORT_SAFETY_FACTOR / 100;
2534
2535		zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
2536		    "mmp_fails=%llu ub_mmp mmp_interval=%llu "
2537		    "import_intervals=%u", import_delay, MMP_FAIL_INT(ub),
2538		    MMP_INTERVAL(ub), import_intervals);
2539
2540	} else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
2541	    MMP_FAIL_INT(ub) == 0) {
2542
2543		/* MMP on remote host will never suspend pool */
2544		import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
2545		    ub->ub_mmp_delay) * import_intervals);
2546
2547		zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
2548		    "mmp_interval=%llu ub_mmp_delay=%llu "
2549		    "import_intervals=%u", import_delay, MMP_INTERVAL(ub),
2550		    ub->ub_mmp_delay, import_intervals);
2551
2552	} else if (MMP_VALID(ub)) {
2553		/*
2554		 * zfs-0.7 compatibility case
2555		 */
2556
2557		import_delay = MAX(import_delay, (multihost_interval +
2558		    ub->ub_mmp_delay) * import_intervals);
2559
2560		zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
2561		    "import_intervals=%u leaves=%u", import_delay,
2562		    ub->ub_mmp_delay, import_intervals,
2563		    vdev_count_leaves(spa));
2564	} else {
2565		/* Using local tunings is the only reasonable option */
2566		zfs_dbgmsg("pool last imported on non-MMP aware "
2567		    "host using import_delay=%llu multihost_interval=%llu "
2568		    "import_intervals=%u", import_delay, multihost_interval,
2569		    import_intervals);
2570	}
2571
2572	return (import_delay);
2573}
2574
2575/*
2576 * Perform the import activity check.  If the user canceled the import or
2577 * we detected activity then fail.
2578 */
2579static int
2580spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
2581{
2582	uint64_t txg = ub->ub_txg;
2583	uint64_t timestamp = ub->ub_timestamp;
2584	uint64_t mmp_config = ub->ub_mmp_config;
2585	uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
2586	uint64_t import_delay;
2587	hrtime_t import_expire;
2588	nvlist_t *mmp_label = NULL;
2589	vdev_t *rvd = spa->spa_root_vdev;
2590	kcondvar_t cv;
2591	kmutex_t mtx;
2592	int error = 0;
2593
2594	cv_init(&cv, NULL, CV_DEFAULT, NULL);
2595	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
2596	mutex_enter(&mtx);
2597
2598	/*
2599	 * If ZPOOL_CONFIG_MMP_TXG is present, an activity check was performed
2600	 * during the earlier tryimport.  If the txg recorded there is 0, then
2601	 * the pool is known to be active on another host.
2602	 *
2603	 * Otherwise, the pool might be in use on another host.  Check for
2604	 * changes in the uberblocks on disk if necessary.
2605	 */
2606	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
2607		nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
2608		    ZPOOL_CONFIG_LOAD_INFO);
2609
2610		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
2611		    fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
2612			vdev_uberblock_load(rvd, ub, &mmp_label);
2613			error = SET_ERROR(EREMOTEIO);
2614			goto out;
2615		}
2616	}
2617
2618	import_delay = spa_activity_check_duration(spa, ub);
2619
2620	/* Add a small random factor in case of simultaneous imports (0-25%) */
2621	import_delay += import_delay * spa_get_random(250) / 1000;
2622
2623	import_expire = gethrtime() + import_delay;
2624
2625	while (gethrtime() < import_expire) {
2626		(void) spa_import_progress_set_mmp_check(spa,
2627		    NSEC2SEC(import_expire - gethrtime()));
2628
2629		vdev_uberblock_load(rvd, ub, &mmp_label);
2630
2631		if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
2632		    mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
2633			zfs_dbgmsg("multihost activity detected "
2634			    "txg %llu ub_txg  %llu "
2635			    "timestamp %llu ub_timestamp  %llu "
2636			    "mmp_config %#llx ub_mmp_config %#llx",
2637			    txg, ub->ub_txg, timestamp, ub->ub_timestamp,
2638			    mmp_config, ub->ub_mmp_config);
2639
2640			error = SET_ERROR(EREMOTEIO);
2641			break;
2642		}
2643
2644		if (mmp_label) {
2645			nvlist_free(mmp_label);
2646			mmp_label = NULL;
2647		}
2648
2649		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
2650		if (error != -1) {
2651			error = SET_ERROR(EINTR);
2652			break;
2653		}
2654		error = 0;
2655	}
2656
2657out:
2658	mutex_exit(&mtx);
2659	mutex_destroy(&mtx);
2660	cv_destroy(&cv);
2661
2662	/*
2663	 * If the pool is determined to be active, store the status in the
2664	 * spa->spa_load_info nvlist.  If the remote hostname or hostid are
2665	 * available from the configuration read from disk, store them as well.
2666	 * This allows 'zpool import' to generate a more useful message.
2667	 *
2668	 * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
2669	 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
2670	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
2671	 */
2672	if (error == EREMOTEIO) {
2673		char *hostname = "<unknown>";
2674		uint64_t hostid = 0;
2675
2676		if (mmp_label) {
2677			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
2678				hostname = fnvlist_lookup_string(mmp_label,
2679				    ZPOOL_CONFIG_HOSTNAME);
2680				fnvlist_add_string(spa->spa_load_info,
2681				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
2682			}
2683
2684			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
2685				hostid = fnvlist_lookup_uint64(mmp_label,
2686				    ZPOOL_CONFIG_HOSTID);
2687				fnvlist_add_uint64(spa->spa_load_info,
2688				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
2689			}
2690		}
2691
2692		fnvlist_add_uint64(spa->spa_load_info,
2693		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
2694		fnvlist_add_uint64(spa->spa_load_info,
2695		    ZPOOL_CONFIG_MMP_TXG, 0);
2696
2697		error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
2698	}
2699
2700	if (mmp_label)
2701		nvlist_free(mmp_label);
2702
2703	return (error);
2704}
2705
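/*
 * Verify that the hostid stored in the MOS config matches this host's
 * hostid; otherwise fail the load with EBADF since the pool was last
 * accessed by another system.
 */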
2706static int
2707spa_verify_host(spa_t *spa, nvlist_t *mos_config)
2708{
2709	uint64_t hostid;
2710	char *hostname;
2711	uint64_t myhostid = 0;
2712
2713	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
2714	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2715		hostname = fnvlist_lookup_string(mos_config,
2716		    ZPOOL_CONFIG_HOSTNAME);
2717
2718		myhostid = zone_get_hostid(NULL);
2719
2720		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
2721			cmn_err(CE_WARN, "pool '%s' could not be "
2722			    "loaded as it was last accessed by "
2723			    "another system (host: %s hostid: 0x%llx). "
2724			    "See: http://illumos.org/msg/ZFS-8000-EY",
2725			    spa_name(spa), hostname, (u_longlong_t)hostid);
2726			spa_load_failed(spa, "hostid verification failed: pool "
2727			    "last accessed by host: %s (hostid: 0x%llx)",
2728			    hostname, (u_longlong_t)hostid);
2729			return (SET_ERROR(EBADF));
2730		}
2731	}
2732
2733	return (0);
2734}
2735
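/*
 * Parse the config in spa->spa_config: make sure the pool guid is present
 * and the pool isn't already imported, capture the comment, txg and split
 * information, create the "godfather" zios, and build the (still untrusted)
 * vdev tree.
 */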
2736static int
2737spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
2738{
2739	int error = 0;
2740	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
2741	int parse;
2742	vdev_t *rvd;
2743	uint64_t pool_guid;
2744	char *comment;
2745
2746	/*
2747	 * Versioning wasn't explicitly added to the label until later, so if
2748	 * it's not present, treat it as the initial version.
2749	 */
2750	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2751	    &spa->spa_ubsync.ub_version) != 0)
2752		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2753
2754	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
2755		spa_load_failed(spa, "invalid config provided: '%s' missing",
2756		    ZPOOL_CONFIG_POOL_GUID);
2757		return (SET_ERROR(EINVAL));
2758	}
2759
2760	/*
2761	 * If we are doing an import, ensure that the pool is not already
2762	 * imported by checking if its pool guid already exists in the
2763	 * spa namespace.
2764	 *
2765	 * The only case in which we allow an already imported pool to be
2766	 * imported again is when the pool is checkpointed and we want to
2767	 * look at its checkpointed state from userland tools like zdb.
2768	 */
2769#ifdef _KERNEL
2770	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2771	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2772	    spa_guid_exists(pool_guid, 0)) {
2773#else
2774	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2775	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2776	    spa_guid_exists(pool_guid, 0) &&
2777	    !spa_importing_readonly_checkpoint(spa)) {
2778#endif
2779		spa_load_failed(spa, "a pool with guid %llu is already open",
2780		    (u_longlong_t)pool_guid);
2781		return (SET_ERROR(EEXIST));
2782	}
2783
2784	spa->spa_config_guid = pool_guid;
2785
2786	nvlist_free(spa->spa_load_info);
2787	spa->spa_load_info = fnvlist_alloc();
2788
2789	ASSERT(spa->spa_comment == NULL);
2790	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2791		spa->spa_comment = spa_strdup(comment);
2792
2793	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2794	    &spa->spa_config_txg);
2795
2796	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
2797		spa->spa_config_splitting = fnvlist_dup(nvl);
2798
2799	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
2800		spa_load_failed(spa, "invalid config provided: '%s' missing",
2801		    ZPOOL_CONFIG_VDEV_TREE);
2802		return (SET_ERROR(EINVAL));
2803	}
2804
2805	/*
2806	 * Create "The Godfather" zio to hold all async IOs
2807	 */
2808	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
2809	    KM_SLEEP);
2810	for (int i = 0; i < max_ncpus; i++) {
2811		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
2812		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2813		    ZIO_FLAG_GODFATHER);
2814	}
2815
2816	/*
2817	 * Parse the configuration into a vdev tree.  We explicitly set the
2818	 * value that will be returned by spa_version() since parsing the
2819	 * configuration requires knowing the version number.
2820	 */
2821	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2822	parse = (type == SPA_IMPORT_EXISTING ?
2823	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2824	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
2825	spa_config_exit(spa, SCL_ALL, FTAG);
2826
2827	if (error != 0) {
2828		spa_load_failed(spa, "unable to parse config [error=%d]",
2829		    error);
2830		return (error);
2831	}
2832
2833	ASSERT(spa->spa_root_vdev == rvd);
2834	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
2835	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
2836
2837	if (type != SPA_IMPORT_ASSEMBLE) {
2838		ASSERT(spa_guid(spa) == pool_guid);
2839	}
2840
2841	return (0);
2842}
2843
2844/*
2845 * Recursively open all vdevs in the vdev tree. This function is called twice:
2846 * first with the untrusted config, then with the trusted config.
2847 */
2848static int
2849spa_ld_open_vdevs(spa_t *spa)
2850{
2851	int error = 0;
2852
2853	/*
2854	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
2855	 * missing/unopenable for the root vdev to still be considered openable.
2856	 */
2857	if (spa->spa_trust_config) {
2858		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
2859	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
2860		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
2861	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
2862		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
2863	} else {
2864		spa->spa_missing_tvds_allowed = 0;
2865	}
2866
2867	spa->spa_missing_tvds_allowed =
2868	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
2869
2870	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2871	error = vdev_open(spa->spa_root_vdev);
2872	spa_config_exit(spa, SCL_ALL, FTAG);
2873
2874	if (spa->spa_missing_tvds != 0) {
2875		spa_load_note(spa, "vdev tree has %lld missing top-level "
2876		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
2877		if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
2878			/*
2879			 * Although theoretically we could allow users to open
2880			 * incomplete pools in RW mode, we'd need to add a lot
2881			 * of extra logic (e.g. adjust pool space to account
2882			 * for missing vdevs).
2883			 * This limitation also prevents users from accidentally
2884			 * opening the pool in RW mode during data recovery and
2885			 * damaging it further.
2886			 */
2887			spa_load_note(spa, "pools with missing top-level "
2888			    "vdevs can only be opened in read-only mode.");
2889			error = SET_ERROR(ENXIO);
2890		} else {
2891			spa_load_note(spa, "current settings allow for maximum "
2892			    "%lld missing top-level vdevs at this stage.",
2893			    (u_longlong_t)spa->spa_missing_tvds_allowed);
2894		}
2895	}
2896	if (error != 0) {
2897		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
2898		    error);
2899	}
2900	if (spa->spa_missing_tvds != 0 || error != 0)
2901		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
2902
2903	return (error);
2904}
2905
2906/*
2907 * We need to validate the vdev labels against the configuration that
2908 * we have in hand. This function is called twice: first with an untrusted
2909 * config, then with a trusted config. The validation is more strict when the
2910 * config is trusted.
2911 */
2912static int
2913spa_ld_validate_vdevs(spa_t *spa)
2914{
2915	int error = 0;
2916	vdev_t *rvd = spa->spa_root_vdev;
2917
2918	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2919	error = vdev_validate(rvd);
2920	spa_config_exit(spa, SCL_ALL, FTAG);
2921
2922	if (error != 0) {
2923		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
2924		return (error);
2925	}
2926
2927	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
2928		spa_load_failed(spa, "cannot open vdev tree after invalidating "
2929		    "some vdevs");
2930		vdev_dbgmsg_print_tree(rvd, 2);
2931		return (SET_ERROR(ENXIO));
2932	}
2933
2934	return (0);
2935}
2936
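/*
 * Record the selected uberblock in the in-core SPA state and derive the txg
 * bounds used for verification, claiming and the first sync.
 */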
2937static void
2938spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
2939{
2940	spa->spa_state = POOL_STATE_ACTIVE;
2941	spa->spa_ubsync = spa->spa_uberblock;
2942	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2943	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2944	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2945	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2946	spa->spa_claim_max_txg = spa->spa_first_txg;
2947	spa->spa_prev_software_version = ub->ub_software_version;
2948}
2949
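/*
 * Select the best uberblock from the vdev labels (or keep the checkpointed
 * uberblock when importing a checkpoint read-only), perform the MMP
 * activity check if required, and verify that the label's features_for_read
 * are all supported before adopting it.
 */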
2950static int
2951spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
2952{
2953	vdev_t *rvd = spa->spa_root_vdev;
2954	nvlist_t *label;
2955	uberblock_t *ub = &spa->spa_uberblock;
2956	boolean_t activity_check = B_FALSE;
2957
2958	/*
2959	 * If we are opening the checkpointed state of the pool by
2960	 * rewinding to it, at this point we will have written the
2961	 * checkpointed uberblock to the vdev labels, so searching
2962	 * the labels will find the right uberblock.  However, if
2963	 * we are opening the checkpointed state read-only, we have
2964	 * not modified the labels. Therefore, we must ignore the
2965	 * labels and continue using the spa_uberblock that was set
2966	 * by spa_ld_checkpoint_rewind.
2967	 *
2968	 * Note that it would be fine to ignore the labels when
2969	 * rewinding (opening writeable) as well. However, if we
2970	 * crash just after writing the labels, we will end up
2971	 * searching the labels. Doing so in the common case means
2972	 * that this code path gets exercised normally, rather than
2973	 * just in the edge case.
2974	 */
2975	if (ub->ub_checkpoint_txg != 0 &&
2976	    spa_importing_readonly_checkpoint(spa)) {
2977		spa_ld_select_uberblock_done(spa, ub);
2978		return (0);
2979	}
2980
2981	/*
2982	 * Find the best uberblock.
2983	 */
2984	vdev_uberblock_load(rvd, ub, &label);
2985
2986	/*
2987	 * If we weren't able to find a single valid uberblock, return failure.
2988	 */
2989	if (ub->ub_txg == 0) {
2990		nvlist_free(label);
2991		spa_load_failed(spa, "no valid uberblock found");
2992		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2993	}
2994
2995	if (spa->spa_load_max_txg != UINT64_MAX) {
2996		(void) spa_import_progress_set_max_txg(spa,
2997		    (u_longlong_t)spa->spa_load_max_txg);
2998	}
2999	spa_load_note(spa, "using uberblock with txg=%llu",
3000	    (u_longlong_t)ub->ub_txg);
3001
3002	/*
3003	 * For pools which have the multihost property on, determine whether
3004	 * the pool is truly inactive and can be safely imported.  Prevent
3005	 * hosts which don't have a hostid set from importing the pool.
3006	 */
3007	activity_check = spa_activity_check_required(spa, ub, label,
3008	    spa->spa_config);
3009	if (activity_check) {
3010		if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
3011		    spa_get_hostid() == 0) {
3012			nvlist_free(label);
3013			fnvlist_add_uint64(spa->spa_load_info,
3014			    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
3015			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
3016		}
3017
3018		int error = spa_activity_check(spa, ub, spa->spa_config);
3019		if (error) {
3020			nvlist_free(label);
3021			return (error);
3022		}
3023
3024		fnvlist_add_uint64(spa->spa_load_info,
3025		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
3026		fnvlist_add_uint64(spa->spa_load_info,
3027		    ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
3028		fnvlist_add_uint16(spa->spa_load_info,
3029		    ZPOOL_CONFIG_MMP_SEQ,
3030		    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
3031	}
3032
3033	/*
3034	 * If the pool has an unsupported version we can't open it.
3035	 */
3036	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
3037		nvlist_free(label);
3038		spa_load_failed(spa, "version %llu is not supported",
3039		    (u_longlong_t)ub->ub_version);
3040		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
3041	}
3042
3043	if (ub->ub_version >= SPA_VERSION_FEATURES) {
3044		nvlist_t *features;
3045
3046		/*
3047		 * If we weren't able to find what's necessary for reading the
3048		 * MOS in the label, return failure.
3049		 */
3050		if (label == NULL) {
3051			spa_load_failed(spa, "label config unavailable");
3052			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3053			    ENXIO));
3054		}
3055
3056		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
3057		    &features) != 0) {
3058			nvlist_free(label);
3059			spa_load_failed(spa, "invalid label: '%s' missing",
3060			    ZPOOL_CONFIG_FEATURES_FOR_READ);
3061			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3062			    ENXIO));
3063		}
3064
3065		/*
3066		 * Update our in-core representation with the definitive values
3067		 * from the label.
3068		 */
3069		nvlist_free(spa->spa_label_features);
3070		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
3071	}
3072
3073	nvlist_free(label);
3074
3075	/*
3076	 * Look through the entries in the label nvlist's features_for_read.  If
3077	 * there is a feature listed there which we don't understand, then we
3078	 * cannot open the pool.
3079	 */
3080	if (ub->ub_version >= SPA_VERSION_FEATURES) {
3081		nvlist_t *unsup_feat;
3082
3083		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
3084		    0);
3085
3086		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
3087		    NULL); nvp != NULL;
3088		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
3089			if (!zfeature_is_supported(nvpair_name(nvp))) {
3090				VERIFY(nvlist_add_string(unsup_feat,
3091				    nvpair_name(nvp), "") == 0);
3092			}
3093		}
3094
3095		if (!nvlist_empty(unsup_feat)) {
3096			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
3097			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
3098			nvlist_free(unsup_feat);
3099			spa_load_failed(spa, "some features are unsupported");
3100			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
3101			    ENOTSUP));
3102		}
3103
3104		nvlist_free(unsup_feat);
3105	}
3106
3107	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
3108		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3109		spa_try_repair(spa, spa->spa_config);
3110		spa_config_exit(spa, SCL_ALL, FTAG);
3111		nvlist_free(spa->spa_config_splitting);
3112		spa->spa_config_splitting = NULL;
3113	}
3114
3115	/*
3116	 * Initialize internal SPA structures.
3117	 */
3118	spa_ld_select_uberblock_done(spa, ub);
3119
3120	return (0);
3121}
3122
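/*
 * Initialize the DSL pool from the uberblock's root block pointer, which
 * gives us access to the MOS (spa_meta_objset).
 */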
3123static int
3124spa_ld_open_rootbp(spa_t *spa)
3125{
3126	int error = 0;
3127	vdev_t *rvd = spa->spa_root_vdev;
3128
3129	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
3130	if (error != 0) {
3131		spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
3132		    "[error=%d]", error);
3133		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3134	}
3135	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
3136
3137	return (0);
3138}
3139
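/*
 * Retrieve the trusted config from the MOS, graft the vdev paths from the
 * provided (untrusted) config onto it, and replace the untrusted vdev tree
 * with one built from the MOS config.  The new tree is then reopened and
 * revalidated, and we verify that the provided config did not miss too many
 * healthy top-level vdevs.
 */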
3140static int
3141spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
3142    boolean_t reloading)
3143{
3144	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
3145	nvlist_t *nv, *mos_config, *policy;
3146	int error = 0, copy_error;
3147	uint64_t healthy_tvds, healthy_tvds_mos;
3148	uint64_t mos_config_txg;
3149
3150	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
3151	    != 0)
3152		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3153
3154	/*
3155	 * If we're assembling a pool from a split, the config provided is
3156	 * already trusted so there is nothing to do.
3157	 */
3158	if (type == SPA_IMPORT_ASSEMBLE)
3159		return (0);
3160
3161	healthy_tvds = spa_healthy_core_tvds(spa);
3162
3163	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
3164	    != 0) {
3165		spa_load_failed(spa, "unable to retrieve MOS config");
3166		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3167	}
3168
3169	/*
3170	 * If we are doing an open, the pool owner wasn't verified yet, so do
3171	 * the verification here.
3172	 */
3173	if (spa->spa_load_state == SPA_LOAD_OPEN) {
3174		error = spa_verify_host(spa, mos_config);
3175		if (error != 0) {
3176			nvlist_free(mos_config);
3177			return (error);
3178		}
3179	}
3180
3181	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
3182
3183	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3184
3185	/*
3186	 * Build a new vdev tree from the trusted config
3187	 */
3188	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
3189
3190	/*
3191	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
3192	 * obtained by scanning /dev/dsk, then it will have the right vdev
3193	 * paths. We update the trusted MOS config with this information.
3194	 * We first try to copy the paths with vdev_copy_path_strict, which
3195	 * succeeds only when both configs have exactly the same vdev tree.
3196	 * If that fails, we fall back to a more flexible method that has a
3197	 * best effort policy.
3198	 */
3199	copy_error = vdev_copy_path_strict(rvd, mrvd);
3200	if (copy_error != 0 || spa_load_print_vdev_tree) {
3201		spa_load_note(spa, "provided vdev tree:");
3202		vdev_dbgmsg_print_tree(rvd, 2);
3203		spa_load_note(spa, "MOS vdev tree:");
3204		vdev_dbgmsg_print_tree(mrvd, 2);
3205	}
3206	if (copy_error != 0) {
3207		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
3208		    "back to vdev_copy_path_relaxed");
3209		vdev_copy_path_relaxed(rvd, mrvd);
3210	}
3211
3212	vdev_close(rvd);
3213	vdev_free(rvd);
3214	spa->spa_root_vdev = mrvd;
3215	rvd = mrvd;
3216	spa_config_exit(spa, SCL_ALL, FTAG);
3217
3218	/*
3219	 * We will use spa_config if we decide to reload the spa or if spa_load
3220	 * fails and we rewind. We must thus regenerate the config using the
3221	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
3222	 * pass settings on how to load the pool and is not stored in the MOS.
3223	 * We copy it over to our new, trusted config.
3224	 */
3225	mos_config_txg = fnvlist_lookup_uint64(mos_config,
3226	    ZPOOL_CONFIG_POOL_TXG);
3227	nvlist_free(mos_config);
3228	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
3229	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
3230	    &policy) == 0)
3231		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
3232	spa_config_set(spa, mos_config);
3233	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
3234
3235	/*
3236	 * Now that we have the config from the MOS, we should be more strict
3237	 * in checking blkptrs and can make assumptions about the consistency
3238	 * of the vdev tree. spa_trust_config must be set to true before opening
3239	 * vdevs in order for them to be writeable.
3240	 */
3241	spa->spa_trust_config = B_TRUE;
3242
3243	/*
3244	 * Open and validate the new vdev tree
3245	 */
3246	error = spa_ld_open_vdevs(spa);
3247	if (error != 0)
3248		return (error);
3249
3250	error = spa_ld_validate_vdevs(spa);
3251	if (error != 0)
3252		return (error);
3253
3254	if (copy_error != 0 || spa_load_print_vdev_tree) {
3255		spa_load_note(spa, "final vdev tree:");
3256		vdev_dbgmsg_print_tree(rvd, 2);
3257	}
3258
3259	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
3260	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
3261		/*
3262		 * Sanity check to make sure that we are indeed loading the
3263		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
3264		 * in the config provided and they happened to be the only ones
3265		 * to have the latest uberblock, we could involuntarily perform
3266		 * an extreme rewind.
3267		 */
3268		healthy_tvds_mos = spa_healthy_core_tvds(spa);
3269		if (healthy_tvds_mos - healthy_tvds >=
3270		    SPA_SYNC_MIN_VDEVS) {
3271			spa_load_note(spa, "config provided misses too many "
3272			    "top-level vdevs compared to MOS (%lld vs %lld). ",
3273			    (u_longlong_t)healthy_tvds,
3274			    (u_longlong_t)healthy_tvds_mos);
3275			spa_load_note(spa, "vdev tree:");
3276			vdev_dbgmsg_print_tree(rvd, 2);
3277			if (reloading) {
3278				spa_load_failed(spa, "config was already "
3279				    "provided from MOS. Aborting.");
3280				return (spa_vdev_err(rvd,
3281				    VDEV_AUX_CORRUPT_DATA, EIO));
3282			}
3283			spa_load_note(spa, "spa must be reloaded using MOS "
3284			    "config");
3285			return (SET_ERROR(EAGAIN));
3286		}
3287	}
3288
3289	error = spa_check_for_missing_logs(spa);
3290	if (error != 0)
3291		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
3292
3293	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
3294		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
3295		    "guid sum (%llu != %llu)",
3296		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
3297		    (u_longlong_t)rvd->vdev_guid_sum);
3298		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
3299		    ENXIO));
3300	}
3301
3302	return (0);
3303}
3304
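/*
 * Load the metadata for indirect (removed) vdevs: the device removal state
 * via spa_remove_init() and the mapping condense state via
 * spa_condense_init().
 */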
3305static int
3306spa_ld_open_indirect_vdev_metadata(spa_t *spa)
3307{
3308	int error = 0;
3309	vdev_t *rvd = spa->spa_root_vdev;
3310
3311	/*
3312	 * Everything that we read before spa_remove_init() must be stored
3313	 * on concrete vdevs.  Therefore we do this as early as possible.
3314	 */
3315	error = spa_remove_init(spa);
3316	if (error != 0) {
3317		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
3318		    error);
3319		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3320	}
3321
3322	/*
3323	 * Retrieve information needed to condense indirect vdev mappings.
3324	 */
3325	error = spa_condense_init(spa);
3326	if (error != 0) {
3327		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
3328		    error);
3329		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3330	}
3331
3332	return (0);
3333}
3334
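/*
 * Check the feature flags recorded in the MOS against what this build
 * supports, publish the enabled/unsupported feature lists in
 * spa_load_info, and load the in-core feature refcount cache.
 */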
3335static int
3336spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
3337{
3338	int error = 0;
3339	vdev_t *rvd = spa->spa_root_vdev;
3340
3341	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
3342		boolean_t missing_feat_read = B_FALSE;
3343		nvlist_t *unsup_feat, *enabled_feat;
3344
3345		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
3346		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
3347			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3348		}
3349
3350		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
3351		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
3352			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3353		}
3354
3355		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
3356		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
3357			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3358		}
3359
3360		enabled_feat = fnvlist_alloc();
3361		unsup_feat = fnvlist_alloc();
3362
3363		if (!spa_features_check(spa, B_FALSE,
3364		    unsup_feat, enabled_feat))
3365			missing_feat_read = B_TRUE;
3366
3367		if (spa_writeable(spa) ||
3368		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
3369			if (!spa_features_check(spa, B_TRUE,
3370			    unsup_feat, enabled_feat)) {
3371				*missing_feat_writep = B_TRUE;
3372			}
3373		}
3374
3375		fnvlist_add_nvlist(spa->spa_load_info,
3376		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
3377
3378		if (!nvlist_empty(unsup_feat)) {
3379			fnvlist_add_nvlist(spa->spa_load_info,
3380			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
3381		}
3382
3383		fnvlist_free(enabled_feat);
3384		fnvlist_free(unsup_feat);
3385
3386		if (!missing_feat_read) {
3387			fnvlist_add_boolean(spa->spa_load_info,
3388			    ZPOOL_CONFIG_CAN_RDONLY);
3389		}
3390
3391		/*
3392		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
3393		 * twofold: to determine whether the pool is available for
3394		 * import in read-write mode and (if it is not) whether the
3395		 * pool is available for import in read-only mode. If the pool
3396		 * is available for import in read-write mode, it is displayed
3397		 * as available in userland; if it is not available for import
3398		 * in read-only mode, it is displayed as unavailable in
3399		 * userland. If the pool is available for import in read-only
3400		 * mode but not read-write mode, it is displayed as unavailable
3401		 * in userland with a special note that the pool is actually
3402		 * available for open in read-only mode.
3403		 *
3404		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
3405		 * missing a feature for write, we must first determine whether
3406		 * the pool can be opened read-only before returning to
3407		 * userland in order to know whether to display the
3408		 * abovementioned note.
3409		 */
3410		if (missing_feat_read || (*missing_feat_writep &&
3411		    spa_writeable(spa))) {
3412			spa_load_failed(spa, "pool uses unsupported features");
3413			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
3414			    ENOTSUP));
3415		}
3416
3417		/*
3418		 * Load refcounts for ZFS features from disk into an in-memory
3419		 * cache during SPA initialization.
3420		 */
3421		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
3422			uint64_t refcount;
3423
3424			error = feature_get_refcount_from_disk(spa,
3425			    &spa_feature_table[i], &refcount);
3426			if (error == 0) {
3427				spa->spa_feat_refcount_cache[i] = refcount;
3428			} else if (error == ENOTSUP) {
3429				spa->spa_feat_refcount_cache[i] =
3430				    SPA_FEATURE_DISABLED;
3431			} else {
3432				spa_load_failed(spa, "error getting refcount "
3433				    "for feature %s [error=%d]",
3434				    spa_feature_table[i].fi_guid, error);
3435				return (spa_vdev_err(rvd,
3436				    VDEV_AUX_CORRUPT_DATA, EIO));
3437			}
3438		}
3439	}
3440
3441	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
3442		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
3443		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
3444			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3445	}
3446
3447	/*
3448	 * Encryption was added before bookmark_v2, even though bookmark_v2
3449	 * is now a dependency. If this pool has encryption enabled without
3450	 * bookmark_v2, trigger an errata message.
3451	 */
3452	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
3453	    !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
3454		spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
3455	}
3456
3457	return (0);
3458}
3459
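/*
 * Open the DSL pool's special directories via dsl_pool_open() now that the
 * MOS is accessible.
 */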
3460static int
3461spa_ld_load_special_directories(spa_t *spa)
3462{
3463	int error = 0;
3464	vdev_t *rvd = spa->spa_root_vdev;
3465
3466	spa->spa_is_initializing = B_TRUE;
3467	error = dsl_pool_open(spa->spa_dsl_pool);
3468	spa->spa_is_initializing = B_FALSE;
3469	if (error != 0) {
3470		spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
3471		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3472	}
3473
3474	return (0);
3475}
3476
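/*
 * Load pool-wide state from the MOS: the checksum salt, the deferred-frees
 * bpobj, the deflate flag, the error logs, the pool history, the per-vdev
 * ZAP map and the pool properties object.
 */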
3477static int
3478spa_ld_get_props(spa_t *spa)
3479{
3480	int error = 0;
3481	uint64_t obj;
3482	vdev_t *rvd = spa->spa_root_vdev;
3483
3484	/* Grab the secret checksum salt from the MOS. */
3485	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3486	    DMU_POOL_CHECKSUM_SALT, 1,
3487	    sizeof (spa->spa_cksum_salt.zcs_bytes),
3488	    spa->spa_cksum_salt.zcs_bytes);
3489	if (error == ENOENT) {
3490		/* Generate a new salt for subsequent use */
3491		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
3492		    sizeof (spa->spa_cksum_salt.zcs_bytes));
3493	} else if (error != 0) {
3494		spa_load_failed(spa, "unable to retrieve checksum salt from "
3495		    "MOS [error=%d]", error);
3496		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3497	}
3498
3499	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
3500		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3501	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
3502	if (error != 0) {
3503		spa_load_failed(spa, "error opening deferred-frees bpobj "
3504		    "[error=%d]", error);
3505		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3506	}
3507
3508	/*
3509	 * Load the bit that tells us to use the new accounting function
3510	 * (raid-z deflation).  If we have an older pool, this will not
3511	 * be present.
3512	 */
3513	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
3514	if (error != 0 && error != ENOENT)
3515		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3516
3517	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
3518	    &spa->spa_creation_version, B_FALSE);
3519	if (error != 0 && error != ENOENT)
3520		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3521
3522	/*
3523	 * Load the persistent error log.  If we have an older pool, this will
3524	 * not be present.
3525	 */
3526	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
3527	    B_FALSE);
3528	if (error != 0 && error != ENOENT)
3529		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3530
3531	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
3532	    &spa->spa_errlog_scrub, B_FALSE);
3533	if (error != 0 && error != ENOENT)
3534		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3535
3536	/*
3537	 * Load the history object.  If we have an older pool, this
3538	 * will not be present.
3539	 */
3540	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
3541	if (error != 0 && error != ENOENT)
3542		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3543
3544	/*
3545	 * Load the per-vdev ZAP map. If we have an older pool, this will not
3546	 * be present; in this case, defer its creation to a later time to
3547	 * avoid dirtying the MOS this early / out of sync context. See
3548	 * spa_sync_config_object.
3549	 */
3550
3551	/* The sentinel is only available in the MOS config. */
3552	nvlist_t *mos_config;
3553	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
3554		spa_load_failed(spa, "unable to retrieve MOS config");
3555		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3556	}
3557
3558	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
3559	    &spa->spa_all_vdev_zaps, B_FALSE);
3560
3561	if (error == ENOENT) {
3562		VERIFY(!nvlist_exists(mos_config,
3563		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
3564		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
3565		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3566	} else if (error != 0) {
3567		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3568	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
3569		/*
3570		 * An older version of ZFS overwrote the sentinel value, so
3571		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
3572		 * destruction to later; see spa_sync_config_object.
3573		 */
3574		spa->spa_avz_action = AVZ_ACTION_DESTROY;
3575		/*
3576		 * We're assuming that no vdevs have had their ZAPs created
3577		 * before this. Better be sure of it.
3578		 */
3579		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3580	}
3581	nvlist_free(mos_config);
3582
3583	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3584
3585	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
3586	    B_FALSE);
3587	if (error && error != ENOENT)
3588		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3589
3590	if (error == 0) {
3591		uint64_t autoreplace;
3592
3593		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
3594		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
3595		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
3596		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
3597		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
3598		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
3599		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
3600		    &spa->spa_dedup_ditto);
3601		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
3602		spa->spa_autoreplace = (autoreplace != 0);
3603	}
3604
3605	/*
3606	 * If we are importing a pool with missing top-level vdevs,
3607	 * we enforce that the pool doesn't panic or get suspended on
3608	 * error since the likelihood of missing data is extremely high.
3609	 */
3610	if (spa->spa_missing_tvds > 0 &&
3611	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
3612	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3613		spa_load_note(spa, "forcing failmode to 'continue' "
3614		    "as some top level vdevs are missing");
3615		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
3616	}
3617
3618	return (0);
3619}
3620
3621static int
3622spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
3623{
3624	int error = 0;
3625	vdev_t *rvd = spa->spa_root_vdev;
3626
3627	/*
3628	 * If we're assembling the pool from the split-off vdevs of
3629	 * an existing pool, we don't want to attach the spares & cache
3630	 * devices.
3631	 */
3632
3633	/*
3634	 * Load any hot spares for this pool.
3635	 */
3636	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
3637	    B_FALSE);
3638	if (error != 0 && error != ENOENT)
3639		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3640	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3641		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
3642		if (load_nvlist(spa, spa->spa_spares.sav_object,
3643		    &spa->spa_spares.sav_config) != 0) {
3644			spa_load_failed(spa, "error loading spares nvlist");
3645			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3646		}
3647
3648		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3649		spa_load_spares(spa);
3650		spa_config_exit(spa, SCL_ALL, FTAG);
3651	} else if (error == 0) {
3652		spa->spa_spares.sav_sync = B_TRUE;
3653	}
3654
3655	/*
3656	 * Load any level 2 ARC devices for this pool.
3657	 */
3658	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
3659	    &spa->spa_l2cache.sav_object, B_FALSE);
3660	if (error != 0 && error != ENOENT)
3661		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3662	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3663		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
3664		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
3665		    &spa->spa_l2cache.sav_config) != 0) {
3666			spa_load_failed(spa, "error loading l2cache nvlist");
3667			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3668		}
3669
3670		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3671		spa_load_l2cache(spa);
3672		spa_config_exit(spa, SCL_ALL, FTAG);
3673	} else if (error == 0) {
3674		spa->spa_l2cache.sav_sync = B_TRUE;
3675	}
3676
3677	return (0);
3678}
3679
3680static int
3681spa_ld_load_vdev_metadata(spa_t *spa)
3682{
3683	int error = 0;
3684	vdev_t *rvd = spa->spa_root_vdev;
3685
3686	/*
3687	 * If the 'multihost' property is set, then never allow a pool to
3688	 * be imported when the system hostid is zero.  The exception to
3689	 * this rule is zdb which is always allowed to access pools.
3690	 */
3691	if (spa_multihost(spa) && spa_get_hostid() == 0 &&
3692	    (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
3693		fnvlist_add_uint64(spa->spa_load_info,
3694		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
3695		return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
3696	}
3697
3698	/*
3699	 * If the 'autoreplace' property is set, then post a resource notifying
3700	 * the ZFS DE that it should not issue any faults for unopenable
3701	 * devices.  We also iterate over the vdevs, and post a sysevent for any
3702	 * unopenable vdevs so that the normal autoreplace handler can take
3703	 * over.
3704	 */
3705	if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3706		spa_check_removed(spa->spa_root_vdev);
3707		/*
3708		 * For the import case, this is done in spa_import(), because
3709		 * at this point we're using the spare definitions from
3710		 * the MOS config, not necessarily from the userland config.
3711		 */
3712		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
3713			spa_aux_check_removed(&spa->spa_spares);
3714			spa_aux_check_removed(&spa->spa_l2cache);
3715		}
3716	}
3717
3718	/*
3719	 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
3720	 */
3721	error = vdev_load(rvd);
3722	if (error != 0) {
3723		spa_load_failed(spa, "vdev_load failed [error=%d]", error);
3724		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3725	}
3726
3727	error = spa_ld_log_spacemaps(spa);
3728	if (error != 0) {
3729		spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
3730		    error);
3731		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3732	}
3733
3734	/*
3735	 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
3736	 */
3737	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3738	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
3739	spa_config_exit(spa, SCL_ALL, FTAG);
3740
3741	return (0);
3742}
3743
3744static int
3745spa_ld_load_dedup_tables(spa_t *spa)
3746{
3747	int error = 0;
3748	vdev_t *rvd = spa->spa_root_vdev;
3749
3750	error = ddt_load(spa);
3751	if (error != 0) {
3752		spa_load_failed(spa, "ddt_load failed [error=%d]", error);
3753		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3754	}
3755
3756	return (0);
3757}
3758
3759static int
3760spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
3761{
3762	vdev_t *rvd = spa->spa_root_vdev;
3763
3764	if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
3765		boolean_t missing = spa_check_logs(spa);
3766		if (missing) {
3767			if (spa->spa_missing_tvds != 0) {
3768				spa_load_note(spa, "spa_check_logs failed "
3769				    "so dropping the logs");
3770			} else {
3771				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
3772				spa_load_failed(spa, "spa_check_logs failed");
3773				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
3774				    ENXIO));
3775			}
3776		}
3777	}
3778
3779	return (0);
3780}
3781
3782static int
3783spa_ld_verify_pool_data(spa_t *spa)
3784{
3785	int error = 0;
3786	vdev_t *rvd = spa->spa_root_vdev;
3787
3788	/*
3789	 * We've successfully opened the pool, verify that we're ready
3790	 * to start pushing transactions.
3791	 */
3792	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3793		error = spa_load_verify(spa);
3794		if (error != 0) {
3795			spa_load_failed(spa, "spa_load_verify failed "
3796			    "[error=%d]", error);
3797			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3798			    error));
3799		}
3800	}
3801
3802	return (0);
3803}
3804
3805static void
3806spa_ld_claim_log_blocks(spa_t *spa)
3807{
3808	dmu_tx_t *tx;
3809	dsl_pool_t *dp = spa_get_dsl(spa);
3810
3811	/*
3812	 * Claim log blocks that haven't been committed yet.
3813	 * This must all happen in a single txg.
3814	 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
3815	 * invoked from zil_claim_log_block()'s i/o done callback.
3816	 * Price of rollback is that we abandon the log.
3817	 */
3818	spa->spa_claiming = B_TRUE;
3819
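	/*
	 * Walk every dataset in the pool and claim its ZIL blocks in the
	 * single txg created below; zil_claim() is invoked once per dataset.
	 */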
3820	tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
3821	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
3822	    zil_claim, tx, DS_FIND_CHILDREN);
3823	dmu_tx_commit(tx);
3824
3825	spa->spa_claiming = B_FALSE;
3826
3827	spa_set_log_state(spa, SPA_LOG_GOOD);
3828}
3829
3830static void
3831spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
3832    boolean_t update_config_cache)
3833{
3834	vdev_t *rvd = spa->spa_root_vdev;
3835	int need_update = B_FALSE;
3836
3837	/*
3838	 * If the config cache is stale, or we have uninitialized
3839	 * metaslabs (see spa_vdev_add()), then update the config.
3840	 *
3841	 * If this is a verbatim import, trust the current
3842	 * in-core spa_config and update the disk labels.
3843	 */
3844	if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
3845	    spa->spa_load_state == SPA_LOAD_IMPORT ||
3846	    spa->spa_load_state == SPA_LOAD_RECOVER ||
3847	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
3848		need_update = B_TRUE;
3849
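	/*
	 * A top-level vdev whose metaslab array object is still 0 has been
	 * added but not yet fully initialized (see spa_vdev_add()), so the
	 * config should be refreshed for it as well.
	 */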
3850	for (int c = 0; c < rvd->vdev_children; c++)
3851		if (rvd->vdev_child[c]->vdev_ms_array == 0)
3852			need_update = B_TRUE;
3853
3854	/*
3855	 * Update the config cache asynchronously in case we're the
3856	 * root pool, in which case the config cache isn't writable yet.
3857	 */
3858	if (need_update)
3859		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3860}
3861
3862static void
3863spa_ld_prepare_for_reload(spa_t *spa)
3864{
3865	int mode = spa->spa_mode;
3866	int async_suspended = spa->spa_async_suspended;
3867
3868	spa_unload(spa);
3869	spa_deactivate(spa);
3870	spa_activate(spa, mode);
3871
3872	/*
3873	 * We save the value of spa_async_suspended as it gets reset to 0 by
3874	 * spa_unload(). We want to restore it to the original value before
3875	 * returning, as we might be calling spa_async_resume() later.
3876	 */
3877	spa->spa_async_suspended = async_suspended;
3878}
3879
3880static int
3881spa_ld_read_checkpoint_txg(spa_t *spa)
3882{
3883	uberblock_t checkpoint;
3884	int error = 0;
3885
3886	ASSERT0(spa->spa_checkpoint_txg);
3887	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3888
3889	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3890	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3891	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3892
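	/* ENOENT means the pool has no checkpoint; that is not an error. */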
3893	if (error == ENOENT)
3894		return (0);
3895
3896	if (error != 0)
3897		return (error);
3898
3899	ASSERT3U(checkpoint.ub_txg, !=, 0);
3900	ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
3901	ASSERT3U(checkpoint.ub_timestamp, !=, 0);
3902	spa->spa_checkpoint_txg = checkpoint.ub_txg;
3903	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
3904
3905	return (0);
3906}
3907
3908static int
3909spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
3910{
3911	int error = 0;
3912
3913	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3914	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3915
3916	/*
3917	 * Never trust the config that is provided unless we are assembling
3918	 * a pool following a split.
3919	 * This means don't trust blkptrs and the vdev tree in general. This
3920	 * also effectively puts the spa in read-only mode since
3921	 * spa_writeable() checks for spa_trust_config to be true.
3922	 * We will later load a trusted config from the MOS.
3923	 */
3924	if (type != SPA_IMPORT_ASSEMBLE)
3925		spa->spa_trust_config = B_FALSE;
3926
3927	/*
3928	 * Parse the config provided to create a vdev tree.
3929	 */
3930	error = spa_ld_parse_config(spa, type);
3931	if (error != 0)
3932		return (error);
3933
3934	spa_import_progress_add(spa);
3935
3936	/*
3937	 * Now that we have the vdev tree, try to open each vdev. This involves
3938	 * opening the underlying physical device, retrieving its geometry and
3939	 * probing the vdev with a dummy I/O. The state of each vdev will be set
3940	 * based on the success of those operations. After this we'll be ready
3941	 * to read from the vdevs.
3942	 */
3943	error = spa_ld_open_vdevs(spa);
3944	if (error != 0)
3945		return (error);
3946
3947	/*
3948	 * Read the label of each vdev and make sure that the GUIDs stored
3949	 * there match the GUIDs in the config provided.
3950	 * If we're assembling a new pool that's been split off from an
3951	 * existing pool, the labels haven't yet been updated so we skip
3952	 * validation for now.
3953	 */
3954	if (type != SPA_IMPORT_ASSEMBLE) {
3955		error = spa_ld_validate_vdevs(spa);
3956		if (error != 0)
3957			return (error);
3958	}
3959
3960	/*
3961	 * Read all vdev labels to find the best uberblock (i.e. latest,
3962	 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
3963	 * get the list of features required to read blkptrs in the MOS from
3964	 * the vdev label with the best uberblock and verify that our version
3965	 * of zfs supports them all.
3966	 */
3967	error = spa_ld_select_uberblock(spa, type);
3968	if (error != 0)
3969		return (error);
3970
3971	/*
3972	 * Pass that uberblock to the dsl_pool layer which will open the root
3973	 * blkptr. This blkptr points to the latest version of the MOS and will
3974	 * allow us to read its contents.
3975	 */
3976	error = spa_ld_open_rootbp(spa);
3977	if (error != 0)
3978		return (error);
3979
3980	return (0);
3981}
3982
3983static int
3984spa_ld_checkpoint_rewind(spa_t *spa)
3985{
3986	uberblock_t checkpoint;
3987	int error = 0;
3988
3989	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3990	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3991
3992	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3993	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3994	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3995
3996	if (error != 0) {
3997		spa_load_failed(spa, "unable to retrieve checkpointed "
3998		    "uberblock from the MOS config [error=%d]", error);
3999
4000		if (error == ENOENT)
4001			error = ZFS_ERR_NO_CHECKPOINT;
4002
4003		return (error);
4004	}
4005
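	/*
	 * A valid checkpointed uberblock always predates the uberblock we
	 * selected during the initial load, and records its own txg as the
	 * checkpoint txg (this is what the assertions below verify).
	 */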
4006	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
4007	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
4008
4009	/*
4010	 * We need to update the txg and timestamp of the checkpointed
4011	 * uberblock to be higher than the latest one. This ensures that
4012	 * the checkpointed uberblock is selected if we were to close and
4013	 * reopen the pool right after we've written it in the vdev labels.
4014	 * (also see block comment in vdev_uberblock_compare)
4015	 */
4016	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
4017	checkpoint.ub_timestamp = gethrestime_sec();
4018
4019	/*
4020	 * Set current uberblock to be the checkpointed uberblock.
4021	 */
4022	spa->spa_uberblock = checkpoint;
4023
4024	/*
4025	 * If we are doing a normal rewind, then the pool is open for
4026	 * writing and we sync the "updated" checkpointed uberblock to
4027	 * disk. Once this is done, we've basically rewound the whole
4028	 * pool and there is no way back.
4029	 *
4030	 * There are cases when we don't want to attempt to sync the
4031	 * checkpointed uberblock to disk because we are opening a
4032	 * pool as read-only. Specifically, verifying the checkpointed
4033	 * state with zdb, and importing the checkpointed state to get
4034	 * a "preview" of its content.
4035	 */
4036	if (spa_writeable(spa)) {
4037		vdev_t *rvd = spa->spa_root_vdev;
4038
4039		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4040		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
4041		int svdcount = 0;
4042		int children = rvd->vdev_children;
4043		int c0 = spa_get_random(children);
4044
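		/*
		 * Pick up to SPA_SYNC_MIN_VDEVS concrete, non-log top-level
		 * vdevs, starting from a randomly chosen child, and sync the
		 * "updated" checkpointed uberblock to their labels below.
		 */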
4045		for (int c = 0; c < children; c++) {
4046			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
4047
4048			/* Stop when revisiting the first vdev */
4049			if (c > 0 && svd[0] == vd)
4050				break;
4051
4052			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
4053			    !vdev_is_concrete(vd))
4054				continue;
4055
4056			svd[svdcount++] = vd;
4057			if (svdcount == SPA_SYNC_MIN_VDEVS)
4058				break;
4059		}
4060		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
4061		if (error == 0)
4062			spa->spa_last_synced_guid = rvd->vdev_guid;
4063		spa_config_exit(spa, SCL_ALL, FTAG);
4064
4065		if (error != 0) {
4066			spa_load_failed(spa, "failed to write checkpointed "
4067			    "uberblock to the vdev labels [error=%d]", error);
4068			return (error);
4069		}
4070	}
4071
4072	return (0);
4073}
4074
4075static int
4076spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
4077    boolean_t *update_config_cache)
4078{
4079	int error;
4080
4081	/*
4082	 * Parse the config for pool, open and validate vdevs,
4083	 * select an uberblock, and use that uberblock to open
4084	 * the MOS.
4085	 */
4086	error = spa_ld_mos_init(spa, type);
4087	if (error != 0)
4088		return (error);
4089
4090	/*
4091	 * Retrieve the trusted config stored in the MOS and use it to create
4092	 * a new, exact version of the vdev tree, then reopen all vdevs.
4093	 */
4094	error = spa_ld_trusted_config(spa, type, B_FALSE);
4095	if (error == EAGAIN) {
4096		if (update_config_cache != NULL)
4097			*update_config_cache = B_TRUE;
4098
4099		/*
4100		 * Redo the loading process with the trusted config if it is
4101		 * too different from the untrusted config.
4102		 */
4103		spa_ld_prepare_for_reload(spa);
4104		spa_load_note(spa, "RELOADING");
4105		error = spa_ld_mos_init(spa, type);
4106		if (error != 0)
4107			return (error);
4108
4109		error = spa_ld_trusted_config(spa, type, B_TRUE);
4110		if (error != 0)
4111			return (error);
4112
4113	} else if (error != 0) {
4114		return (error);
4115	}
4116
4117	return (0);
4118}
4119
4120/*
4121 * Load an existing storage pool, using the config provided. This config
4122 * describes which vdevs are part of the pool and is later validated against
4123 * partial configs present in each vdev's label and an entire copy of the
4124 * config stored in the MOS.
4125 */
4126static int
4127spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
4128{
4129	int error = 0;
4130	boolean_t missing_feat_write = B_FALSE;
4131	boolean_t checkpoint_rewind =
4132	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4133	boolean_t update_config_cache = B_FALSE;
4134
4135	ASSERT(MUTEX_HELD(&spa_namespace_lock));
4136	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
4137
4138	spa_load_note(spa, "LOADING");
4139
4140	error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
4141	if (error != 0)
4142		return (error);
4143
4144	/*
4145	 * If we are rewinding to the checkpoint then we need to repeat
4146	 * everything we've done so far in this function but this time
4147	 * selecting the checkpointed uberblock and using that to open
4148	 * the MOS.
4149	 */
4150	if (checkpoint_rewind) {
4151		/*
4152		 * If we are rewinding to the checkpoint, update the config
4153		 * cache anyway.
4154		 */
4155		update_config_cache = B_TRUE;
4156
4157		/*
4158		 * Extract the checkpointed uberblock from the current MOS
4159		 * and use this as the pool's uberblock from now on. If the
4160		 * pool is imported as writeable we also write the checkpoint
4161		 * uberblock to the labels, making the rewind permanent.
4162		 */
4163		error = spa_ld_checkpoint_rewind(spa);
4164		if (error != 0)
4165			return (error);
4166
4167		/*
4168		 * Redo the loading process with the checkpointed
4169		 * uberblock.
4170		 */
4171		spa_ld_prepare_for_reload(spa);
4172		spa_load_note(spa, "LOADING checkpointed uberblock");
4173		error = spa_ld_mos_with_trusted_config(spa, type, NULL);
4174		if (error != 0)
4175			return (error);
4176	}
4177
4178	/*
4179	 * Retrieve the checkpoint txg if the pool has a checkpoint.
4180	 */
4181	error = spa_ld_read_checkpoint_txg(spa);
4182	if (error != 0)
4183		return (error);
4184
4185	/*
4186	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
4187	 * from the pool and their contents were re-mapped to other vdevs. Note
4188	 * that everything that we read before this step must have been
4189	 * rewritten on concrete vdevs after the last device removal was
4190	 * initiated. Otherwise we could be reading from indirect vdevs before
4191	 * we have loaded their mappings.
4192	 */
4193	error = spa_ld_open_indirect_vdev_metadata(spa);
4194	if (error != 0)
4195		return (error);
4196
4197	/*
4198	 * Retrieve the full list of active features from the MOS and check if
4199	 * they are all supported.
4200	 */
4201	error = spa_ld_check_features(spa, &missing_feat_write);
4202	if (error != 0)
4203		return (error);
4204
4205	/*
4206	 * Load several special directories from the MOS needed by the dsl_pool
4207	 * layer.
4208	 */
4209	error = spa_ld_load_special_directories(spa);
4210	if (error != 0)
4211		return (error);
4212
4213	/*
4214	 * Retrieve pool properties from the MOS.
4215	 */
4216	error = spa_ld_get_props(spa);
4217	if (error != 0)
4218		return (error);
4219
4220	/*
4221	 * Retrieve the list of auxiliary devices - cache devices and spares -
4222	 * and open them.
4223	 */
4224	error = spa_ld_open_aux_vdevs(spa, type);
4225	if (error != 0)
4226		return (error);
4227
4228	/*
4229	 * Load the metadata for all vdevs. Also check if unopenable devices
4230	 * should be autoreplaced.
4231	 */
4232	error = spa_ld_load_vdev_metadata(spa);
4233	if (error != 0)
4234		return (error);
4235
4236	error = spa_ld_load_dedup_tables(spa);
4237	if (error != 0)
4238		return (error);
4239
4240	/*
4241	 * Verify the logs now to make sure we don't have any unexpected errors
4242	 * when we claim log blocks later.
4243	 */
4244	error = spa_ld_verify_logs(spa, type, ereport);
4245	if (error != 0)
4246		return (error);
4247
4248	if (missing_feat_write) {
4249		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
4250
4251		/*
4252		 * At this point, we know that we can open the pool in
4253		 * read-only mode but not read-write mode. We now have enough
4254		 * information and can return to userland.
4255		 */
4256		return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
4257		    ENOTSUP));
4258	}
4259
4260	/*
4261	 * Traverse the last txgs to make sure the pool was left off in a safe
4262	 * state. When performing an extreme rewind, we verify the whole pool,
4263	 * which can take a very long time.
4264	 */
4265	error = spa_ld_verify_pool_data(spa);
4266	if (error != 0)
4267		return (error);
4268
4269	/*
4270	 * Calculate the deflated space for the pool. This must be done before
4271	 * we write anything to the pool because we'd need to update the space
4272	 * accounting using the deflated sizes.
4273	 */
4274	spa_update_dspace(spa);
4275
4276	/*
4277	 * We have now retrieved all the information we needed to open the
4278	 * pool. If we are importing the pool in read-write mode, a few
4279	 * additional steps must be performed to finish the import.
4280	 */
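	/*
	 * Note that these final steps are skipped for read-only opens and for
	 * loads that are capped at an older txg (spa_load_max_txg), unless we
	 * are explicitly recovering (SPA_LOAD_RECOVER).
	 */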
4281	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
4282	    spa->spa_load_max_txg == UINT64_MAX)) {
4283		uint64_t config_cache_txg = spa->spa_config_txg;
4284
4285		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
4286
4287		/*
4288		 * In case of a checkpoint rewind, log the original txg
4289		 * of the checkpointed uberblock.
4290		 */
4291		if (checkpoint_rewind) {
4292			spa_history_log_internal(spa, "checkpoint rewind",
4293			    NULL, "rewound state to txg=%llu",
4294			    (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
4295		}
4296
4297		/*
4298		 * Traverse the ZIL and claim all blocks.
4299		 */
4300		spa_ld_claim_log_blocks(spa);
4301
4302		/*
4303		 * Kick-off the syncing thread.
4304		 */
4305		spa->spa_sync_on = B_TRUE;
4306		txg_sync_start(spa->spa_dsl_pool);
4307		mmp_thread_start(spa);
4308
4309		/*
4310		 * Wait for all claims to sync.  We sync up to the highest
4311		 * claimed log block birth time so that claimed log blocks
4312		 * don't appear to be from the future.  spa_claim_max_txg
4313		 * will have been set for us by ZIL traversal operations
4314		 * performed above.
4315		 */
4316		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
4317
4318		/*
4319		 * Check if we need to request an update of the config. On the
4320		 * next sync, we would update the config stored in vdev labels
4321		 * and the cachefile (by default /etc/zfs/zpool.cache).
4322		 */
4323		spa_ld_check_for_config_update(spa, config_cache_txg,
4324		    update_config_cache);
4325
4326		/*
4327		 * Check all DTLs to see if anything needs resilvering.
4328		 */
4329		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
4330		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
4331			spa_async_request(spa, SPA_ASYNC_RESILVER);
4332
4333		/*
4334		 * Log the fact that we booted up (so that we can detect if
4335		 * we rebooted in the middle of an operation).
4336		 */
4337		spa_history_log_version(spa, "open");
4338
4339		spa_restart_removal(spa);
4340		spa_spawn_aux_threads(spa);
4341
4342		/*
4343		 * Delete any inconsistent datasets.
4344		 *
4345		 * Note:
4346		 * Since we may be issuing deletes for clones here,
4347		 * we make sure to do so after we've spawned all the
4348		 * auxiliary threads above (which include the livelist
4349		 * deletion zthr).
4350		 */
4351		(void) dmu_objset_find(spa_name(spa),
4352		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
4353
4354		/*
4355		 * Clean up any stale temporary dataset userrefs.
4356		 */
4357		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
4358
4359		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4360		vdev_initialize_restart(spa->spa_root_vdev);
4361		vdev_trim_restart(spa->spa_root_vdev);
4362		vdev_autotrim_restart(spa);
4363		spa_config_exit(spa, SCL_CONFIG, FTAG);
4364	}
4365
4366	spa_import_progress_remove(spa);
4367	spa_load_note(spa, "LOADED");
4368
4369	return (0);
4370}
4371
4372static int
4373spa_load_retry(spa_t *spa, spa_load_state_t state)
4374{
4375	int mode = spa->spa_mode;
4376
4377	spa_unload(spa);
4378	spa_deactivate(spa);
4379
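	/*
	 * Cap the next load attempt at the txg just before the uberblock we
	 * failed with, so the retry selects an older uberblock.
	 */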
4380	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
4381
4382	spa_activate(spa, mode);
4383	spa_async_suspend(spa);
4384
4385	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
4386	    (u_longlong_t)spa->spa_load_max_txg);
4387
4388	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
4389}
4390
4391/*
4392 * If spa_load() fails this function will try loading prior txg's. If
4393 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
4394 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
4395 * function will not rewind the pool and will return the same error as
4396 * spa_load().
4397 */
4398static int
4399spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
4400    int rewind_flags)
4401{
4402	nvlist_t *loadinfo = NULL;
4403	nvlist_t *config = NULL;
4404	int load_error, rewind_error;
4405	uint64_t safe_rewind_txg;
4406	uint64_t min_txg;
4407
4408	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
4409		spa->spa_load_max_txg = spa->spa_load_txg;
4410		spa_set_log_state(spa, SPA_LOG_CLEAR);
4411	} else {
4412		spa->spa_load_max_txg = max_request;
4413		if (max_request != UINT64_MAX)
4414			spa->spa_extreme_rewind = B_TRUE;
4415	}
4416
4417	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
4418	if (load_error == 0)
4419		return (0);
4420	if (load_error == ZFS_ERR_NO_CHECKPOINT) {
4421		/*
4422		 * When attempting checkpoint-rewind on a pool with no
4423		 * checkpoint, we should not attempt to load uberblocks
4424		 * from previous txgs when spa_load fails.
4425		 */
4426		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4427		spa_import_progress_remove(spa);
4428		return (load_error);
4429	}
4430
4431	if (spa->spa_root_vdev != NULL)
4432		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4433
4434	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
4435	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
4436
4437	if (rewind_flags & ZPOOL_NEVER_REWIND) {
4438		nvlist_free(config);
4439		spa_import_progress_remove(spa);
4440		return (load_error);
4441	}
4442
4443	if (state == SPA_LOAD_RECOVER) {
4444		/* Price of rolling back is discarding txgs, including log */
4445		spa_set_log_state(spa, SPA_LOG_CLEAR);
4446	} else {
4447		/*
4448		 * If we aren't rolling back save the load info from our first
4449		 * import attempt so that we can restore it after attempting
4450		 * to rewind.
4451		 */
4452		loadinfo = spa->spa_load_info;
4453		spa->spa_load_info = fnvlist_alloc();
4454	}
4455
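	/*
	 * Set up the rewind window: by default we only rewind as far back as
	 * safe_rewind_txg (TXG_DEFER_SIZE txgs before the last synced
	 * uberblock); with ZPOOL_EXTREME_REWIND we allow going all the way
	 * back to TXG_INITIAL.
	 */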
4456	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
4457	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
4458	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
4459	    TXG_INITIAL : safe_rewind_txg;
4460
4461	/*
4462	 * Continue as long as we're finding errors, we're still within
4463	 * the acceptable rewind range, and we're still finding uberblocks
4464	 */
4465	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
4466	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
4467		if (spa->spa_load_max_txg < safe_rewind_txg)
4468			spa->spa_extreme_rewind = B_TRUE;
4469		rewind_error = spa_load_retry(spa, state);
4470	}
4471
4472	spa->spa_extreme_rewind = B_FALSE;
4473	spa->spa_load_max_txg = UINT64_MAX;
4474
4475	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
4476		spa_config_set(spa, config);
4477	else
4478		nvlist_free(config);
4479
4480	if (state == SPA_LOAD_RECOVER) {
4481		ASSERT3P(loadinfo, ==, NULL);
4482		spa_import_progress_remove(spa);
4483		return (rewind_error);
4484	} else {
4485		/* Store the rewind info as part of the initial load info */
4486		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
4487		    spa->spa_load_info);
4488
4489		/* Restore the initial load info */
4490		fnvlist_free(spa->spa_load_info);
4491		spa->spa_load_info = loadinfo;
4492
4493		spa_import_progress_remove(spa);
4494		return (load_error);
4495	}
4496}
4497
4498/*
4499 * Pool Open/Import
4500 *
4501 * The import case is identical to an open except that the configuration is sent
4502 * down from userland, instead of grabbed from the configuration cache.  For the
4503 * case of an open, the pool configuration will exist in the
4504 * POOL_STATE_UNINITIALIZED state.
4505 *
4506 * The stats information (gen/count/ustats) is used to gather vdev statistics at
4507 * the same time we open the pool, without having to keep around the spa_t in
4508 * some ambiguous state.
4509 */
4510static int
4511spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
4512    nvlist_t **config)
4513{
4514	spa_t *spa;
4515	spa_load_state_t state = SPA_LOAD_OPEN;
4516	int error;
4517	int locked = B_FALSE;
4518
4519	*spapp = NULL;
4520
4521	/*
4522	 * As disgusting as this is, we need to support recursive calls to this
4523	 * function because dsl_dir_open() is called during spa_load(), and ends
4524	 * up calling spa_open() again.  The real fix is to figure out how to
4525	 * avoid dsl_dir_open() calling this in the first place.
4526	 */
4527	if (mutex_owner(&spa_namespace_lock) != curthread) {
4528		mutex_enter(&spa_namespace_lock);
4529		locked = B_TRUE;
4530	}
4531
4532	if ((spa = spa_lookup(pool)) == NULL) {
4533		if (locked)
4534			mutex_exit(&spa_namespace_lock);
4535		return (SET_ERROR(ENOENT));
4536	}
4537
4538	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
4539		zpool_load_policy_t policy;
4540
4541		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
4542		    &policy);
4543		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
4544			state = SPA_LOAD_RECOVER;
4545
4546		spa_activate(spa, spa_mode_global);
4547
4548		if (state != SPA_LOAD_RECOVER)
4549			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4550		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
4551
4552		zfs_dbgmsg("spa_open_common: opening %s", pool);
4553		error = spa_load_best(spa, state, policy.zlp_txg,
4554		    policy.zlp_rewind);
4555
4556		if (error == EBADF) {
4557			/*
4558			 * If vdev_validate() returns failure (indicated by
4559			 * EBADF), it means that one of the vdev labels indicates
4560			 * that the pool has been exported or destroyed.  If
4561			 * this is the case, the config cache is out of sync and
4562			 * we should remove the pool from the namespace.
4563			 */
4564			spa_unload(spa);
4565			spa_deactivate(spa);
4566			spa_write_cachefile(spa, B_TRUE, B_TRUE);
4567			spa_remove(spa);
4568			if (locked)
4569				mutex_exit(&spa_namespace_lock);
4570			return (SET_ERROR(ENOENT));
4571		}
4572
4573		if (error) {
4574			/*
4575			 * We can't open the pool, but we still have useful
4576			 * information: the state of each vdev after the
4577			 * attempted vdev_open().  Return this to the user.
4578			 */
4579			if (config != NULL && spa->spa_config) {
4580				VERIFY(nvlist_dup(spa->spa_config, config,
4581				    KM_SLEEP) == 0);
4582				VERIFY(nvlist_add_nvlist(*config,
4583				    ZPOOL_CONFIG_LOAD_INFO,
4584				    spa->spa_load_info) == 0);
4585			}
4586			spa_unload(spa);
4587			spa_deactivate(spa);
4588			spa->spa_last_open_failed = error;
4589			if (locked)
4590				mutex_exit(&spa_namespace_lock);
4591			*spapp = NULL;
4592			return (error);
4593		}
4594	}
4595
4596	spa_open_ref(spa, tag);
4597
4598	if (config != NULL)
4599		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4600
4601	/*
4602	 * If we've recovered the pool, pass back any information we
4603	 * gathered while doing the load.
4604	 */
4605	if (state == SPA_LOAD_RECOVER) {
4606		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
4607		    spa->spa_load_info) == 0);
4608	}
4609
4610	if (locked) {
4611		spa->spa_last_open_failed = 0;
4612		spa->spa_last_ubsync_txg = 0;
4613		spa->spa_load_txg = 0;
4614		mutex_exit(&spa_namespace_lock);
4615	}
4616
4617	*spapp = spa;
4618
4619	return (0);
4620}
4621
4622int
4623spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
4624    nvlist_t **config)
4625{
4626	return (spa_open_common(name, spapp, tag, policy, config));
4627}
4628
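/*
 * A minimal usage sketch (hypothetical pool name): callers pair spa_open()
 * with spa_close() on the same tag, e.g.
 *
 *	spa_t *spa;
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		... use spa ...
 *		spa_close(spa, FTAG);
 *	}
 */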
4629int
4630spa_open(const char *name, spa_t **spapp, void *tag)
4631{
4632	return (spa_open_common(name, spapp, tag, NULL, NULL));
4633}
4634
4635/*
4636 * Look up the given spa_t, incrementing the inject count in the process,
4637 * preventing it from being exported or destroyed.
4638 */
4639spa_t *
4640spa_inject_addref(char *name)
4641{
4642	spa_t *spa;
4643
4644	mutex_enter(&spa_namespace_lock);
4645	if ((spa = spa_lookup(name)) == NULL) {
4646		mutex_exit(&spa_namespace_lock);
4647		return (NULL);
4648	}
4649	spa->spa_inject_ref++;
4650	mutex_exit(&spa_namespace_lock);
4651
4652	return (spa);
4653}
4654
4655void
4656spa_inject_delref(spa_t *spa)
4657{
4658	mutex_enter(&spa_namespace_lock);
4659	spa->spa_inject_ref--;
4660	mutex_exit(&spa_namespace_lock);
4661}
4662
4663/*
4664 * Add spares device information to the nvlist.
4665 */
4666static void
4667spa_add_spares(spa_t *spa, nvlist_t *config)
4668{
4669	nvlist_t **spares;
4670	uint_t i, nspares;
4671	nvlist_t *nvroot;
4672	uint64_t guid;
4673	vdev_stat_t *vs;
4674	uint_t vsc;
4675	uint64_t pool;
4676
4677	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4678
4679	if (spa->spa_spares.sav_count == 0)
4680		return;
4681
4682	VERIFY(nvlist_lookup_nvlist(config,
4683	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4684	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4685	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4686	if (nspares != 0) {
4687		VERIFY(nvlist_add_nvlist_array(nvroot,
4688		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4689		VERIFY(nvlist_lookup_nvlist_array(nvroot,
4690		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
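		/*
		 * Re-fetch the array we just added so that 'spares' now
		 * points at the copies embedded in the caller's config and
		 * the status updates below are visible to the caller.
		 */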
4691
4692		/*
4693		 * Go through and find any spares which have since been
4694		 * repurposed as an active spare.  If this is the case, update
4695		 * their status appropriately.
4696		 */
4697		for (i = 0; i < nspares; i++) {
4698			VERIFY(nvlist_lookup_uint64(spares[i],
4699			    ZPOOL_CONFIG_GUID, &guid) == 0);
4700			if (spa_spare_exists(guid, &pool, NULL) &&
4701			    pool != 0ULL) {
4702				VERIFY(nvlist_lookup_uint64_array(
4703				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
4704				    (uint64_t **)&vs, &vsc) == 0);
4705				vs->vs_state = VDEV_STATE_CANT_OPEN;
4706				vs->vs_aux = VDEV_AUX_SPARED;
4707			}
4708		}
4709	}
4710}
4711
4712/*
4713 * Add l2cache device information to the nvlist, including vdev stats.
4714 */
4715static void
4716spa_add_l2cache(spa_t *spa, nvlist_t *config)
4717{
4718	nvlist_t **l2cache;
4719	uint_t i, j, nl2cache;
4720	nvlist_t *nvroot;
4721	uint64_t guid;
4722	vdev_t *vd;
4723	vdev_stat_t *vs;
4724	uint_t vsc;
4725
4726	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4727
4728	if (spa->spa_l2cache.sav_count == 0)
4729		return;
4730
4731	VERIFY(nvlist_lookup_nvlist(config,
4732	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4733	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4734	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4735	if (nl2cache != 0) {
4736		VERIFY(nvlist_add_nvlist_array(nvroot,
4737		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4738		VERIFY(nvlist_lookup_nvlist_array(nvroot,
4739		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4740
4741		/*
4742		 * Update level 2 cache device stats.
4743		 */
4744
4745		for (i = 0; i < nl2cache; i++) {
4746			VERIFY(nvlist_lookup_uint64(l2cache[i],
4747			    ZPOOL_CONFIG_GUID, &guid) == 0);
4748
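			/*
			 * Find the in-core l2cache vdev with this GUID so we
			 * can copy its current stats into the config entry.
			 */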
4749			vd = NULL;
4750			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
4751				if (guid ==
4752				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
4753					vd = spa->spa_l2cache.sav_vdevs[j];
4754					break;
4755				}
4756			}
4757			ASSERT(vd != NULL);
4758
4759			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
4760			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
4761			    == 0);
4762			vdev_get_stats(vd, vs);
4763			vdev_config_generate_stats(vd, l2cache[i]);
4764
4765		}
4766	}
4767}
4768
4769static void
4770spa_add_feature_stats(spa_t *spa, nvlist_t *config)
4771{
4772	nvlist_t *features;
4773	zap_cursor_t zc;
4774	zap_attribute_t za;
4775
4776	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4777	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4778
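	/*
	 * Walk both feature ZAPs (features-for-read and features-for-write)
	 * and record each feature's reference count in the nvlist.
	 */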
4779	if (spa->spa_feat_for_read_obj != 0) {
4780		for (zap_cursor_init(&zc, spa->spa_meta_objset,
4781		    spa->spa_feat_for_read_obj);
4782		    zap_cursor_retrieve(&zc, &za) == 0;
4783		    zap_cursor_advance(&zc)) {
4784			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4785			    za.za_num_integers == 1);
4786			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
4787			    za.za_first_integer));
4788		}
4789		zap_cursor_fini(&zc);
4790	}
4791
4792	if (spa->spa_feat_for_write_obj != 0) {
4793		for (zap_cursor_init(&zc, spa->spa_meta_objset,
4794		    spa->spa_feat_for_write_obj);
4795		    zap_cursor_retrieve(&zc, &za) == 0;
4796		    zap_cursor_advance(&zc)) {
4797			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4798			    za.za_num_integers == 1);
4799			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
4800			    za.za_first_integer));
4801		}
4802		zap_cursor_fini(&zc);
4803	}
4804
4805	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
4806	    features) == 0);
4807	nvlist_free(features);
4808}
4809
4810int
4811spa_get_stats(const char *name, nvlist_t **config,
4812    char *altroot, size_t buflen)
4813{
4814	int error;
4815	spa_t *spa;
4816
4817	*config = NULL;
4818	error = spa_open_common(name, &spa, FTAG, NULL, config);
4819
4820	if (spa != NULL) {
4821		/*
4822		 * This still leaves a window of inconsistency where the spares
4823		 * or l2cache devices could change and the config would be
4824		 * self-inconsistent.
4825		 */
4826		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4827
4828		if (*config != NULL) {
4829			uint64_t loadtimes[2];
4830
4831			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
4832			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
4833			VERIFY(nvlist_add_uint64_array(*config,
4834			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
4835
4836			VERIFY(nvlist_add_uint64(*config,
4837			    ZPOOL_CONFIG_ERRCOUNT,
4838			    spa_get_errlog_size(spa)) == 0);
4839
4840			if (spa_suspended(spa)) {
4841				VERIFY(nvlist_add_uint64(*config,
4842				    ZPOOL_CONFIG_SUSPENDED,
4843				    spa->spa_failmode) == 0);
4844				VERIFY(nvlist_add_uint64(*config,
4845				    ZPOOL_CONFIG_SUSPENDED_REASON,
4846				    spa->spa_suspended) == 0);
4847			}
4848
4849			spa_add_spares(spa, *config);
4850			spa_add_l2cache(spa, *config);
4851			spa_add_feature_stats(spa, *config);
4852		}
4853	}
4854
4855	/*
4856	 * We want to get the alternate root even for faulted pools, so we cheat
4857	 * and call spa_lookup() directly.
4858	 */
4859	if (altroot) {
4860		if (spa == NULL) {
4861			mutex_enter(&spa_namespace_lock);
4862			spa = spa_lookup(name);
4863			if (spa)
4864				spa_altroot(spa, altroot, buflen);
4865			else
4866				altroot[0] = '\0';
4867			spa = NULL;
4868			mutex_exit(&spa_namespace_lock);
4869		} else {
4870			spa_altroot(spa, altroot, buflen);
4871		}
4872	}
4873
4874	if (spa != NULL) {
4875		spa_config_exit(spa, SCL_CONFIG, FTAG);
4876		spa_close(spa, FTAG);
4877	}
4878
4879	return (error);
4880}
4881
4882/*
4883 * Validate that the auxiliary device array is well formed.  We must have an
4884 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
4885 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
4886 * specified, as long as they are well-formed.
4887 */
4888static int
4889spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
4890    spa_aux_vdev_t *sav, const char *config, uint64_t version,
4891    vdev_labeltype_t label)
4892{
4893	nvlist_t **dev;
4894	uint_t i, ndev;
4895	vdev_t *vd;
4896	int error;
4897
4898	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4899
4900	/*
4901	 * It's acceptable to have no devs specified.
4902	 */
4903	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
4904		return (0);
4905
4906	if (ndev == 0)
4907		return (SET_ERROR(EINVAL));
4908
4909	/*
4910	 * Make sure the pool is formatted with a version that supports this
4911	 * device type.
4912	 */
4913	if (spa_version(spa) < version)
4914		return (SET_ERROR(ENOTSUP));
4915
4916	/*
4917	 * Set the pending device list so we correctly handle device in-use
4918	 * checking.
4919	 */
4920	sav->sav_pending = dev;
4921	sav->sav_npending = ndev;
4922
4923	for (i = 0; i < ndev; i++) {
4924		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
4925		    mode)) != 0)
4926			goto out;
4927
4928		if (!vd->vdev_ops->vdev_op_leaf) {
4929			vdev_free(vd);
4930			error = SET_ERROR(EINVAL);
4931			goto out;
4932		}
4933
4934		vd->vdev_top = vd;
4935
4936		if ((error = vdev_open(vd)) == 0 &&
4937		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
4938			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
4939			    vd->vdev_guid) == 0);
4940		}
4941
4942		vdev_free(vd);
4943
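		/*
		 * For spares and l2cache devices a failure to open or label
		 * the device is tolerated (see the block comment above this
		 * function); for other modes it aborts the validation.
		 */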
4944		if (error &&
4945		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
4946			goto out;
4947		else
4948			error = 0;
4949	}
4950
4951out:
4952	sav->sav_pending = NULL;
4953	sav->sav_npending = 0;
4954	return (error);
4955}
4956
4957static int
4958spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
4959{
4960	int error;
4961
4962	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4963
4964	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4965	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
4966	    VDEV_LABEL_SPARE)) != 0) {
4967		return (error);
4968	}
4969
4970	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4971	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
4972	    VDEV_LABEL_L2CACHE));
4973}
4974
4975static void
4976spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
4977    const char *config)
4978{
4979	int i;
4980
4981	if (sav->sav_config != NULL) {
4982		nvlist_t **olddevs;
4983		uint_t oldndevs;
4984		nvlist_t **newdevs;
4985
4986		/*
4987		 * Generate new dev list by concatenating with the
4988		 * current dev list.
4989		 */
4990		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
4991		    &olddevs, &oldndevs) == 0);
4992
4993		newdevs = kmem_alloc(sizeof (void *) *
4994		    (ndevs + oldndevs), KM_SLEEP);
4995		for (i = 0; i < oldndevs; i++)
4996			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
4997			    KM_SLEEP) == 0);
4998		for (i = 0; i < ndevs; i++)
4999			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
5000			    KM_SLEEP) == 0);
5001
5002		VERIFY(nvlist_remove(sav->sav_config, config,
5003		    DATA_TYPE_NVLIST_ARRAY) == 0);
5004
5005		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
5006		    config, newdevs, ndevs + oldndevs) == 0);
5007		for (i = 0; i < oldndevs + ndevs; i++)
5008			nvlist_free(newdevs[i]);
5009		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
5010	} else {
5011		/*
5012		 * Generate a new dev list.
5013		 */
5014		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
5015		    KM_SLEEP) == 0);
5016		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
5017		    devs, ndevs) == 0);
5018	}
5019}
5020
5021/*
5022 * Stop and drop level 2 ARC devices
5023 */
5024void
5025spa_l2cache_drop(spa_t *spa)
5026{
5027	vdev_t *vd;
5028	int i;
5029	spa_aux_vdev_t *sav = &spa->spa_l2cache;
5030
5031	for (i = 0; i < sav->sav_count; i++) {
5032		uint64_t pool;
5033
5034		vd = sav->sav_vdevs[i];
5035		ASSERT(vd != NULL);
5036
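		/*
		 * Only remove the device from the L2ARC if it is still
		 * registered there and currently associated with a pool.
		 */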
5037		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
5038		    pool != 0ULL && l2arc_vdev_present(vd))
5039			l2arc_remove_vdev(vd);
5040	}
5041}
5042
5043/*
5044 * Verify encryption parameters for spa creation. If we are encrypting, we must
5045 * have the encryption feature flag enabled.
5046 */
5047static int
5048spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
5049    boolean_t has_encryption)
5050{
5051	if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
5052	    dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
5053	    !has_encryption)
5054		return (SET_ERROR(ENOTSUP));
5055
5056	return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
5057}
5058
5059/*
5060 * Pool Creation
5061 */
5062int
5063spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
5064    nvlist_t *zplprops, dsl_crypto_params_t *dcp)
5065{
5066	spa_t *spa;
5067	char *altroot = NULL;
5068	vdev_t *rvd;
5069	dsl_pool_t *dp;
5070	dmu_tx_t *tx;
5071	int error = 0;
5072	uint64_t txg = TXG_INITIAL;
5073	nvlist_t **spares, **l2cache;
5074	uint_t nspares, nl2cache;
5075	uint64_t version, obj;
5076	boolean_t has_features;
5077	char *poolname;
5078	nvlist_t *nvl;
5079	boolean_t has_encryption;
5080	spa_feature_t feat;
5081	char *feat_name;
5082
5083	if (props == NULL ||
5084	    nvlist_lookup_string(props,
5085	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
5086		poolname = (char *)pool;
5087
5088	/*
5089	 * If this pool already exists, return failure.
5090	 */
5091	mutex_enter(&spa_namespace_lock);
5092	if (spa_lookup(poolname) != NULL) {
5093		mutex_exit(&spa_namespace_lock);
5094		return (SET_ERROR(EEXIST));
5095	}
5096
5097	/*
5098	 * Allocate a new spa_t structure.
5099	 */
5100	nvl = fnvlist_alloc();
5101	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
5102	(void) nvlist_lookup_string(props,
5103	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5104	spa = spa_add(poolname, nvl, altroot);
5105	fnvlist_free(nvl);
5106	spa_activate(spa, spa_mode_global);
5107
5108	if (props && (error = spa_prop_validate(spa, props))) {
5109		spa_deactivate(spa);
5110		spa_remove(spa);
5111		mutex_exit(&spa_namespace_lock);
5112		return (error);
5113	}
5114
5115	/*
5116	 * Temporary pool names should never be written to disk.
5117	 */
5118	if (poolname != pool)
5119		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
5120
5121	has_features = B_FALSE;
5122	has_encryption = B_FALSE;
5123	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
5124	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
5125		if (zpool_prop_feature(nvpair_name(elem))) {
5126			has_features = B_TRUE;
5127			feat_name = strchr(nvpair_name(elem), '@') + 1;
5128			VERIFY0(zfeature_lookup_name(feat_name, &feat));
5129			if (feat == SPA_FEATURE_ENCRYPTION)
5130				has_encryption = B_TRUE;
5131		}
5132	}
5133
5134	/* verify encryption params, if they were provided */
5135	if (dcp != NULL) {
5136		error = spa_create_check_encryption_params(dcp, has_encryption);
5137		if (error != 0) {
5138			spa_deactivate(spa);
5139			spa_remove(spa);
5140			mutex_exit(&spa_namespace_lock);
5141			return (error);
5142		}
5143	}
5144
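	/*
	 * If any feature properties were requested, or no explicit version
	 * was supplied, create the pool at the current SPA_VERSION.
	 */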
5145	if (has_features || nvlist_lookup_uint64(props,
5146	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
5147		version = SPA_VERSION;
5148	}
5149	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
5150
5151	spa->spa_first_txg = txg;
5152	spa->spa_uberblock.ub_txg = txg - 1;
5153	spa->spa_uberblock.ub_version = version;
5154	spa->spa_ubsync = spa->spa_uberblock;
5155	spa->spa_load_state = SPA_LOAD_CREATE;
5156	spa->spa_removing_phys.sr_state = DSS_NONE;
5157	spa->spa_removing_phys.sr_removing_vdev = -1;
5158	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
5159	spa->spa_indirect_vdevs_loaded = B_TRUE;
5160
5161	/*
5162	 * Create "The Godfather" zio to hold all async IOs
5163	 */
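	/*
	 * One root zio is kept per CPU, presumably so that async I/Os issued
	 * on different CPUs need not all share a single parent zio.
	 */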
5164	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
5165	    KM_SLEEP);
5166	for (int i = 0; i < max_ncpus; i++) {
5167		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
5168		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
5169		    ZIO_FLAG_GODFATHER);
5170	}
5171
5172	/*
5173	 * Create the root vdev.
5174	 */
5175	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5176
5177	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
5178
5179	ASSERT(error != 0 || rvd != NULL);
5180	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
5181
5182	if (error == 0 && !zfs_allocatable_devs(nvroot))
5183		error = SET_ERROR(EINVAL);
5184
5185	if (error == 0 &&
5186	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
5187	    (error = spa_validate_aux(spa, nvroot, txg,
5188	    VDEV_ALLOC_ADD)) == 0) {
5189		/*
5190		 * Instantiate the metaslab groups (this will dirty the vdevs);
5191		 * we can no longer error exit past this point.
5192		 */
5193		for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
5194			vdev_t *vd = rvd->vdev_child[c];
5195
5196			vdev_metaslab_set_size(vd);
5197			vdev_expand(vd, txg);
5198		}
5199	}
5200
5201	spa_config_exit(spa, SCL_ALL, FTAG);
5202
5203	if (error != 0) {
5204		spa_unload(spa);
5205		spa_deactivate(spa);
5206		spa_remove(spa);
5207		mutex_exit(&spa_namespace_lock);
5208		return (error);
5209	}
5210
5211	/*
5212	 * Get the list of spares, if specified.
5213	 */
5214	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5215	    &spares, &nspares) == 0) {
5216		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
5217		    KM_SLEEP) == 0);
5218		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
5219		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
5220		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5221		spa_load_spares(spa);
5222		spa_config_exit(spa, SCL_ALL, FTAG);
5223		spa->spa_spares.sav_sync = B_TRUE;
5224	}
5225
5226	/*
5227	 * Get the list of level 2 cache devices, if specified.
5228	 */
5229	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5230	    &l2cache, &nl2cache) == 0) {
5231		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
5232		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5233		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
5234		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
5235		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5236		spa_load_l2cache(spa);
5237		spa_config_exit(spa, SCL_ALL, FTAG);
5238		spa->spa_l2cache.sav_sync = B_TRUE;
5239	}
5240
5241	spa->spa_is_initializing = B_TRUE;
5242	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
5243	spa->spa_is_initializing = B_FALSE;
5244
5245	/*
5246	 * Create DDTs (dedup tables).
5247	 */
5248	ddt_create(spa);
5249
5250	spa_update_dspace(spa);
5251
5252	tx = dmu_tx_create_assigned(dp, txg);
5253
5254	/*
5255	 * Create the pool config object.
5256	 */
5257	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
5258	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
5259	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
5260
5261	if (zap_add(spa->spa_meta_objset,
5262	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
5263	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
5264		cmn_err(CE_PANIC, "failed to add pool config");
5265	}
5266
5267	if (zap_add(spa->spa_meta_objset,
5268	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
5269	    sizeof (uint64_t), 1, &version, tx) != 0) {
5270		cmn_err(CE_PANIC, "failed to add pool version");
5271	}
5272
5273	/* Newly created pools with the right version are always deflated. */
5274	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
5275		spa->spa_deflate = TRUE;
5276		if (zap_add(spa->spa_meta_objset,
5277		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
5278		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
5279			cmn_err(CE_PANIC, "failed to add deflate");
5280		}
5281	}
5282
5283	/*
5284	 * Create the deferred-free bpobj.  Turn off compression
5285	 * because sync-to-convergence takes longer if the blocksize
5286	 * keeps changing.
5287	 */
5288	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
5289	dmu_object_set_compress(spa->spa_meta_objset, obj,
5290	    ZIO_COMPRESS_OFF, tx);
5291	if (zap_add(spa->spa_meta_objset,
5292	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
5293	    sizeof (uint64_t), 1, &obj, tx) != 0) {
5294		cmn_err(CE_PANIC, "failed to add bpobj");
5295	}
5296	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
5297	    spa->spa_meta_objset, obj));
5298
5299	/*
5300	 * Create the pool's history object.
5301	 */
5302	if (version >= SPA_VERSION_ZPOOL_HISTORY)
5303		spa_history_create_obj(spa, tx);
5304
5305	/*
5306	 * Generate some random noise for salted checksums to operate on.
5307	 */
5308	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
5309	    sizeof (spa->spa_cksum_salt.zcs_bytes));
5310
5311	/*
5312	 * Set pool properties.
5313	 */
5314	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
5315	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
5316	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
5317	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
5318	spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
5319	spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
5320
5321	if (props != NULL) {
5322		spa_configfile_set(spa, props, B_FALSE);
5323		spa_sync_props(props, tx);
5324	}
5325
5326	dmu_tx_commit(tx);
5327
5328	spa->spa_sync_on = B_TRUE;
5329	txg_sync_start(spa->spa_dsl_pool);
5330	mmp_thread_start(spa);
5331
5332	/*
5333	 * We explicitly wait for the first transaction to complete so that our
5334	 * bean counters are appropriately updated.
5335	 */
5336	txg_wait_synced(spa->spa_dsl_pool, txg);
5337
5338	spa_spawn_aux_threads(spa);
5339
5340	spa_write_cachefile(spa, B_FALSE, B_TRUE);
5341	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
5342
5343	spa_history_log_version(spa, "create");
5344
5345	/*
5346	 * Don't count references from objsets that are already closed
5347	 * and are making their way through the eviction process.
5348	 */
5349	spa_evicting_os_wait(spa);
5350	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
5351	spa->spa_load_state = SPA_LOAD_NONE;
5352
5353	mutex_exit(&spa_namespace_lock);
5354
5355	return (0);
5356}
5357
5358#ifdef _KERNEL
5359/*
5360 * Get the root pool information from the root disk, then import the root pool
5361 * at system boot time.
5362 */
5363static nvlist_t *
5364spa_generate_rootconf(const char *devpath, const char *devid, uint64_t *guid,
5365    uint64_t pool_guid)
5366{
5367	nvlist_t *config;
5368	nvlist_t *nvtop, *nvroot;
5369	uint64_t pgid;
5370
5371	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
5372		return (NULL);
5373
5374	/*
5375	 * Add this top-level vdev to the child array.
5376	 */
5377	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5378	    &nvtop) == 0);
5379	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5380	    &pgid) == 0);
5381	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
5382
5383	if (pool_guid != 0 && pool_guid != pgid) {
5384		/*
5385		 * The boot loader provided a pool GUID, but it does not match
5386		 * the one we found in the label.  Return failure so that we
5387		 * can fall back to the full device scan.
5388		 */
5389		zfs_dbgmsg("spa_generate_rootconf: loader pool guid %llu != "
5390		    "label pool guid %llu", (u_longlong_t)pool_guid,
5391		    (u_longlong_t)pgid);
5392		nvlist_free(config);
5393		return (NULL);
5394	}
5395
5396	/*
5397	 * Put this pool's top-level vdevs into a root vdev.
5398	 */
5399	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5400	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
5401	    VDEV_TYPE_ROOT) == 0);
5402	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
5403	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
5404	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
5405	    &nvtop, 1) == 0);
5406
5407	/*
5408	 * Replace the existing vdev_tree with the new root vdev in
5409	 * this pool's configuration (remove the old, add the new).
5410	 */
5411	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
5412	nvlist_free(nvroot);
5413	return (config);
5414}
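
/*
 * Rough sketch of the config nvlist returned above (values illustrative):
 *
 *	ZPOOL_CONFIG_VDEV_TREE:
 *		ZPOOL_CONFIG_TYPE	= VDEV_TYPE_ROOT
 *		ZPOOL_CONFIG_ID		= 0
 *		ZPOOL_CONFIG_GUID	= <pool guid from the label>
 *		ZPOOL_CONFIG_CHILDREN	= [ <top-level vdev from the label> ]
 */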
5415
5416/*
5417 * Walk the vdev tree and see if we can find a device with "better"
5418 * configuration. A configuration is "better" if the label on that
5419 * device has a more recent txg.
5420 */
5421static void
5422spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
5423{
5424	for (int c = 0; c < vd->vdev_children; c++)
5425		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
5426
5427	if (vd->vdev_ops->vdev_op_leaf) {
5428		nvlist_t *label;
5429		uint64_t label_txg;
5430
5431		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
5432		    &label) != 0)
5433			return;
5434
5435		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
5436		    &label_txg) == 0);
5437
5438		/*
5439		 * Do we have a better boot device?
5440		 */
5441		if (label_txg > *txg) {
5442			*txg = label_txg;
5443			*avd = vd;
5444		}
5445		nvlist_free(label);
5446	}
5447}
5448
5449/*
5450 * Import a root pool.
5451 *
5452 * For x86, devpath_list will consist of the devid and/or physpath name
5453 * of the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
5454 * The GRUB "findroot" command will return the vdev we should boot from.
5455 *
5456 * For SPARC, devpath_list consists of the physpath name of the booting
5457 * device, regardless of whether the root pool is a single-device pool or
5458 * a mirrored pool, e.g.
5459 *	"/pci@1f,0/ide@d/disk@0,0:a"
5460 */
5461int
5462spa_import_rootpool(char *devpath, char *devid, uint64_t pool_guid,
5463    uint64_t vdev_guid)
5464{
5465	spa_t *spa;
5466	vdev_t *rvd, *bvd, *avd = NULL;
5467	nvlist_t *config, *nvtop;
5468	uint64_t guid, txg;
5469	char *pname;
5470	int error;
5471	const char *altdevpath = NULL;
5472
5473	/*
5474	 * Read the label from the boot device and generate a configuration.
5475	 */
5476	config = spa_generate_rootconf(devpath, devid, &guid, pool_guid);
5477#if defined(_OBP) && defined(_KERNEL)
5478	if (config == NULL) {
5479		if (strstr(devpath, "/iscsi/ssd") != NULL) {
5480			/* iscsi boot */
5481			get_iscsi_bootpath_phy(devpath);
5482			config = spa_generate_rootconf(devpath, devid, &guid,
5483			    pool_guid);
5484		}
5485	}
5486#endif
5487
5488	/*
5489	 * We were unable to import the pool using the /devices path or devid
5490	 * provided by the boot loader.  This may be the case if the boot
5491	 * device has been connected to a different location in the system, or
5492	 * if a new boot environment has changed the driver used to access the
5493	 * boot device.
5494	 *
5495	 * Attempt an exhaustive scan of all visible block devices to see if we
5496	 * can locate an alternative /devices path with a label that matches
5497	 * the expected pool and vdev GUID.
5498	 */
5499	if (config == NULL && (altdevpath =
5500	    vdev_disk_preroot_lookup(pool_guid, vdev_guid)) != NULL) {
5501		cmn_err(CE_NOTE, "Original /devices path (%s) not available; "
5502		    "ZFS is trying an alternate path (%s)", devpath,
5503		    altdevpath);
5504		config = spa_generate_rootconf(altdevpath, NULL, &guid,
5505		    pool_guid);
5506	}
5507
5508	if (config == NULL) {
5509		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
5510		    devpath);
5511		return (SET_ERROR(EIO));
5512	}
5513
5514	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
5515	    &pname) == 0);
5516	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
5517
5518	mutex_enter(&spa_namespace_lock);
5519	if ((spa = spa_lookup(pname)) != NULL) {
5520		/*
5521		 * Remove the existing root pool from the namespace so that we
5522		 * can replace it with the correct config we just read in.
5523		 */
5524		spa_remove(spa);
5525	}
5526
5527	spa = spa_add(pname, config, NULL);
5528	spa->spa_is_root = B_TRUE;
5529	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
5530	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
5531	    &spa->spa_ubsync.ub_version) != 0)
5532		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
5533
5534	/*
5535	 * Build up a vdev tree based on the boot device's label config.
5536	 */
5537	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5538	    &nvtop) == 0);
5539	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5540	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
5541	    VDEV_ALLOC_ROOTPOOL);
5542	spa_config_exit(spa, SCL_ALL, FTAG);
5543	if (error) {
5544		mutex_exit(&spa_namespace_lock);
5545		nvlist_free(config);
5546		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
5547		    pname);
5548		return (error);
5549	}
5550
5551	/*
5552	 * Get the boot vdev.
5553	 */
5554	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
5555		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
5556		    (u_longlong_t)guid);
5557		error = SET_ERROR(ENOENT);
5558		goto out;
5559	}
5560
5561	/*
5562	 * Determine if there is a better boot device.
5563	 */
5564	avd = bvd;
5565	spa_alt_rootvdev(rvd, &avd, &txg);
5566	if (avd != bvd) {
5567		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
5568		    "try booting from '%s'", avd->vdev_path);
5569		error = SET_ERROR(EINVAL);
5570		goto out;
5571	}
5572
5573	/*
5574	 * If the boot device is part of a spare vdev then ensure that
5575	 * we're booting off the active spare.
5576	 */
5577	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
5578	    !bvd->vdev_isspare) {
5579		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
5580		    "try booting from '%s'",
5581		    bvd->vdev_parent->
5582		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
5583		error = SET_ERROR(EINVAL);
5584		goto out;
5585	}
5586
5587	error = 0;
5588out:
5589	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5590	vdev_free(rvd);
5591	spa_config_exit(spa, SCL_ALL, FTAG);
5592	mutex_exit(&spa_namespace_lock);
5593
5594	nvlist_free(config);
5595	return (error);
5596}
5597
5598#endif
5599
5600/*
5601 * Import a non-root pool into the system.
5602 */
5603int
5604spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
5605{
5606	spa_t *spa;
5607	char *altroot = NULL;
5608	spa_load_state_t state = SPA_LOAD_IMPORT;
5609	zpool_load_policy_t policy;
5610	uint64_t mode = spa_mode_global;
5611	uint64_t readonly = B_FALSE;
5612	int error;
5613	nvlist_t *nvroot;
5614	nvlist_t **spares, **l2cache;
5615	uint_t nspares, nl2cache;
5616
5617	/*
5618	 * If a pool with this name exists, return failure.
5619	 */
5620	mutex_enter(&spa_namespace_lock);
5621	if (spa_lookup(pool) != NULL) {
5622		mutex_exit(&spa_namespace_lock);
5623		return (SET_ERROR(EEXIST));
5624	}
5625
5626	/*
5627	 * Create and initialize the spa structure.
5628	 */
5629	(void) nvlist_lookup_string(props,
5630	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5631	(void) nvlist_lookup_uint64(props,
5632	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
5633	if (readonly)
5634		mode = FREAD;
5635	spa = spa_add(pool, config, altroot);
5636	spa->spa_import_flags = flags;
5637
5638	/*
5639	 * Verbatim import - Take a pool and insert it into the namespace
5640	 * as if it had been loaded at boot.
5641	 */
5642	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
5643		if (props != NULL)
5644			spa_configfile_set(spa, props, B_FALSE);
5645
5646		spa_write_cachefile(spa, B_FALSE, B_TRUE);
5647		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5648		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
5649		mutex_exit(&spa_namespace_lock);
5650		return (0);
5651	}
5652
5653	spa_activate(spa, mode);
5654
5655	/*
5656	 * Don't start async tasks until we know everything is healthy.
5657	 */
5658	spa_async_suspend(spa);
5659
5660	zpool_get_load_policy(config, &policy);
5661	if (policy.zlp_rewind & ZPOOL_DO_REWIND)
5662		state = SPA_LOAD_RECOVER;
5663
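	/*
	 * The config we were handed typically comes from userland, assembled
	 * via a device scan and spa_tryimport(), hence the TRYIMPORT config
	 * source tag below.
	 */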
5664	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
5665
5666	if (state != SPA_LOAD_RECOVER) {
5667		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
5668		zfs_dbgmsg("spa_import: importing %s", pool);
5669	} else {
5670		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
5671		    "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
5672	}
5673	error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
5674
5675	/*
5676	 * Propagate anything learned while loading the pool and pass it
5677	 * back to the caller (e.g. rewind info, missing devices, etc.).
5678	 */
5679	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5680	    spa->spa_load_info) == 0);
5681
5682	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5683	/*
5684	 * Toss any existing sparelist, as it is no longer valid and
5685	 * conflicts with spa_has_spare().
5686	 */
5687	if (spa->spa_spares.sav_config) {
5688		nvlist_free(spa->spa_spares.sav_config);
5689		spa->spa_spares.sav_config = NULL;
5690		spa_load_spares(spa);
5691	}
5692	if (spa->spa_l2cache.sav_config) {
5693		nvlist_free(spa->spa_l2cache.sav_config);
5694		spa->spa_l2cache.sav_config = NULL;
5695		spa_load_l2cache(spa);
5696	}
5697
5698	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5699	    &nvroot) == 0);
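	/*
	 * Validate any spare and l2cache entries named in the supplied vdev
	 * tree before the override pass below adopts them into the pool.
	 */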
5700	if (error == 0)
5701		error = spa_validate_aux(spa, nvroot, -1ULL,
5702		    VDEV_ALLOC_SPARE);
5703	if (error == 0)
5704		error = spa_validate_aux(spa, nvroot, -1ULL,
5705		    VDEV_ALLOC_L2CACHE);
5706	spa_config_exit(spa, SCL_ALL, FTAG);
5707
5708	if (props != NULL)
5709		spa_configfile_set(spa, props, B_FALSE);
5710
5711	if (error != 0 || (props && spa_writeable(spa) &&
5712	    (error = spa_prop_set(spa, props)))) {
5713		spa_unload(spa);
5714		spa_deactivate(spa);
5715		spa_remove(spa);
5716		mutex_exit(&spa_namespace_lock);
5717		return (error);
5718	}
5719
5720	spa_async_resume(spa);
5721
5722	/*
5723	 * Override any spares and level 2 cache devices as specified by
5724	 * the user, as these may have correct device names/devids, etc.
5725	 */
5726	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5727	    &spares, &nspares) == 0) {
5728		if (spa->spa_spares.sav_config)
5729			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
5730			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
5731		else
5732			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
5733			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5734		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
5735		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
5736		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5737		spa_load_spares(spa);
5738		spa_config_exit(spa, SCL_ALL, FTAG);
5739		spa->spa_spares.sav_sync = B_TRUE;
5740	}
5741	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5742	    &l2cache, &nl2cache) == 0) {
5743		if (spa->spa_l2cache.sav_config)
5744			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
5745			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
5746		else
5747			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
5748			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5749		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
5750		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
5751		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5752		spa_load_l2cache(spa);
5753		spa_config_exit(spa, SCL_ALL, FTAG);
5754		spa->spa_l2cache.sav_sync = B_TRUE;
5755	}
5756
5757	/*
5758	 * Check for any removed devices.
5759	 */
5760	if (spa->spa_autoreplace) {
5761		spa_aux_check_removed(&spa->spa_spares);
5762		spa_aux_check_removed(&spa->spa_l2cache);
5763	}
5764
5765	if (spa_writeable(spa)) {
5766		/*
5767		 * Update the config cache to include the newly-imported pool.
5768		 */
5769		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5770	}
5771
5772	/*
5773	 * It's possible that the pool was expanded while it was exported.
5774	 * We kick off an async task to handle this for us.
5775	 */
5776	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
5777
5778	spa_history_log_version(spa, "import");
5779
5780	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5781
5782	mutex_exit(&spa_namespace_lock);
5783
5784	return (0);
5785}
5786
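/*
 * Probe a pool configuration without actually importing the pool: the config
 * is loaded read-only under the temporary TRYIMPORT_NAME, a full config
 * (including load info, bootfs, spares, and l2cache devices) is generated for
 * the caller, and the transient spa_t is then torn down again.  Returns NULL
 * if the supplied config could not even be parsed.
 */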
5787nvlist_t *
5788spa_tryimport(nvlist_t *tryconfig)
5789{
5790	nvlist_t *config = NULL;
5791	char *poolname, *cachefile;
5792	spa_t *spa;
5793	uint64_t state;
5794	int error;
5795	zpool_load_policy_t policy;
5796
5797	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
5798		return (NULL);
5799
5800	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
5801		return (NULL);
5802
5803	/*
5804	 * Create and initialize the spa structure.
5805	 */
5806	mutex_enter(&spa_namespace_lock);
5807	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
5808	spa_activate(spa, FREAD);
5809
5810	/*
5811	 * Rewind pool if a max txg was provided.
5812	 */
5813	zpool_get_load_policy(spa->spa_config, &policy);
5814	if (policy.zlp_txg != UINT64_MAX) {
5815		spa->spa_load_max_txg = policy.zlp_txg;
5816		spa->spa_extreme_rewind = B_TRUE;
5817		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
5818		    poolname, (longlong_t)policy.zlp_txg);
5819	} else {
5820		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
5821	}
5822
5823	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
5824	    == 0) {
5825		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
5826		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
5827	} else {
5828		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
5829	}
5830
5831	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
5832
5833	/*
5834	 * If 'tryconfig' was at least parsable, return the current config.
5835	 */
5836	if (spa->spa_root_vdev != NULL) {
5837		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5838		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
5839		    poolname) == 0);
5840		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5841		    state) == 0);
5842		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
5843		    spa->spa_uberblock.ub_timestamp) == 0);
5844		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5845		    spa->spa_load_info) == 0);
5846
5847		/*
5848		 * If the bootfs property exists on this pool then we
5849		 * copy it out so that external consumers can tell which
5850		 * pools are bootable.
5851		 */
5852		if ((!error || error == EEXIST) && spa->spa_bootfs) {
5853			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5854
5855			/*
5856			 * We have to play games with the name since the
5857			 * pool was opened as TRYIMPORT_NAME.
5858			 */
5859			if (dsl_dsobj_to_dsname(spa_name(spa),
5860			    spa->spa_bootfs, tmpname) == 0) {
5861				char *cp;
5862				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5863
5864				cp = strchr(tmpname, '/');
5865				if (cp == NULL) {
5866					(void) strlcpy(dsname, tmpname,
5867					    MAXPATHLEN);
5868				} else {
5869					(void) snprintf(dsname, MAXPATHLEN,
5870					    "%s/%s", poolname, ++cp);
5871				}
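				/*
				 * The net effect: a bootfs that resolved
				 * under the temporary TRYIMPORT_NAME is
				 * reported under the caller's real pool
				 * name, e.g. "<poolname>/ROOT/default"
				 * (dataset name illustrative).
				 */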
5872				VERIFY(nvlist_add_string(config,
5873				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
5874				kmem_free(dsname, MAXPATHLEN);
5875			}
5876			kmem_free(tmpname, MAXPATHLEN);
5877		}
5878
5879		/*
5880		 * Add the list of hot spares and level 2 cache devices.
5881		 */
5882		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5883		spa_add_spares(spa, config);
5884		spa_add_l2cache(spa, config);
5885		spa_config_exit(spa, SCL_CONFIG, FTAG);
5886	}
5887
5888	spa_unload(spa);
5889	spa_deactivate(spa);
5890	spa_remove(spa);
5891	mutex_exit(&spa_namespace_lock);
5892
5893	return (config);
5894}
5895
5896/*
5897 * Pool export/destroy
5898 *
5899 * The act of destroying or exporting a pool is very simple.  We make sure
5900 * there is no more pending I/O and that any references to the pool are gone.
5901 * Then we update the pool state and sync all the labels to disk, removing
5902 * the configuration from the cache afterwards.  If the 'hardforce' flag is
5903 * set, then we don't sync the labels or remove the configuration cache.
5904 */
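
/*
 * spa_export_common() is the single worker behind the three entry points
 * below:
 *
 *	spa_destroy()	-> POOL_STATE_DESTROYED
 *	spa_export()	-> POOL_STATE_EXPORTED
 *	spa_reset()	-> POOL_STATE_UNINITIALIZED (unload only; the spa_t
 *			   stays in the namespace)
 */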
5905static int
5906spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
5907    boolean_t force, boolean_t hardforce)
5908{
5909	spa_t *spa;
5910
5911	if (oldconfig)
5912		*oldconfig = NULL;
5913
5914	if (!(spa_mode_global & FWRITE))
5915		return (SET_ERROR(EROFS));
5916
5917	mutex_enter(&spa_namespace_lock);
5918	if ((spa = spa_lookup(pool)) == NULL) {
5919		mutex_exit(&spa_namespace_lock);
5920		return (SET_ERROR(ENOENT));
5921	}
5922
5923	/*
5924	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
5925	 * reacquire the namespace lock, and see if we can export.
5926	 */
5927	spa_open_ref(spa, FTAG);
5928	mutex_exit(&spa_namespace_lock);
5929	spa_async_suspend(spa);
5930	mutex_enter(&spa_namespace_lock);
5931	spa_close(spa, FTAG);
5932
5933	/*
5934	 * The pool will be in core if it's openable,
5935	 * in which case we can modify its state.
5936	 */
5937	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
5938
5939		/*
5940		 * Objsets may be open only because they're dirty, so we
5941		 * have to force the pool to sync before checking spa_refcnt.
5942		 */
5943		txg_wait_synced(spa->spa_dsl_pool, 0);
5944		spa_evicting_os_wait(spa);
5945
5946		/*
5947		 * A pool cannot be exported or destroyed if there are active
5948		 * references.  If we are resetting a pool, allow references by
5949		 * fault injection handlers.
5950		 */
5951		if (!spa_refcount_zero(spa) ||
5952		    (spa->spa_inject_ref != 0 &&
5953		    new_state != POOL_STATE_UNINITIALIZED)) {
5954			spa_async_resume(spa);
5955			mutex_exit(&spa_namespace_lock);
5956			return (SET_ERROR(EBUSY));
5957		}
5958
5959		/*
5960		 * A pool cannot be exported if it has an active shared spare.
5961		 * This is to prevent other pools from stealing the active
5962		 * spare from an exported pool.  The user may still force the
5963		 * export if desired.
5964		 */
5965		if (!force && new_state == POOL_STATE_EXPORTED &&
5966		    spa_has_active_shared_spare(spa)) {
5967			spa_async_resume(spa);
5968			mutex_exit(&spa_namespace_lock);
5969			return (SET_ERROR(EXDEV));
5970		}
5971
5972		/*
5973		 * We're about to export or destroy this pool. Make sure
5974		 * we stop all initialization and trim activity here before
5975		 * we set the spa_final_txg. This will ensure that all
5976		 * dirty data resulting from the initialization is
5977		 * committed to disk before we unload the pool.
5978		 */
5979		if (spa->spa_root_vdev != NULL) {
5980			vdev_t *rvd = spa->spa_root_vdev;
5981			vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
5982			vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
5983			vdev_autotrim_stop_all(spa);
5984		}
5985
5986		/*
5987		 * We want this to be reflected on every label,
5988		 * so mark them all dirty.  spa_unload() will do the
5989		 * final sync that pushes these changes out.
5990		 */
5991		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
5992			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5993			spa->spa_state = new_state;
5994			spa->spa_final_txg = spa_last_synced_txg(spa) +
5995			    TXG_DEFER_SIZE + 1;
5996			vdev_config_dirty(spa->spa_root_vdev);
5997			spa_config_exit(spa, SCL_ALL, FTAG);
5998		}
5999	}
6000
6001	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
6002
6003	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6004		spa_unload(spa);
6005		spa_deactivate(spa);
6006	}
6007
6008	if (oldconfig && spa->spa_config)
6009		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
6010
6011	if (new_state != POOL_STATE_UNINITIALIZED) {
6012		if (!hardforce)
6013			spa_write_cachefile(spa, B_TRUE, B_TRUE);
6014		spa_remove(spa);
6015	}
6016	mutex_exit(&spa_namespace_lock);
6017
6018	return (0);
6019}
6020
6021/*
6022 * Destroy a storage pool.
6023 */
6024int
6025spa_destroy(char *pool)
6026{
6027	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
6028	    B_FALSE, B_FALSE));
6029}
6030
6031/*
6032 * Export a storage pool.
6033 */
6034int
6035spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
6036    boolean_t hardforce)
6037{
6038	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
6039	    force, hardforce));
6040}
6041
6042/*
6043 * Similar to spa_export(), this unloads the spa_t without actually removing it
6044 * from the namespace in any way.
6045 */
6046int
6047spa_reset(char *pool)
6048{
6049	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
6050	    B_FALSE, B_FALSE));
6051}
6052
6053/*
6054 * ==========================================================================
6055 * Device manipulation
6056 * ==========================================================================
6057 */
6058
6059/*
6060 * Add a device to a storage pool.
6061 */
6062int
6063spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
6064{
6065	uint64_t txg;
6066	int error;
6067	vdev_t *rvd = spa->spa_root_vdev;
6068	vdev_t *vd, *tvd;
6069	nvlist_t **spares, **l2cache;
6070	uint_t nspares, nl2cache;
6071
6072	ASSERT(spa_writeable(spa));
6073
6074	txg = spa_vdev_enter(spa);
6075
6076	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
6077	    VDEV_ALLOC_ADD)) != 0)
6078		return (spa_vdev_exit(spa, NULL, txg, error));
6079
6080	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
6081
6082	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
6083	    &nspares) != 0)
6084		nspares = 0;
6085
6086	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
6087	    &nl2cache) != 0)
6088		nl2cache = 0;
6089
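	/*
	 * The request must supply at least one new top-level vdev, spare,
	 * or l2cache device.
	 */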
6090	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
6091		return (spa_vdev_exit(spa, vd, txg, EINVAL));
6092
6093	if (vd->vdev_children != 0 &&
6094	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
6095		return (spa_vdev_exit(spa, vd, txg, error));
6096
6097	/*
6098	 * We must validate the spares and l2cache devices after checking the
6099	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
6100	 */
6101	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
6102		return (spa_vdev_exit(spa, vd, txg, error));
6103
6104	/*
6105	 * If we are in the middle of a device removal, we can only add
6106	 * devices that match the existing devices in the pool (the ashift
6107	 * check below).  If we are in the middle of a removal, or have any
6108	 * indirect vdevs, we cannot add raidz top-level vdevs.
6109	 */
6110	if (spa->spa_vdev_removal != NULL ||
6111	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
6112		for (int c = 0; c < vd->vdev_children; c++) {
6113			tvd = vd->vdev_child[c];
6114			if (spa->spa_vdev_removal != NULL &&
6115			    tvd->vdev_ashift != spa->spa_max_ashift) {
6116				return (spa_vdev_exit(spa, vd, txg, EINVAL));
6117			}
6118			/* Fail if top level vdev is raidz */
6119			if (tvd->vdev_ops == &vdev_raidz_ops) {
6120				return (spa_vdev_exit(spa, vd, txg, EINVAL));
6121			}
6122			/*
6123			 * A top-level mirror may only contain
6124			 * leaf vdevs as children.
6125			 */
6126			if (tvd->vdev_ops == &vdev_mirror_ops) {
6127				for (uint64_t cid = 0;
6128				    cid < tvd->vdev_children; cid++) {
6129					vdev_t *cvd = tvd->vdev_child[cid];
6130					if (!cvd->vdev_ops->vdev_op_leaf) {
6131						return (spa_vdev_exit(spa, vd,
6132						    txg, EINVAL));
6133					}
6134				}
6135			}
6136		}
6137	}
6138
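	/*
	 * Move each new top-level vdev from the temporary tree produced by
	 * spa_config_parse() onto the pool's real root vdev.
	 */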
6139	for (int c = 0; c < vd->vdev_children; c++) {
6140		tvd = vd->vdev_child[c];
6141		vdev_remove_child(vd, tvd);
6142		tvd->vdev_id = rvd->vdev_children;
6143		vdev_add_child(rvd, tvd);
6144		vdev_config_dirty(tvd);
6145	}
6146
6147	if (nspares != 0) {
6148		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
6149		    ZPOOL_CONFIG_SPARES);
6150		spa_load_spares(spa);
6151		spa->spa_spares.sav_sync = B_TRUE;
6152	}
6153
6154	if (nl2cache != 0) {
6155		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
6156		    ZPOOL_CONFIG_L2CACHE);
6157		spa_load_l2cache(spa);
6158		spa->spa_l2cache.sav_sync = B_TRUE;
6159	}
6160
6161	/*
6162	 * We have to be careful when adding new vdevs to an existing pool.
6163	 * If other threads start allocating from these vdevs before we
6164	 * sync the config cache, and we lose power, then upon reboot we may
6165	 * fail to open the pool because there are DVAs that the config cache
6166	 * can't translate.  Therefore, we first add the vdevs without
6167	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
6168	 * and then let spa_config_update() initialize the new metaslabs.
6169	 *
6170	 * spa_load() checks for added-but-not-initialized vdevs, so that
6171	 * if we lose power at any point in this sequence, the remaining
6172	 * steps will be completed the next time we load the pool.
6173	 */
6174	(void) spa_vdev_exit(spa, vd, txg, 0);
6175
6176	mutex_enter(&spa_namespace_lock);
6177	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
6178	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
6179	mutex_exit(&spa_namespace_lock);