spa.c revision 6f7938128a2c5e23f4b970ea101137eadd1470a1
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 * Copyright 2013 Saso Kiselkov. All rights reserved.
28 * Copyright (c) 2014 Integros [integros.com]
29 * Copyright 2016 Toomas Soome <tsoome@me.com>
30 * Copyright 2017 Joyent, Inc.
31 * Copyright (c) 2017 Datto Inc.
32 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
33 */
34
35/*
36 * SPA: Storage Pool Allocator
37 *
38 * This file contains all the routines used when modifying on-disk SPA state.
39 * This includes opening, importing, destroying, exporting a pool, and syncing a
40 * pool.
41 */
42
43#include <sys/zfs_context.h>
44#include <sys/fm/fs/zfs.h>
45#include <sys/spa_impl.h>
46#include <sys/zio.h>
47#include <sys/zio_checksum.h>
48#include <sys/dmu.h>
49#include <sys/dmu_tx.h>
50#include <sys/zap.h>
51#include <sys/zil.h>
52#include <sys/ddt.h>
53#include <sys/vdev_impl.h>
54#include <sys/vdev_removal.h>
55#include <sys/vdev_indirect_mapping.h>
56#include <sys/vdev_indirect_births.h>
57#include <sys/metaslab.h>
58#include <sys/metaslab_impl.h>
59#include <sys/uberblock_impl.h>
60#include <sys/txg.h>
61#include <sys/avl.h>
62#include <sys/bpobj.h>
63#include <sys/dmu_traverse.h>
64#include <sys/dmu_objset.h>
65#include <sys/unique.h>
66#include <sys/dsl_pool.h>
67#include <sys/dsl_dataset.h>
68#include <sys/dsl_dir.h>
69#include <sys/dsl_prop.h>
70#include <sys/dsl_synctask.h>
71#include <sys/fs/zfs.h>
72#include <sys/arc.h>
73#include <sys/callb.h>
74#include <sys/systeminfo.h>
75#include <sys/spa_boot.h>
76#include <sys/zfs_ioctl.h>
77#include <sys/dsl_scan.h>
78#include <sys/zfeature.h>
79#include <sys/dsl_destroy.h>
80#include <sys/abd.h>
81
82#ifdef	_KERNEL
83#include <sys/bootprops.h>
84#include <sys/callb.h>
85#include <sys/cpupart.h>
86#include <sys/pool.h>
87#include <sys/sysdc.h>
88#include <sys/zone.h>
89#endif	/* _KERNEL */
90
91#include "zfs_prop.h"
92#include "zfs_comutil.h"
93
94/*
95 * The interval, in seconds, at which failed configuration cache file writes
96 * should be retried.
97 */
98int zfs_ccw_retry_interval = 300;
99
100typedef enum zti_modes {
101	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
102	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
103	ZTI_MODE_NULL,			/* don't create a taskq */
104	ZTI_NMODES
105} zti_modes_t;
106
107#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
108#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
109#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
110
111#define	ZTI_N(n)	ZTI_P(n, 1)
112#define	ZTI_ONE		ZTI_N(1)
113
114typedef struct zio_taskq_info {
115	zti_modes_t zti_mode;
116	uint_t zti_value;
117	uint_t zti_count;
118} zio_taskq_info_t;
119
120static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
121	"issue", "issue_high", "intr", "intr_high"
122};
123
124/*
125 * This table defines the taskq settings for each ZFS I/O type. When
126 * initializing a pool, we use this table to create an appropriately sized
127 * taskq. Some operations are low volume and therefore have a small, static
128 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
129 * macros. Other operations process a large amount of data; the ZTI_BATCH
130 * macro causes us to create a taskq oriented for throughput. Some operations
131 * are so high frequency and short-lived that the taskq itself can become a a
132 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
133 * additional degree of parallelism specified by the number of threads per-
134 * taskq and the number of taskqs; when dispatching an event in this case, the
135 * particular taskq is chosen at random.
136 *
137 * The different taskq priorities are to handle the different contexts (issue
138 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
139 * need to be handled with minimum delay.
140 */
141const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
142	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
143	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
144	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
145	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
146	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
147	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
148	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
149};
150
151static void spa_sync_version(void *arg, dmu_tx_t *tx);
152static void spa_sync_props(void *arg, dmu_tx_t *tx);
153static boolean_t spa_has_active_shared_spare(spa_t *spa);
154static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
155    boolean_t reloading);
156static void spa_vdev_resilver_done(spa_t *spa);
157
158uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
159id_t		zio_taskq_psrset_bind = PS_NONE;
160boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
161uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
162
163boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
164extern int	zfs_sync_pass_deferred_free;
165
166/*
167 * Report any spa_load_verify errors found, but do not fail spa_load.
168 * This is used by zdb to analyze non-idle pools.
169 */
170boolean_t	spa_load_verify_dryrun = B_FALSE;
171
172/*
173 * This (illegal) pool name is used when temporarily importing a spa_t in order
174 * to get the vdev stats associated with the imported devices.
175 */
176#define	TRYIMPORT_NAME	"$import"
177
178/*
179 * For debugging purposes: print out vdev tree during pool import.
180 */
181boolean_t	spa_load_print_vdev_tree = B_FALSE;
182
183/*
184 * A non-zero value for zfs_max_missing_tvds means that we allow importing
185 * pools with missing top-level vdevs. This is strictly intended for advanced
186 * pool recovery cases since missing data is almost inevitable. Pools with
187 * missing devices can only be imported read-only for safety reasons, and their
188 * fail-mode will be automatically set to "continue".
189 *
190 * With 1 missing vdev we should be able to import the pool and mount all
191 * datasets. User data that was not modified after the missing device has been
192 * added should be recoverable. This means that snapshots created prior to the
193 * addition of that device should be completely intact.
194 *
195 * With 2 missing vdevs, some datasets may fail to mount since there are
196 * dataset statistics that are stored as regular metadata. Some data might be
197 * recoverable if those vdevs were added recently.
198 *
199 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
200 * may be missing entirely. Chances of data recovery are very low. Note that
201 * there are also risks of performing an inadvertent rewind as we might be
202 * missing all the vdevs with the latest uberblocks.
203 */
204uint64_t	zfs_max_missing_tvds = 0;
205
206/*
207 * The parameters below are similar to zfs_max_missing_tvds but are only
208 * intended for a preliminary open of the pool with an untrusted config which
209 * might be incomplete or out-dated.
210 *
211 * We are more tolerant for pools opened from a cachefile since we could have
212 * an out-dated cachefile where a device removal was not registered.
213 * We could have set the limit arbitrarily high but in the case where devices
214 * are really missing we would want to return the proper error codes; we chose
215 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
216 * and we get a chance to retrieve the trusted config.
217 */
218uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
219/*
220 * In the case where config was assembled by scanning device paths (/dev/dsks
221 * by default) we are less tolerant since all the existing devices should have
222 * been detected and we want spa_load to return the right error codes.
223 */
224uint64_t	zfs_max_missing_tvds_scan = 0;
225
226/*
227 * ==========================================================================
228 * SPA properties routines
229 * ==========================================================================
230 */
231
232/*
233 * Add a (source=src, propname=propval) list to an nvlist.
234 */
235static void
236spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
237    uint64_t intval, zprop_source_t src)
238{
239	const char *propname = zpool_prop_to_name(prop);
240	nvlist_t *propval;
241
242	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
243	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
244
245	if (strval != NULL)
246		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
247	else
248		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
249
250	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
251	nvlist_free(propval);
252}
253
254/*
255 * Get property values from the spa configuration.
256 */
257static void
258spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
259{
260	vdev_t *rvd = spa->spa_root_vdev;
261	dsl_pool_t *pool = spa->spa_dsl_pool;
262	uint64_t size, alloc, cap, version;
263	zprop_source_t src = ZPROP_SRC_NONE;
264	spa_config_dirent_t *dp;
265	metaslab_class_t *mc = spa_normal_class(spa);
266
267	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
268
269	if (rvd != NULL) {
270		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
271		size = metaslab_class_get_space(spa_normal_class(spa));
272		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
273		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
274		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
275		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
276		    size - alloc, src);
277
278		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
279		    metaslab_class_fragmentation(mc), src);
280		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
281		    metaslab_class_expandable_space(mc), src);
282		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
283		    (spa_mode(spa) == FREAD), src);
284
285		cap = (size == 0) ? 0 : (alloc * 100 / size);
286		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
287
288		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
289		    ddt_get_pool_dedup_ratio(spa), src);
290
291		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
292		    rvd->vdev_state, src);
293
294		version = spa_version(spa);
295		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
296			src = ZPROP_SRC_DEFAULT;
297		else
298			src = ZPROP_SRC_LOCAL;
299		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
300	}
301
302	if (pool != NULL) {
303		/*
304		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
305		 * when opening pools before this version freedir will be NULL.
306		 */
307		if (pool->dp_free_dir != NULL) {
308			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
309			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
310			    src);
311		} else {
312			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
313			    NULL, 0, src);
314		}
315
316		if (pool->dp_leak_dir != NULL) {
317			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
318			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
319			    src);
320		} else {
321			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
322			    NULL, 0, src);
323		}
324	}
325
326	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
327
328	if (spa->spa_comment != NULL) {
329		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
330		    0, ZPROP_SRC_LOCAL);
331	}
332
333	if (spa->spa_root != NULL)
334		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
335		    0, ZPROP_SRC_LOCAL);
336
337	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
338		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
339		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
340	} else {
341		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
342		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
343	}
344
345	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
346		if (dp->scd_path == NULL) {
347			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
348			    "none", 0, ZPROP_SRC_LOCAL);
349		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
350			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
351			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
352		}
353	}
354}
355
356/*
357 * Get zpool property values.
358 */
359int
360spa_prop_get(spa_t *spa, nvlist_t **nvp)
361{
362	objset_t *mos = spa->spa_meta_objset;
363	zap_cursor_t zc;
364	zap_attribute_t za;
365	int err;
366
367	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
368
369	mutex_enter(&spa->spa_props_lock);
370
371	/*
372	 * Get properties from the spa config.
373	 */
374	spa_prop_get_config(spa, nvp);
375
376	/* If no pool property object, no more prop to get. */
377	if (mos == NULL || spa->spa_pool_props_object == 0) {
378		mutex_exit(&spa->spa_props_lock);
379		return (0);
380	}
381
382	/*
383	 * Get properties from the MOS pool property object.
384	 */
385	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
386	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
387	    zap_cursor_advance(&zc)) {
388		uint64_t intval = 0;
389		char *strval = NULL;
390		zprop_source_t src = ZPROP_SRC_DEFAULT;
391		zpool_prop_t prop;
392
393		if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
394			continue;
395
396		switch (za.za_integer_length) {
397		case 8:
398			/* integer property */
399			if (za.za_first_integer !=
400			    zpool_prop_default_numeric(prop))
401				src = ZPROP_SRC_LOCAL;
402
403			if (prop == ZPOOL_PROP_BOOTFS) {
404				dsl_pool_t *dp;
405				dsl_dataset_t *ds = NULL;
406
407				dp = spa_get_dsl(spa);
408				dsl_pool_config_enter(dp, FTAG);
409				if (err = dsl_dataset_hold_obj(dp,
410				    za.za_first_integer, FTAG, &ds)) {
411					dsl_pool_config_exit(dp, FTAG);
412					break;
413				}
414
415				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
416				    KM_SLEEP);
417				dsl_dataset_name(ds, strval);
418				dsl_dataset_rele(ds, FTAG);
419				dsl_pool_config_exit(dp, FTAG);
420			} else {
421				strval = NULL;
422				intval = za.za_first_integer;
423			}
424
425			spa_prop_add_list(*nvp, prop, strval, intval, src);
426
427			if (strval != NULL)
428				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
429
430			break;
431
432		case 1:
433			/* string property */
434			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
435			err = zap_lookup(mos, spa->spa_pool_props_object,
436			    za.za_name, 1, za.za_num_integers, strval);
437			if (err) {
438				kmem_free(strval, za.za_num_integers);
439				break;
440			}
441			spa_prop_add_list(*nvp, prop, strval, 0, src);
442			kmem_free(strval, za.za_num_integers);
443			break;
444
445		default:
446			break;
447		}
448	}
449	zap_cursor_fini(&zc);
450	mutex_exit(&spa->spa_props_lock);
451out:
452	if (err && err != ENOENT) {
453		nvlist_free(*nvp);
454		*nvp = NULL;
455		return (err);
456	}
457
458	return (0);
459}
460
461/*
462 * Validate the given pool properties nvlist and modify the list
463 * for the property values to be set.
464 */
465static int
466spa_prop_validate(spa_t *spa, nvlist_t *props)
467{
468	nvpair_t *elem;
469	int error = 0, reset_bootfs = 0;
470	uint64_t objnum = 0;
471	boolean_t has_feature = B_FALSE;
472
473	elem = NULL;
474	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
475		uint64_t intval;
476		char *strval, *slash, *check, *fname;
477		const char *propname = nvpair_name(elem);
478		zpool_prop_t prop = zpool_name_to_prop(propname);
479
480		switch (prop) {
481		case ZPOOL_PROP_INVAL:
482			if (!zpool_prop_feature(propname)) {
483				error = SET_ERROR(EINVAL);
484				break;
485			}
486
487			/*
488			 * Sanitize the input.
489			 */
490			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
491				error = SET_ERROR(EINVAL);
492				break;
493			}
494
495			if (nvpair_value_uint64(elem, &intval) != 0) {
496				error = SET_ERROR(EINVAL);
497				break;
498			}
499
500			if (intval != 0) {
501				error = SET_ERROR(EINVAL);
502				break;
503			}
504
505			fname = strchr(propname, '@') + 1;
506			if (zfeature_lookup_name(fname, NULL) != 0) {
507				error = SET_ERROR(EINVAL);
508				break;
509			}
510
511			has_feature = B_TRUE;
512			break;
513
514		case ZPOOL_PROP_VERSION:
515			error = nvpair_value_uint64(elem, &intval);
516			if (!error &&
517			    (intval < spa_version(spa) ||
518			    intval > SPA_VERSION_BEFORE_FEATURES ||
519			    has_feature))
520				error = SET_ERROR(EINVAL);
521			break;
522
523		case ZPOOL_PROP_DELEGATION:
524		case ZPOOL_PROP_AUTOREPLACE:
525		case ZPOOL_PROP_LISTSNAPS:
526		case ZPOOL_PROP_AUTOEXPAND:
527			error = nvpair_value_uint64(elem, &intval);
528			if (!error && intval > 1)
529				error = SET_ERROR(EINVAL);
530			break;
531
532		case ZPOOL_PROP_BOOTFS:
533			/*
534			 * If the pool version is less than SPA_VERSION_BOOTFS,
535			 * or the pool is still being created (version == 0),
536			 * the bootfs property cannot be set.
537			 */
538			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
539				error = SET_ERROR(ENOTSUP);
540				break;
541			}
542
543			/*
544			 * Make sure the vdev config is bootable
545			 */
546			if (!vdev_is_bootable(spa->spa_root_vdev)) {
547				error = SET_ERROR(ENOTSUP);
548				break;
549			}
550
551			reset_bootfs = 1;
552
553			error = nvpair_value_string(elem, &strval);
554
555			if (!error) {
556				objset_t *os;
557				uint64_t propval;
558
559				if (strval == NULL || strval[0] == '\0') {
560					objnum = zpool_prop_default_numeric(
561					    ZPOOL_PROP_BOOTFS);
562					break;
563				}
564
565				if (error = dmu_objset_hold(strval, FTAG, &os))
566					break;
567
568				/*
569				 * Must be ZPL, and its property settings
570				 * must be supported by GRUB (compression
571				 * is not gzip, and large blocks are not used).
572				 */
573
574				if (dmu_objset_type(os) != DMU_OST_ZFS) {
575					error = SET_ERROR(ENOTSUP);
576				} else if ((error =
577				    dsl_prop_get_int_ds(dmu_objset_ds(os),
578				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
579				    &propval)) == 0 &&
580				    !BOOTFS_COMPRESS_VALID(propval)) {
581					error = SET_ERROR(ENOTSUP);
582				} else {
583					objnum = dmu_objset_id(os);
584				}
585				dmu_objset_rele(os, FTAG);
586			}
587			break;
588
589		case ZPOOL_PROP_FAILUREMODE:
590			error = nvpair_value_uint64(elem, &intval);
591			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
592			    intval > ZIO_FAILURE_MODE_PANIC))
593				error = SET_ERROR(EINVAL);
594
595			/*
596			 * This is a special case which only occurs when
597			 * the pool has completely failed. This allows
598			 * the user to change the in-core failmode property
599			 * without syncing it out to disk (I/Os might
600			 * currently be blocked). We do this by returning
601			 * EIO to the caller (spa_prop_set) to trick it
602			 * into thinking we encountered a property validation
603			 * error.
604			 */
605			if (!error && spa_suspended(spa)) {
606				spa->spa_failmode = intval;
607				error = SET_ERROR(EIO);
608			}
609			break;
610
611		case ZPOOL_PROP_CACHEFILE:
612			if ((error = nvpair_value_string(elem, &strval)) != 0)
613				break;
614
615			if (strval[0] == '\0')
616				break;
617
618			if (strcmp(strval, "none") == 0)
619				break;
620
621			if (strval[0] != '/') {
622				error = SET_ERROR(EINVAL);
623				break;
624			}
625
626			slash = strrchr(strval, '/');
627			ASSERT(slash != NULL);
628
629			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
630			    strcmp(slash, "/..") == 0)
631				error = SET_ERROR(EINVAL);
632			break;
633
634		case ZPOOL_PROP_COMMENT:
635			if ((error = nvpair_value_string(elem, &strval)) != 0)
636				break;
637			for (check = strval; *check != '\0'; check++) {
638				/*
639				 * The kernel doesn't have an easy isprint()
640				 * check.  For this kernel check, we merely
641				 * check ASCII apart from DEL.  Fix this if
642				 * there is an easy-to-use kernel isprint().
643				 */
644				if (*check >= 0x7f) {
645					error = SET_ERROR(EINVAL);
646					break;
647				}
648			}
649			if (strlen(strval) > ZPROP_MAX_COMMENT)
650				error = E2BIG;
651			break;
652
653		case ZPOOL_PROP_DEDUPDITTO:
654			if (spa_version(spa) < SPA_VERSION_DEDUP)
655				error = SET_ERROR(ENOTSUP);
656			else
657				error = nvpair_value_uint64(elem, &intval);
658			if (error == 0 &&
659			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
660				error = SET_ERROR(EINVAL);
661			break;
662		}
663
664		if (error)
665			break;
666	}
667
668	if (!error && reset_bootfs) {
669		error = nvlist_remove(props,
670		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
671
672		if (!error) {
673			error = nvlist_add_uint64(props,
674			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
675		}
676	}
677
678	return (error);
679}
680
681void
682spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
683{
684	char *cachefile;
685	spa_config_dirent_t *dp;
686
687	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
688	    &cachefile) != 0)
689		return;
690
691	dp = kmem_alloc(sizeof (spa_config_dirent_t),
692	    KM_SLEEP);
693
694	if (cachefile[0] == '\0')
695		dp->scd_path = spa_strdup(spa_config_path);
696	else if (strcmp(cachefile, "none") == 0)
697		dp->scd_path = NULL;
698	else
699		dp->scd_path = spa_strdup(cachefile);
700
701	list_insert_head(&spa->spa_config_list, dp);
702	if (need_sync)
703		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
704}
705
706int
707spa_prop_set(spa_t *spa, nvlist_t *nvp)
708{
709	int error;
710	nvpair_t *elem = NULL;
711	boolean_t need_sync = B_FALSE;
712
713	if ((error = spa_prop_validate(spa, nvp)) != 0)
714		return (error);
715
716	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
717		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
718
719		if (prop == ZPOOL_PROP_CACHEFILE ||
720		    prop == ZPOOL_PROP_ALTROOT ||
721		    prop == ZPOOL_PROP_READONLY)
722			continue;
723
724		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
725			uint64_t ver;
726
727			if (prop == ZPOOL_PROP_VERSION) {
728				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
729			} else {
730				ASSERT(zpool_prop_feature(nvpair_name(elem)));
731				ver = SPA_VERSION_FEATURES;
732				need_sync = B_TRUE;
733			}
734
735			/* Save time if the version is already set. */
736			if (ver == spa_version(spa))
737				continue;
738
739			/*
740			 * In addition to the pool directory object, we might
741			 * create the pool properties object, the features for
742			 * read object, the features for write object, or the
743			 * feature descriptions object.
744			 */
745			error = dsl_sync_task(spa->spa_name, NULL,
746			    spa_sync_version, &ver,
747			    6, ZFS_SPACE_CHECK_RESERVED);
748			if (error)
749				return (error);
750			continue;
751		}
752
753		need_sync = B_TRUE;
754		break;
755	}
756
757	if (need_sync) {
758		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
759		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
760	}
761
762	return (0);
763}
764
765/*
766 * If the bootfs property value is dsobj, clear it.
767 */
768void
769spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
770{
771	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
772		VERIFY(zap_remove(spa->spa_meta_objset,
773		    spa->spa_pool_props_object,
774		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
775		spa->spa_bootfs = 0;
776	}
777}
778
779/*ARGSUSED*/
780static int
781spa_change_guid_check(void *arg, dmu_tx_t *tx)
782{
783	uint64_t *newguid = arg;
784	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
785	vdev_t *rvd = spa->spa_root_vdev;
786	uint64_t vdev_state;
787
788	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
789	vdev_state = rvd->vdev_state;
790	spa_config_exit(spa, SCL_STATE, FTAG);
791
792	if (vdev_state != VDEV_STATE_HEALTHY)
793		return (SET_ERROR(ENXIO));
794
795	ASSERT3U(spa_guid(spa), !=, *newguid);
796
797	return (0);
798}
799
800static void
801spa_change_guid_sync(void *arg, dmu_tx_t *tx)
802{
803	uint64_t *newguid = arg;
804	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
805	uint64_t oldguid;
806	vdev_t *rvd = spa->spa_root_vdev;
807
808	oldguid = spa_guid(spa);
809
810	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
811	rvd->vdev_guid = *newguid;
812	rvd->vdev_guid_sum += (*newguid - oldguid);
813	vdev_config_dirty(rvd);
814	spa_config_exit(spa, SCL_STATE, FTAG);
815
816	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
817	    oldguid, *newguid);
818}
819
820/*
821 * Change the GUID for the pool.  This is done so that we can later
822 * re-import a pool built from a clone of our own vdevs.  We will modify
823 * the root vdev's guid, our own pool guid, and then mark all of our
824 * vdevs dirty.  Note that we must make sure that all our vdevs are
825 * online when we do this, or else any vdevs that weren't present
826 * would be orphaned from our pool.  We are also going to issue a
827 * sysevent to update any watchers.
828 */
829int
830spa_change_guid(spa_t *spa)
831{
832	int error;
833	uint64_t guid;
834
835	mutex_enter(&spa->spa_vdev_top_lock);
836	mutex_enter(&spa_namespace_lock);
837	guid = spa_generate_guid(NULL);
838
839	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
840	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
841
842	if (error == 0) {
843		spa_write_cachefile(spa, B_FALSE, B_TRUE);
844		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
845	}
846
847	mutex_exit(&spa_namespace_lock);
848	mutex_exit(&spa->spa_vdev_top_lock);
849
850	return (error);
851}
852
853/*
854 * ==========================================================================
855 * SPA state manipulation (open/create/destroy/import/export)
856 * ==========================================================================
857 */
858
859static int
860spa_error_entry_compare(const void *a, const void *b)
861{
862	spa_error_entry_t *sa = (spa_error_entry_t *)a;
863	spa_error_entry_t *sb = (spa_error_entry_t *)b;
864	int ret;
865
866	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
867	    sizeof (zbookmark_phys_t));
868
869	if (ret < 0)
870		return (-1);
871	else if (ret > 0)
872		return (1);
873	else
874		return (0);
875}
876
877/*
878 * Utility function which retrieves copies of the current logs and
879 * re-initializes them in the process.
880 */
881void
882spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
883{
884	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
885
886	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
887	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
888
889	avl_create(&spa->spa_errlist_scrub,
890	    spa_error_entry_compare, sizeof (spa_error_entry_t),
891	    offsetof(spa_error_entry_t, se_avl));
892	avl_create(&spa->spa_errlist_last,
893	    spa_error_entry_compare, sizeof (spa_error_entry_t),
894	    offsetof(spa_error_entry_t, se_avl));
895}
896
897static void
898spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
899{
900	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
901	enum zti_modes mode = ztip->zti_mode;
902	uint_t value = ztip->zti_value;
903	uint_t count = ztip->zti_count;
904	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
905	char name[32];
906	uint_t flags = 0;
907	boolean_t batch = B_FALSE;
908
909	if (mode == ZTI_MODE_NULL) {
910		tqs->stqs_count = 0;
911		tqs->stqs_taskq = NULL;
912		return;
913	}
914
915	ASSERT3U(count, >, 0);
916
917	tqs->stqs_count = count;
918	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
919
920	switch (mode) {
921	case ZTI_MODE_FIXED:
922		ASSERT3U(value, >=, 1);
923		value = MAX(value, 1);
924		break;
925
926	case ZTI_MODE_BATCH:
927		batch = B_TRUE;
928		flags |= TASKQ_THREADS_CPU_PCT;
929		value = zio_taskq_batch_pct;
930		break;
931
932	default:
933		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
934		    "spa_activate()",
935		    zio_type_name[t], zio_taskq_types[q], mode, value);
936		break;
937	}
938
939	for (uint_t i = 0; i < count; i++) {
940		taskq_t *tq;
941
942		if (count > 1) {
943			(void) snprintf(name, sizeof (name), "%s_%s_%u",
944			    zio_type_name[t], zio_taskq_types[q], i);
945		} else {
946			(void) snprintf(name, sizeof (name), "%s_%s",
947			    zio_type_name[t], zio_taskq_types[q]);
948		}
949
950		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
951			if (batch)
952				flags |= TASKQ_DC_BATCH;
953
954			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
955			    spa->spa_proc, zio_taskq_basedc, flags);
956		} else {
957			pri_t pri = maxclsyspri;
958			/*
959			 * The write issue taskq can be extremely CPU
960			 * intensive.  Run it at slightly lower priority
961			 * than the other taskqs.
962			 */
963			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
964				pri--;
965
966			tq = taskq_create_proc(name, value, pri, 50,
967			    INT_MAX, spa->spa_proc, flags);
968		}
969
970		tqs->stqs_taskq[i] = tq;
971	}
972}
973
974static void
975spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
976{
977	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
978
979	if (tqs->stqs_taskq == NULL) {
980		ASSERT0(tqs->stqs_count);
981		return;
982	}
983
984	for (uint_t i = 0; i < tqs->stqs_count; i++) {
985		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
986		taskq_destroy(tqs->stqs_taskq[i]);
987	}
988
989	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
990	tqs->stqs_taskq = NULL;
991}
992
993/*
994 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
995 * Note that a type may have multiple discrete taskqs to avoid lock contention
996 * on the taskq itself. In that case we choose which taskq at random by using
997 * the low bits of gethrtime().
998 */
999void
1000spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1001    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
1002{
1003	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1004	taskq_t *tq;
1005
1006	ASSERT3P(tqs->stqs_taskq, !=, NULL);
1007	ASSERT3U(tqs->stqs_count, !=, 0);
1008
1009	if (tqs->stqs_count == 1) {
1010		tq = tqs->stqs_taskq[0];
1011	} else {
1012		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
1013	}
1014
1015	taskq_dispatch_ent(tq, func, arg, flags, ent);
1016}
1017
1018static void
1019spa_create_zio_taskqs(spa_t *spa)
1020{
1021	for (int t = 0; t < ZIO_TYPES; t++) {
1022		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1023			spa_taskqs_init(spa, t, q);
1024		}
1025	}
1026}
1027
1028#ifdef _KERNEL
1029static void
1030spa_thread(void *arg)
1031{
1032	callb_cpr_t cprinfo;
1033
1034	spa_t *spa = arg;
1035	user_t *pu = PTOU(curproc);
1036
1037	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1038	    spa->spa_name);
1039
1040	ASSERT(curproc != &p0);
1041	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1042	    "zpool-%s", spa->spa_name);
1043	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1044
1045	/* bind this thread to the requested psrset */
1046	if (zio_taskq_psrset_bind != PS_NONE) {
1047		pool_lock();
1048		mutex_enter(&cpu_lock);
1049		mutex_enter(&pidlock);
1050		mutex_enter(&curproc->p_lock);
1051
1052		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1053		    0, NULL, NULL) == 0)  {
1054			curthread->t_bind_pset = zio_taskq_psrset_bind;
1055		} else {
1056			cmn_err(CE_WARN,
1057			    "Couldn't bind process for zfs pool \"%s\" to "
1058			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1059		}
1060
1061		mutex_exit(&curproc->p_lock);
1062		mutex_exit(&pidlock);
1063		mutex_exit(&cpu_lock);
1064		pool_unlock();
1065	}
1066
1067	if (zio_taskq_sysdc) {
1068		sysdc_thread_enter(curthread, 100, 0);
1069	}
1070
1071	spa->spa_proc = curproc;
1072	spa->spa_did = curthread->t_did;
1073
1074	spa_create_zio_taskqs(spa);
1075
1076	mutex_enter(&spa->spa_proc_lock);
1077	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1078
1079	spa->spa_proc_state = SPA_PROC_ACTIVE;
1080	cv_broadcast(&spa->spa_proc_cv);
1081
1082	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1083	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1084		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1085	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1086
1087	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1088	spa->spa_proc_state = SPA_PROC_GONE;
1089	spa->spa_proc = &p0;
1090	cv_broadcast(&spa->spa_proc_cv);
1091	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
1092
1093	mutex_enter(&curproc->p_lock);
1094	lwp_exit();
1095}
1096#endif
1097
1098/*
1099 * Activate an uninitialized pool.
1100 */
1101static void
1102spa_activate(spa_t *spa, int mode)
1103{
1104	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1105
1106	spa->spa_state = POOL_STATE_ACTIVE;
1107	spa->spa_mode = mode;
1108
1109	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1110	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1111
1112	/* Try to create a covering process */
1113	mutex_enter(&spa->spa_proc_lock);
1114	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1115	ASSERT(spa->spa_proc == &p0);
1116	spa->spa_did = 0;
1117
1118	/* Only create a process if we're going to be around a while. */
1119	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1120		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1121		    NULL, 0) == 0) {
1122			spa->spa_proc_state = SPA_PROC_CREATED;
1123			while (spa->spa_proc_state == SPA_PROC_CREATED) {
1124				cv_wait(&spa->spa_proc_cv,
1125				    &spa->spa_proc_lock);
1126			}
1127			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1128			ASSERT(spa->spa_proc != &p0);
1129			ASSERT(spa->spa_did != 0);
1130		} else {
1131#ifdef _KERNEL
1132			cmn_err(CE_WARN,
1133			    "Couldn't create process for zfs pool \"%s\"\n",
1134			    spa->spa_name);
1135#endif
1136		}
1137	}
1138	mutex_exit(&spa->spa_proc_lock);
1139
1140	/* If we didn't create a process, we need to create our taskqs. */
1141	if (spa->spa_proc == &p0) {
1142		spa_create_zio_taskqs(spa);
1143	}
1144
1145	for (size_t i = 0; i < TXG_SIZE; i++)
1146		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
1147
1148	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1149	    offsetof(vdev_t, vdev_config_dirty_node));
1150	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1151	    offsetof(objset_t, os_evicting_node));
1152	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1153	    offsetof(vdev_t, vdev_state_dirty_node));
1154
1155	txg_list_create(&spa->spa_vdev_txg_list, spa,
1156	    offsetof(struct vdev, vdev_txg_node));
1157
1158	avl_create(&spa->spa_errlist_scrub,
1159	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1160	    offsetof(spa_error_entry_t, se_avl));
1161	avl_create(&spa->spa_errlist_last,
1162	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1163	    offsetof(spa_error_entry_t, se_avl));
1164}
1165
1166/*
1167 * Opposite of spa_activate().
1168 */
1169static void
1170spa_deactivate(spa_t *spa)
1171{
1172	ASSERT(spa->spa_sync_on == B_FALSE);
1173	ASSERT(spa->spa_dsl_pool == NULL);
1174	ASSERT(spa->spa_root_vdev == NULL);
1175	ASSERT(spa->spa_async_zio_root == NULL);
1176	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1177
1178	spa_evicting_os_wait(spa);
1179
1180	txg_list_destroy(&spa->spa_vdev_txg_list);
1181
1182	list_destroy(&spa->spa_config_dirty_list);
1183	list_destroy(&spa->spa_evicting_os_list);
1184	list_destroy(&spa->spa_state_dirty_list);
1185
1186	for (int t = 0; t < ZIO_TYPES; t++) {
1187		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1188			spa_taskqs_fini(spa, t, q);
1189		}
1190	}
1191
1192	for (size_t i = 0; i < TXG_SIZE; i++) {
1193		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
1194		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
1195		spa->spa_txg_zio[i] = NULL;
1196	}
1197
1198	metaslab_class_destroy(spa->spa_normal_class);
1199	spa->spa_normal_class = NULL;
1200
1201	metaslab_class_destroy(spa->spa_log_class);
1202	spa->spa_log_class = NULL;
1203
1204	/*
1205	 * If this was part of an import or the open otherwise failed, we may
1206	 * still have errors left in the queues.  Empty them just in case.
1207	 */
1208	spa_errlog_drain(spa);
1209
1210	avl_destroy(&spa->spa_errlist_scrub);
1211	avl_destroy(&spa->spa_errlist_last);
1212
1213	spa->spa_state = POOL_STATE_UNINITIALIZED;
1214
1215	mutex_enter(&spa->spa_proc_lock);
1216	if (spa->spa_proc_state != SPA_PROC_NONE) {
1217		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1218		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1219		cv_broadcast(&spa->spa_proc_cv);
1220		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1221			ASSERT(spa->spa_proc != &p0);
1222			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1223		}
1224		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1225		spa->spa_proc_state = SPA_PROC_NONE;
1226	}
1227	ASSERT(spa->spa_proc == &p0);
1228	mutex_exit(&spa->spa_proc_lock);
1229
1230	/*
1231	 * We want to make sure spa_thread() has actually exited the ZFS
1232	 * module, so that the module can't be unloaded out from underneath
1233	 * it.
1234	 */
1235	if (spa->spa_did != 0) {
1236		thread_join(spa->spa_did);
1237		spa->spa_did = 0;
1238	}
1239}
1240
1241/*
1242 * Verify a pool configuration, and construct the vdev tree appropriately.  This
1243 * will create all the necessary vdevs in the appropriate layout, with each vdev
1244 * in the CLOSED state.  This will prep the pool before open/creation/import.
1245 * All vdev validation is done by the vdev_alloc() routine.
1246 */
1247static int
1248spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1249    uint_t id, int atype)
1250{
1251	nvlist_t **child;
1252	uint_t children;
1253	int error;
1254
1255	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1256		return (error);
1257
1258	if ((*vdp)->vdev_ops->vdev_op_leaf)
1259		return (0);
1260
1261	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1262	    &child, &children);
1263
1264	if (error == ENOENT)
1265		return (0);
1266
1267	if (error) {
1268		vdev_free(*vdp);
1269		*vdp = NULL;
1270		return (SET_ERROR(EINVAL));
1271	}
1272
1273	for (int c = 0; c < children; c++) {
1274		vdev_t *vd;
1275		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1276		    atype)) != 0) {
1277			vdev_free(*vdp);
1278			*vdp = NULL;
1279			return (error);
1280		}
1281	}
1282
1283	ASSERT(*vdp != NULL);
1284
1285	return (0);
1286}
1287
1288/*
1289 * Opposite of spa_load().
1290 */
1291static void
1292spa_unload(spa_t *spa)
1293{
1294	int i;
1295
1296	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1297
1298	spa_load_note(spa, "UNLOADING");
1299
1300	/*
1301	 * Stop async tasks.
1302	 */
1303	spa_async_suspend(spa);
1304
1305	/*
1306	 * Stop syncing.
1307	 */
1308	if (spa->spa_sync_on) {
1309		txg_sync_stop(spa->spa_dsl_pool);
1310		spa->spa_sync_on = B_FALSE;
1311	}
1312
1313	/*
1314	 * Even though vdev_free() also calls vdev_metaslab_fini, we need
1315	 * to call it earlier, before we wait for async i/o to complete.
1316	 * This ensures that there is no async metaslab prefetching, by
1317	 * calling taskq_wait(mg_taskq).
1318	 */
1319	if (spa->spa_root_vdev != NULL) {
1320		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1321		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
1322			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
1323		spa_config_exit(spa, SCL_ALL, FTAG);
1324	}
1325
1326	/*
1327	 * Wait for any outstanding async I/O to complete.
1328	 */
1329	if (spa->spa_async_zio_root != NULL) {
1330		for (int i = 0; i < max_ncpus; i++)
1331			(void) zio_wait(spa->spa_async_zio_root[i]);
1332		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
1333		spa->spa_async_zio_root = NULL;
1334	}
1335
1336	if (spa->spa_vdev_removal != NULL) {
1337		spa_vdev_removal_destroy(spa->spa_vdev_removal);
1338		spa->spa_vdev_removal = NULL;
1339	}
1340
1341	spa_condense_fini(spa);
1342
1343	bpobj_close(&spa->spa_deferred_bpobj);
1344
1345	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1346
1347	/*
1348	 * Close all vdevs.
1349	 */
1350	if (spa->spa_root_vdev)
1351		vdev_free(spa->spa_root_vdev);
1352	ASSERT(spa->spa_root_vdev == NULL);
1353
1354	/*
1355	 * Close the dsl pool.
1356	 */
1357	if (spa->spa_dsl_pool) {
1358		dsl_pool_close(spa->spa_dsl_pool);
1359		spa->spa_dsl_pool = NULL;
1360		spa->spa_meta_objset = NULL;
1361	}
1362
1363	ddt_unload(spa);
1364
1365	/*
1366	 * Drop and purge level 2 cache
1367	 */
1368	spa_l2cache_drop(spa);
1369
1370	for (i = 0; i < spa->spa_spares.sav_count; i++)
1371		vdev_free(spa->spa_spares.sav_vdevs[i]);
1372	if (spa->spa_spares.sav_vdevs) {
1373		kmem_free(spa->spa_spares.sav_vdevs,
1374		    spa->spa_spares.sav_count * sizeof (void *));
1375		spa->spa_spares.sav_vdevs = NULL;
1376	}
1377	if (spa->spa_spares.sav_config) {
1378		nvlist_free(spa->spa_spares.sav_config);
1379		spa->spa_spares.sav_config = NULL;
1380	}
1381	spa->spa_spares.sav_count = 0;
1382
1383	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1384		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1385		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1386	}
1387	if (spa->spa_l2cache.sav_vdevs) {
1388		kmem_free(spa->spa_l2cache.sav_vdevs,
1389		    spa->spa_l2cache.sav_count * sizeof (void *));
1390		spa->spa_l2cache.sav_vdevs = NULL;
1391	}
1392	if (spa->spa_l2cache.sav_config) {
1393		nvlist_free(spa->spa_l2cache.sav_config);
1394		spa->spa_l2cache.sav_config = NULL;
1395	}
1396	spa->spa_l2cache.sav_count = 0;
1397
1398	spa->spa_async_suspended = 0;
1399
1400	spa->spa_indirect_vdevs_loaded = B_FALSE;
1401
1402	if (spa->spa_comment != NULL) {
1403		spa_strfree(spa->spa_comment);
1404		spa->spa_comment = NULL;
1405	}
1406
1407	spa_config_exit(spa, SCL_ALL, FTAG);
1408}
1409
1410/*
1411 * Load (or re-load) the current list of vdevs describing the active spares for
1412 * this pool.  When this is called, we have some form of basic information in
1413 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
1414 * then re-generate a more complete list including status information.
1415 */
1416void
1417spa_load_spares(spa_t *spa)
1418{
1419	nvlist_t **spares;
1420	uint_t nspares;
1421	int i;
1422	vdev_t *vd, *tvd;
1423
1424	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1425
1426	/*
1427	 * First, close and free any existing spare vdevs.
1428	 */
1429	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1430		vd = spa->spa_spares.sav_vdevs[i];
1431
1432		/* Undo the call to spa_activate() below */
1433		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1434		    B_FALSE)) != NULL && tvd->vdev_isspare)
1435			spa_spare_remove(tvd);
1436		vdev_close(vd);
1437		vdev_free(vd);
1438	}
1439
1440	if (spa->spa_spares.sav_vdevs)
1441		kmem_free(spa->spa_spares.sav_vdevs,
1442		    spa->spa_spares.sav_count * sizeof (void *));
1443
1444	if (spa->spa_spares.sav_config == NULL)
1445		nspares = 0;
1446	else
1447		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1448		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1449
1450	spa->spa_spares.sav_count = (int)nspares;
1451	spa->spa_spares.sav_vdevs = NULL;
1452
1453	if (nspares == 0)
1454		return;
1455
1456	/*
1457	 * Construct the array of vdevs, opening them to get status in the
1458	 * process.   For each spare, there is potentially two different vdev_t
1459	 * structures associated with it: one in the list of spares (used only
1460	 * for basic validation purposes) and one in the active vdev
1461	 * configuration (if it's spared in).  During this phase we open and
1462	 * validate each vdev on the spare list.  If the vdev also exists in the
1463	 * active configuration, then we also mark this vdev as an active spare.
1464	 */
1465	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1466	    KM_SLEEP);
1467	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1468		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1469		    VDEV_ALLOC_SPARE) == 0);
1470		ASSERT(vd != NULL);
1471
1472		spa->spa_spares.sav_vdevs[i] = vd;
1473
1474		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1475		    B_FALSE)) != NULL) {
1476			if (!tvd->vdev_isspare)
1477				spa_spare_add(tvd);
1478
1479			/*
1480			 * We only mark the spare active if we were successfully
1481			 * able to load the vdev.  Otherwise, importing a pool
1482			 * with a bad active spare would result in strange
1483			 * behavior, because multiple pool would think the spare
1484			 * is actively in use.
1485			 *
1486			 * There is a vulnerability here to an equally bizarre
1487			 * circumstance, where a dead active spare is later
1488			 * brought back to life (onlined or otherwise).  Given
1489			 * the rarity of this scenario, and the extra complexity
1490			 * it adds, we ignore the possibility.
1491			 */
1492			if (!vdev_is_dead(tvd))
1493				spa_spare_activate(tvd);
1494		}
1495
1496		vd->vdev_top = vd;
1497		vd->vdev_aux = &spa->spa_spares;
1498
1499		if (vdev_open(vd) != 0)
1500			continue;
1501
1502		if (vdev_validate_aux(vd) == 0)
1503			spa_spare_add(vd);
1504	}
1505
1506	/*
1507	 * Recompute the stashed list of spares, with status information
1508	 * this time.
1509	 */
1510	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1511	    DATA_TYPE_NVLIST_ARRAY) == 0);
1512
1513	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1514	    KM_SLEEP);
1515	for (i = 0; i < spa->spa_spares.sav_count; i++)
1516		spares[i] = vdev_config_generate(spa,
1517		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1518	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1519	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1520	for (i = 0; i < spa->spa_spares.sav_count; i++)
1521		nvlist_free(spares[i]);
1522	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1523}
1524
1525/*
1526 * Load (or re-load) the current list of vdevs describing the active l2cache for
1527 * this pool.  When this is called, we have some form of basic information in
1528 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1529 * then re-generate a more complete list including status information.
1530 * Devices which are already active have their details maintained, and are
1531 * not re-opened.
1532 */
1533void
1534spa_load_l2cache(spa_t *spa)
1535{
1536	nvlist_t **l2cache;
1537	uint_t nl2cache;
1538	int i, j, oldnvdevs;
1539	uint64_t guid;
1540	vdev_t *vd, **oldvdevs, **newvdevs;
1541	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1542
1543	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1544
1545	if (sav->sav_config != NULL) {
1546		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1547		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1548		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1549	} else {
1550		nl2cache = 0;
1551		newvdevs = NULL;
1552	}
1553
1554	oldvdevs = sav->sav_vdevs;
1555	oldnvdevs = sav->sav_count;
1556	sav->sav_vdevs = NULL;
1557	sav->sav_count = 0;
1558
1559	/*
1560	 * Process new nvlist of vdevs.
1561	 */
1562	for (i = 0; i < nl2cache; i++) {
1563		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1564		    &guid) == 0);
1565
1566		newvdevs[i] = NULL;
1567		for (j = 0; j < oldnvdevs; j++) {
1568			vd = oldvdevs[j];
1569			if (vd != NULL && guid == vd->vdev_guid) {
1570				/*
1571				 * Retain previous vdev for add/remove ops.
1572				 */
1573				newvdevs[i] = vd;
1574				oldvdevs[j] = NULL;
1575				break;
1576			}
1577		}
1578
1579		if (newvdevs[i] == NULL) {
1580			/*
1581			 * Create new vdev
1582			 */
1583			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1584			    VDEV_ALLOC_L2CACHE) == 0);
1585			ASSERT(vd != NULL);
1586			newvdevs[i] = vd;
1587
1588			/*
1589			 * Commit this vdev as an l2cache device,
1590			 * even if it fails to open.
1591			 */
1592			spa_l2cache_add(vd);
1593
1594			vd->vdev_top = vd;
1595			vd->vdev_aux = sav;
1596
1597			spa_l2cache_activate(vd);
1598
1599			if (vdev_open(vd) != 0)
1600				continue;
1601
1602			(void) vdev_validate_aux(vd);
1603
1604			if (!vdev_is_dead(vd))
1605				l2arc_add_vdev(spa, vd);
1606		}
1607	}
1608
1609	/*
1610	 * Purge vdevs that were dropped
1611	 */
1612	for (i = 0; i < oldnvdevs; i++) {
1613		uint64_t pool;
1614
1615		vd = oldvdevs[i];
1616		if (vd != NULL) {
1617			ASSERT(vd->vdev_isl2cache);
1618
1619			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1620			    pool != 0ULL && l2arc_vdev_present(vd))
1621				l2arc_remove_vdev(vd);
1622			vdev_clear_stats(vd);
1623			vdev_free(vd);
1624		}
1625	}
1626
1627	if (oldvdevs)
1628		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1629
1630	if (sav->sav_config == NULL)
1631		goto out;
1632
1633	sav->sav_vdevs = newvdevs;
1634	sav->sav_count = (int)nl2cache;
1635
1636	/*
1637	 * Recompute the stashed list of l2cache devices, with status
1638	 * information this time.
1639	 */
1640	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1641	    DATA_TYPE_NVLIST_ARRAY) == 0);
1642
1643	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1644	for (i = 0; i < sav->sav_count; i++)
1645		l2cache[i] = vdev_config_generate(spa,
1646		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1647	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1648	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1649out:
1650	for (i = 0; i < sav->sav_count; i++)
1651		nvlist_free(l2cache[i]);
1652	if (sav->sav_count)
1653		kmem_free(l2cache, sav->sav_count * sizeof (void *));
1654}
1655
1656static int
1657load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1658{
1659	dmu_buf_t *db;
1660	char *packed = NULL;
1661	size_t nvsize = 0;
1662	int error;
1663	*value = NULL;
1664
1665	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
1666	if (error != 0)
1667		return (error);
1668
1669	nvsize = *(uint64_t *)db->db_data;
1670	dmu_buf_rele(db, FTAG);
1671
1672	packed = kmem_alloc(nvsize, KM_SLEEP);
1673	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1674	    DMU_READ_PREFETCH);
1675	if (error == 0)
1676		error = nvlist_unpack(packed, nvsize, value, 0);
1677	kmem_free(packed, nvsize);
1678
1679	return (error);
1680}
1681
1682/*
1683 * Concrete top-level vdevs that are not missing and are not logs. At every
1684 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
1685 */
1686static uint64_t
1687spa_healthy_core_tvds(spa_t *spa)
1688{
1689	vdev_t *rvd = spa->spa_root_vdev;
1690	uint64_t tvds = 0;
1691
1692	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
1693		vdev_t *vd = rvd->vdev_child[i];
1694		if (vd->vdev_islog)
1695			continue;
1696		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
1697			tvds++;
1698	}
1699
1700	return (tvds);
1701}
1702
1703/*
1704 * Checks to see if the given vdev could not be opened, in which case we post a
1705 * sysevent to notify the autoreplace code that the device has been removed.
1706 */
1707static void
1708spa_check_removed(vdev_t *vd)
1709{
1710	for (uint64_t c = 0; c < vd->vdev_children; c++)
1711		spa_check_removed(vd->vdev_child[c]);
1712
1713	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1714	    vdev_is_concrete(vd)) {
1715		zfs_post_autoreplace(vd->vdev_spa, vd);
1716		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
1717	}
1718}
1719
1720static int
1721spa_check_for_missing_logs(spa_t *spa)
1722{
1723	vdev_t *rvd = spa->spa_root_vdev;
1724
1725	/*
1726	 * If we're doing a normal import, then build up any additional
1727	 * diagnostic information about missing log devices.
1728	 * We'll pass this up to the user for further processing.
1729	 */
1730	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1731		nvlist_t **child, *nv;
1732		uint64_t idx = 0;
1733
1734		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1735		    KM_SLEEP);
1736		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1737
1738		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1739			vdev_t *tvd = rvd->vdev_child[c];
1740
1741			/*
1742			 * We consider a device as missing only if it failed
1743			 * to open (i.e. offline or faulted is not considered
1744			 * as missing).
1745			 */
1746			if (tvd->vdev_islog &&
1747			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1748				child[idx++] = vdev_config_generate(spa, tvd,
1749				    B_FALSE, VDEV_CONFIG_MISSING);
1750			}
1751		}
1752
1753		if (idx > 0) {
1754			fnvlist_add_nvlist_array(nv,
1755			    ZPOOL_CONFIG_CHILDREN, child, idx);
1756			fnvlist_add_nvlist(spa->spa_load_info,
1757			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
1758
1759			for (uint64_t i = 0; i < idx; i++)
1760				nvlist_free(child[i]);
1761		}
1762		nvlist_free(nv);
1763		kmem_free(child, rvd->vdev_children * sizeof (char **));
1764
1765		if (idx > 0) {
1766			spa_load_failed(spa, "some log devices are missing");
1767			return (SET_ERROR(ENXIO));
1768		}
1769	} else {
1770		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1771			vdev_t *tvd = rvd->vdev_child[c];
1772
1773			if (tvd->vdev_islog &&
1774			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1775				spa_set_log_state(spa, SPA_LOG_CLEAR);
1776				spa_load_note(spa, "some log devices are "
1777				    "missing, ZIL is dropped.");
1778				break;
1779			}
1780		}
1781	}
1782
1783	return (0);
1784}
1785
1786/*
1787 * Check for missing log devices
1788 */
1789static boolean_t
1790spa_check_logs(spa_t *spa)
1791{
1792	boolean_t rv = B_FALSE;
1793	dsl_pool_t *dp = spa_get_dsl(spa);
1794
1795	switch (spa->spa_log_state) {
1796	case SPA_LOG_MISSING:
1797		/* need to recheck in case slog has been restored */
1798	case SPA_LOG_UNKNOWN:
1799		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1800		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
1801		if (rv)
1802			spa_set_log_state(spa, SPA_LOG_MISSING);
1803		break;
1804	}
1805	return (rv);
1806}
1807
1808static boolean_t
1809spa_passivate_log(spa_t *spa)
1810{
1811	vdev_t *rvd = spa->spa_root_vdev;
1812	boolean_t slog_found = B_FALSE;
1813
1814	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1815
1816	if (!spa_has_slogs(spa))
1817		return (B_FALSE);
1818
1819	for (int c = 0; c < rvd->vdev_children; c++) {
1820		vdev_t *tvd = rvd->vdev_child[c];
1821		metaslab_group_t *mg = tvd->vdev_mg;
1822
1823		if (tvd->vdev_islog) {
1824			metaslab_group_passivate(mg);
1825			slog_found = B_TRUE;
1826		}
1827	}
1828
1829	return (slog_found);
1830}
1831
1832static void
1833spa_activate_log(spa_t *spa)
1834{
1835	vdev_t *rvd = spa->spa_root_vdev;
1836
1837	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1838
1839	for (int c = 0; c < rvd->vdev_children; c++) {
1840		vdev_t *tvd = rvd->vdev_child[c];
1841		metaslab_group_t *mg = tvd->vdev_mg;
1842
1843		if (tvd->vdev_islog)
1844			metaslab_group_activate(mg);
1845	}
1846}
1847
1848int
1849spa_reset_logs(spa_t *spa)
1850{
1851	int error;
1852
1853	error = dmu_objset_find(spa_name(spa), zil_reset,
1854	    NULL, DS_FIND_CHILDREN);
1855	if (error == 0) {
1856		/*
1857		 * We successfully offlined the log device, sync out the
1858		 * current txg so that the "stubby" block can be removed
1859		 * by zil_sync().
1860		 */
1861		txg_wait_synced(spa->spa_dsl_pool, 0);
1862	}
1863	return (error);
1864}
1865
1866static void
1867spa_aux_check_removed(spa_aux_vdev_t *sav)
1868{
1869	for (int i = 0; i < sav->sav_count; i++)
1870		spa_check_removed(sav->sav_vdevs[i]);
1871}
1872
1873void
1874spa_claim_notify(zio_t *zio)
1875{
1876	spa_t *spa = zio->io_spa;
1877
1878	if (zio->io_error)
1879		return;
1880
1881	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
1882	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1883		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1884	mutex_exit(&spa->spa_props_lock);
1885}
1886
1887typedef struct spa_load_error {
1888	uint64_t	sle_meta_count;
1889	uint64_t	sle_data_count;
1890} spa_load_error_t;
1891
1892static void
1893spa_load_verify_done(zio_t *zio)
1894{
1895	blkptr_t *bp = zio->io_bp;
1896	spa_load_error_t *sle = zio->io_private;
1897	dmu_object_type_t type = BP_GET_TYPE(bp);
1898	int error = zio->io_error;
1899	spa_t *spa = zio->io_spa;
1900
1901	abd_free(zio->io_abd);
1902	if (error) {
1903		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1904		    type != DMU_OT_INTENT_LOG)
1905			atomic_inc_64(&sle->sle_meta_count);
1906		else
1907			atomic_inc_64(&sle->sle_data_count);
1908	}
1909
1910	mutex_enter(&spa->spa_scrub_lock);
1911	spa->spa_scrub_inflight--;
1912	cv_broadcast(&spa->spa_scrub_io_cv);
1913	mutex_exit(&spa->spa_scrub_lock);
1914}
1915
1916/*
1917 * Maximum number of concurrent scrub i/os to create while verifying
1918 * a pool while importing it.
1919 */
1920int spa_load_verify_maxinflight = 10000;
1921boolean_t spa_load_verify_metadata = B_TRUE;
1922boolean_t spa_load_verify_data = B_TRUE;
1923
1924/*ARGSUSED*/
1925static int
1926spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1927    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
1928{
1929	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
1930		return (0);
1931	/*
1932	 * Note: normally this routine will not be called if
1933	 * spa_load_verify_metadata is not set.  However, it may be useful
1934	 * to manually set the flag after the traversal has begun.
1935	 */
1936	if (!spa_load_verify_metadata)
1937		return (0);
1938	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
1939		return (0);
1940
1941	zio_t *rio = arg;
1942	size_t size = BP_GET_PSIZE(bp);
1943
1944	mutex_enter(&spa->spa_scrub_lock);
1945	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
1946		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1947	spa->spa_scrub_inflight++;
1948	mutex_exit(&spa->spa_scrub_lock);
1949
1950	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
1951	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1952	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1953	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1954	return (0);
1955}
1956
1957/* ARGSUSED */
1958int
1959verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1960{
1961	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
1962		return (SET_ERROR(ENAMETOOLONG));
1963
1964	return (0);
1965}
1966
1967static int
1968spa_load_verify(spa_t *spa)
1969{
1970	zio_t *rio;
1971	spa_load_error_t sle = { 0 };
1972	zpool_rewind_policy_t policy;
1973	boolean_t verify_ok = B_FALSE;
1974	int error = 0;
1975
1976	zpool_get_rewind_policy(spa->spa_config, &policy);
1977
1978	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1979		return (0);
1980
1981	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
1982	error = dmu_objset_find_dp(spa->spa_dsl_pool,
1983	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
1984	    DS_FIND_CHILDREN);
1985	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
1986	if (error != 0)
1987		return (error);
1988
1989	rio = zio_root(spa, NULL, &sle,
1990	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1991
1992	if (spa_load_verify_metadata) {
1993		if (spa->spa_extreme_rewind) {
1994			spa_load_note(spa, "performing a complete scan of the "
1995			    "pool since extreme rewind is on. This may take "
1996			    "a very long time.\n  (spa_load_verify_data=%u, "
1997			    "spa_load_verify_metadata=%u)",
1998			    spa_load_verify_data, spa_load_verify_metadata);
1999		}
2000		error = traverse_pool(spa, spa->spa_verify_min_txg,
2001		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
2002		    spa_load_verify_cb, rio);
2003	}
2004
2005	(void) zio_wait(rio);
2006
2007	spa->spa_load_meta_errors = sle.sle_meta_count;
2008	spa->spa_load_data_errors = sle.sle_data_count;
2009
2010	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2011		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2012		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2013		    (u_longlong_t)sle.sle_data_count);
2014	}
2015
2016	if (spa_load_verify_dryrun ||
2017	    (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
2018	    sle.sle_data_count <= policy.zrp_maxdata)) {
2019		int64_t loss = 0;
2020
2021		verify_ok = B_TRUE;
2022		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2023		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2024
2025		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2026		VERIFY(nvlist_add_uint64(spa->spa_load_info,
2027		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
2028		VERIFY(nvlist_add_int64(spa->spa_load_info,
2029		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
2030		VERIFY(nvlist_add_uint64(spa->spa_load_info,
2031		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
2032	} else {
2033		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2034	}
2035
2036	if (spa_load_verify_dryrun)
2037		return (0);
2038
2039	if (error) {
2040		if (error != ENXIO && error != EIO)
2041			error = SET_ERROR(EIO);
2042		return (error);
2043	}
2044
2045	return (verify_ok ? 0 : EIO);
2046}
2047
2048/*
2049 * Find a value in the pool props object.
2050 */
2051static void
2052spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2053{
2054	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2055	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2056}
2057
2058/*
2059 * Find a value in the pool directory object.
2060 */
2061static int
2062spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
2063{
2064	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2065	    name, sizeof (uint64_t), 1, val);
2066
2067	if (error != 0 && (error != ENOENT || log_enoent)) {
2068		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2069		    "[error=%d]", name, error);
2070	}
2071
2072	return (error);
2073}
2074
2075static int
2076spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2077{
2078	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2079	return (SET_ERROR(err));
2080}
2081
2082/*
2083 * Fix up config after a partly-completed split.  This is done with the
2084 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
2085 * pool have that entry in their config, but only the splitting one contains
2086 * a list of all the guids of the vdevs that are being split off.
2087 *
2088 * This function determines what to do with that list: either rejoin
2089 * all the disks to the pool, or complete the splitting process.  To attempt
2090 * the rejoin, each disk that is offlined is marked online again, and
2091 * we do a reopen() call.  If the vdev label for every disk that was
2092 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2093 * then we call vdev_split() on each disk, and complete the split.
2094 *
2095 * Otherwise we leave the config alone, with all the vdevs in place in
2096 * the original pool.
2097 */
2098static void
2099spa_try_repair(spa_t *spa, nvlist_t *config)
2100{
2101	uint_t extracted;
2102	uint64_t *glist;
2103	uint_t i, gcount;
2104	nvlist_t *nvl;
2105	vdev_t **vd;
2106	boolean_t attempt_reopen;
2107
2108	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2109		return;
2110
2111	/* check that the config is complete */
2112	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2113	    &glist, &gcount) != 0)
2114		return;
2115
2116	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
2117
2118	/* attempt to online all the vdevs & validate */
2119	attempt_reopen = B_TRUE;
2120	for (i = 0; i < gcount; i++) {
2121		if (glist[i] == 0)	/* vdev is hole */
2122			continue;
2123
2124		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2125		if (vd[i] == NULL) {
2126			/*
2127			 * Don't bother attempting to reopen the disks;
2128			 * just do the split.
2129			 */
2130			attempt_reopen = B_FALSE;
2131		} else {
2132			/* attempt to re-online it */
2133			vd[i]->vdev_offline = B_FALSE;
2134		}
2135	}
2136
2137	if (attempt_reopen) {
2138		vdev_reopen(spa->spa_root_vdev);
2139
2140		/* check each device to see what state it's in */
2141		for (extracted = 0, i = 0; i < gcount; i++) {
2142			if (vd[i] != NULL &&
2143			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2144				break;
2145			++extracted;
2146		}
2147	}
2148
2149	/*
2150	 * If every disk has been moved to the new pool, or if we never
2151	 * even attempted to look at them, then we split them off for
2152	 * good.
2153	 */
2154	if (!attempt_reopen || gcount == extracted) {
2155		for (i = 0; i < gcount; i++)
2156			if (vd[i] != NULL)
2157				vdev_split(vd[i]);
2158		vdev_reopen(spa->spa_root_vdev);
2159	}
2160
2161	kmem_free(vd, gcount * sizeof (vdev_t *));
2162}
2163
2164static int
2165spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
2166{
2167	char *ereport = FM_EREPORT_ZFS_POOL;
2168	int error;
2169
2170	spa->spa_load_state = state;
2171
2172	gethrestime(&spa->spa_loaded_ts);
2173	error = spa_load_impl(spa, type, &ereport, B_FALSE);
2174
2175	/*
2176	 * Don't count references from objsets that are already closed
2177	 * and are making their way through the eviction process.
2178	 */
2179	spa_evicting_os_wait(spa);
2180	spa->spa_minref = refcount_count(&spa->spa_refcount);
2181	if (error) {
2182		if (error != EEXIST) {
2183			spa->spa_loaded_ts.tv_sec = 0;
2184			spa->spa_loaded_ts.tv_nsec = 0;
2185		}
2186		if (error != EBADF) {
2187			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2188		}
2189	}
2190	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2191	spa->spa_ena = 0;
2192
2193	return (error);
2194}
2195
2196/*
2197 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
2198 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
2199 * spa's per-vdev ZAP list.
2200 */
2201static uint64_t
2202vdev_count_verify_zaps(vdev_t *vd)
2203{
2204	spa_t *spa = vd->vdev_spa;
2205	uint64_t total = 0;
2206	if (vd->vdev_top_zap != 0) {
2207		total++;
2208		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2209		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
2210	}
2211	if (vd->vdev_leaf_zap != 0) {
2212		total++;
2213		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2214		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
2215	}
2216
2217	for (uint64_t i = 0; i < vd->vdev_children; i++) {
2218		total += vdev_count_verify_zaps(vd->vdev_child[i]);
2219	}
2220
2221	return (total);
2222}
2223
2224static int
2225spa_verify_host(spa_t *spa, nvlist_t *mos_config)
2226{
2227	uint64_t hostid;
2228	char *hostname;
2229	uint64_t myhostid = 0;
2230
2231	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
2232	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2233		hostname = fnvlist_lookup_string(mos_config,
2234		    ZPOOL_CONFIG_HOSTNAME);
2235
2236		myhostid = zone_get_hostid(NULL);
2237
2238		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
2239			cmn_err(CE_WARN, "pool '%s' could not be "
2240			    "loaded as it was last accessed by "
2241			    "another system (host: %s hostid: 0x%llx). "
2242			    "See: http://illumos.org/msg/ZFS-8000-EY",
2243			    spa_name(spa), hostname, (u_longlong_t)hostid);
2244			spa_load_failed(spa, "hostid verification failed: pool "
2245			    "last accessed by host: %s (hostid: 0x%llx)",
2246			    hostname, (u_longlong_t)hostid);
2247			return (SET_ERROR(EBADF));
2248		}
2249	}
2250
2251	return (0);
2252}
2253
2254static int
2255spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
2256{
2257	int error = 0;
2258	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
2259	int parse;
2260	vdev_t *rvd;
2261	uint64_t pool_guid;
2262	char *comment;
2263
2264	/*
2265	 * Versioning wasn't explicitly added to the label until later, so if
2266	 * it's not present treat it as the initial version.
2267	 */
2268	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2269	    &spa->spa_ubsync.ub_version) != 0)
2270		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2271
2272	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
2273		spa_load_failed(spa, "invalid config provided: '%s' missing",
2274		    ZPOOL_CONFIG_POOL_GUID);
2275		return (SET_ERROR(EINVAL));
2276	}
2277
2278	if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state ==
2279	    SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) {
2280		spa_load_failed(spa, "a pool with guid %llu is already open",
2281		    (u_longlong_t)pool_guid);
2282		return (SET_ERROR(EEXIST));
2283	}
2284
2285	spa->spa_config_guid = pool_guid;
2286
2287	nvlist_free(spa->spa_load_info);
2288	spa->spa_load_info = fnvlist_alloc();
2289
2290	ASSERT(spa->spa_comment == NULL);
2291	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2292		spa->spa_comment = spa_strdup(comment);
2293
2294	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2295	    &spa->spa_config_txg);
2296
2297	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
2298		spa->spa_config_splitting = fnvlist_dup(nvl);
2299
2300	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
2301		spa_load_failed(spa, "invalid config provided: '%s' missing",
2302		    ZPOOL_CONFIG_VDEV_TREE);
2303		return (SET_ERROR(EINVAL));
2304	}
2305
2306	/*
2307	 * Create "The Godfather" zio to hold all async IOs
2308	 */
2309	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
2310	    KM_SLEEP);
2311	for (int i = 0; i < max_ncpus; i++) {
2312		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
2313		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2314		    ZIO_FLAG_GODFATHER);
2315	}
2316
2317	/*
2318	 * Parse the configuration into a vdev tree.  We explicitly set the
2319	 * value that will be returned by spa_version() since parsing the
2320	 * configuration requires knowing the version number.
2321	 */
2322	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2323	parse = (type == SPA_IMPORT_EXISTING ?
2324	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2325	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
2326	spa_config_exit(spa, SCL_ALL, FTAG);
2327
2328	if (error != 0) {
2329		spa_load_failed(spa, "unable to parse config [error=%d]",
2330		    error);
2331		return (error);
2332	}
2333
2334	ASSERT(spa->spa_root_vdev == rvd);
2335	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
2336	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
2337
2338	if (type != SPA_IMPORT_ASSEMBLE) {
2339		ASSERT(spa_guid(spa) == pool_guid);
2340	}
2341
2342	return (0);
2343}
2344
2345/*
2346 * Recursively open all vdevs in the vdev tree. This function is called twice:
2347 * first with the untrusted config, then with the trusted config.
2348 */
2349static int
2350spa_ld_open_vdevs(spa_t *spa)
2351{
2352	int error = 0;
2353
2354	/*
2355	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
2356	 * missing/unopenable for the root vdev to be still considered openable.
2357	 */
2358	if (spa->spa_trust_config) {
2359		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
2360	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
2361		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
2362	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
2363		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
2364	} else {
2365		spa->spa_missing_tvds_allowed = 0;
2366	}
2367
2368	spa->spa_missing_tvds_allowed =
2369	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
2370
2371	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2372	error = vdev_open(spa->spa_root_vdev);
2373	spa_config_exit(spa, SCL_ALL, FTAG);
2374
2375	if (spa->spa_missing_tvds != 0) {
2376		spa_load_note(spa, "vdev tree has %lld missing top-level "
2377		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
2378		if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
2379			/*
2380			 * Although theoretically we could allow users to open
2381			 * incomplete pools in RW mode, we'd need to add a lot
2382			 * of extra logic (e.g. adjust pool space to account
2383			 * for missing vdevs).
2384			 * This limitation also prevents users from accidentally
2385			 * opening the pool in RW mode during data recovery and
2386			 * damaging it further.
2387			 */
2388			spa_load_note(spa, "pools with missing top-level "
2389			    "vdevs can only be opened in read-only mode.");
2390			error = SET_ERROR(ENXIO);
2391		} else {
2392			spa_load_note(spa, "current settings allow for maximum "
2393			    "%lld missing top-level vdevs at this stage.",
2394			    (u_longlong_t)spa->spa_missing_tvds_allowed);
2395		}
2396	}
2397	if (error != 0) {
2398		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
2399		    error);
2400	}
2401	if (spa->spa_missing_tvds != 0 || error != 0)
2402		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
2403
2404	return (error);
2405}
2406
2407/*
2408 * We need to validate the vdev labels against the configuration that
2409 * we have in hand. This function is called twice: first with an untrusted
2410 * config, then with a trusted config. The validation is more strict when the
2411 * config is trusted.
2412 */
2413static int
2414spa_ld_validate_vdevs(spa_t *spa)
2415{
2416	int error = 0;
2417	vdev_t *rvd = spa->spa_root_vdev;
2418
2419	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2420	error = vdev_validate(rvd);
2421	spa_config_exit(spa, SCL_ALL, FTAG);
2422
2423	if (error != 0) {
2424		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
2425		return (error);
2426	}
2427
2428	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
2429		spa_load_failed(spa, "cannot open vdev tree after invalidating "
2430		    "some vdevs");
2431		vdev_dbgmsg_print_tree(rvd, 2);
2432		return (SET_ERROR(ENXIO));
2433	}
2434
2435	return (0);
2436}
2437
2438static int
2439spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
2440{
2441	vdev_t *rvd = spa->spa_root_vdev;
2442	nvlist_t *label;
2443	uberblock_t *ub = &spa->spa_uberblock;
2444
2445	/*
2446	 * Find the best uberblock.
2447	 */
2448	vdev_uberblock_load(rvd, ub, &label);
2449
2450	/*
2451	 * If we weren't able to find a single valid uberblock, return failure.
2452	 */
2453	if (ub->ub_txg == 0) {
2454		nvlist_free(label);
2455		spa_load_failed(spa, "no valid uberblock found");
2456		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2457	}
2458
2459	spa_load_note(spa, "using uberblock with txg=%llu",
2460	    (u_longlong_t)ub->ub_txg);
2461
2462	/*
2463	 * If the pool has an unsupported version we can't open it.
2464	 */
2465	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2466		nvlist_free(label);
2467		spa_load_failed(spa, "version %llu is not supported",
2468		    (u_longlong_t)ub->ub_version);
2469		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2470	}
2471
2472	if (ub->ub_version >= SPA_VERSION_FEATURES) {
2473		nvlist_t *features;
2474
2475		/*
2476		 * If we weren't able to find what's necessary for reading the
2477		 * MOS in the label, return failure.
2478		 */
2479		if (label == NULL) {
2480			spa_load_failed(spa, "label config unavailable");
2481			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2482			    ENXIO));
2483		}
2484
2485		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
2486		    &features) != 0) {
2487			nvlist_free(label);
2488			spa_load_failed(spa, "invalid label: '%s' missing",
2489			    ZPOOL_CONFIG_FEATURES_FOR_READ);
2490			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2491			    ENXIO));
2492		}
2493
2494		/*
2495		 * Update our in-core representation with the definitive values
2496		 * from the label.
2497		 */
2498		nvlist_free(spa->spa_label_features);
2499		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2500	}
2501
2502	nvlist_free(label);
2503
2504	/*
2505	 * Look through entries in the label nvlist's features_for_read. If
2506	 * there is a feature listed there which we don't understand then we
2507	 * cannot open a pool.
2508	 */
2509	if (ub->ub_version >= SPA_VERSION_FEATURES) {
2510		nvlist_t *unsup_feat;
2511
2512		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2513		    0);
2514
2515		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2516		    NULL); nvp != NULL;
2517		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2518			if (!zfeature_is_supported(nvpair_name(nvp))) {
2519				VERIFY(nvlist_add_string(unsup_feat,
2520				    nvpair_name(nvp), "") == 0);
2521			}
2522		}
2523
2524		if (!nvlist_empty(unsup_feat)) {
2525			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2526			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2527			nvlist_free(unsup_feat);
2528			spa_load_failed(spa, "some features are unsupported");
2529			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2530			    ENOTSUP));
2531		}
2532
2533		nvlist_free(unsup_feat);
2534	}
2535
2536	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2537		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2538		spa_try_repair(spa, spa->spa_config);
2539		spa_config_exit(spa, SCL_ALL, FTAG);
2540		nvlist_free(spa->spa_config_splitting);
2541		spa->spa_config_splitting = NULL;
2542	}
2543
2544	/*
2545	 * Initialize internal SPA structures.
2546	 */
2547	spa->spa_state = POOL_STATE_ACTIVE;
2548	spa->spa_ubsync = spa->spa_uberblock;
2549	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2550	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2551	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2552	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2553	spa->spa_claim_max_txg = spa->spa_first_txg;
2554	spa->spa_prev_software_version = ub->ub_software_version;
2555
2556	return (0);
2557}
2558
2559static int
2560spa_ld_open_rootbp(spa_t *spa)
2561{
2562	int error = 0;
2563	vdev_t *rvd = spa->spa_root_vdev;
2564
2565	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2566	if (error != 0) {
2567		spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
2568		    "[error=%d]", error);
2569		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2570	}
2571	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2572
2573	return (0);
2574}
2575
2576static int
2577spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type,
2578    boolean_t reloading)
2579{
2580	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
2581	nvlist_t *nv, *mos_config, *policy;
2582	int error = 0, copy_error;
2583	uint64_t healthy_tvds, healthy_tvds_mos;
2584	uint64_t mos_config_txg;
2585
2586	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
2587	    != 0)
2588		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2589
2590	/*
2591	 * If we're assembling a pool from a split, the config provided is
2592	 * already trusted so there is nothing to do.
2593	 */
2594	if (type == SPA_IMPORT_ASSEMBLE)
2595		return (0);
2596
2597	healthy_tvds = spa_healthy_core_tvds(spa);
2598
2599	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
2600	    != 0) {
2601		spa_load_failed(spa, "unable to retrieve MOS config");
2602		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2603	}
2604
2605	/*
2606	 * If we are doing an open, pool owner wasn't verified yet, thus do
2607	 * the verification here.
2608	 */
2609	if (spa->spa_load_state == SPA_LOAD_OPEN) {
2610		error = spa_verify_host(spa, mos_config);
2611		if (error != 0) {
2612			nvlist_free(mos_config);
2613			return (error);
2614		}
2615	}
2616
2617	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
2618
2619	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2620
2621	/*
2622	 * Build a new vdev tree from the trusted config
2623	 */
2624	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
2625
2626	/*
2627	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
2628	 * obtained by scanning /dev/dsk, then it will have the right vdev
2629	 * paths. We update the trusted MOS config with this information.
2630	 * We first try to copy the paths with vdev_copy_path_strict, which
2631	 * succeeds only when both configs have exactly the same vdev tree.
2632	 * If that fails, we fall back to a more flexible method that has a
2633	 * best effort policy.
2634	 */
2635	copy_error = vdev_copy_path_strict(rvd, mrvd);
2636	if (copy_error != 0 || spa_load_print_vdev_tree) {
2637		spa_load_note(spa, "provided vdev tree:");
2638		vdev_dbgmsg_print_tree(rvd, 2);
2639		spa_load_note(spa, "MOS vdev tree:");
2640		vdev_dbgmsg_print_tree(mrvd, 2);
2641	}
2642	if (copy_error != 0) {
2643		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
2644		    "back to vdev_copy_path_relaxed");
2645		vdev_copy_path_relaxed(rvd, mrvd);
2646	}
2647
2648	vdev_close(rvd);
2649	vdev_free(rvd);
2650	spa->spa_root_vdev = mrvd;
2651	rvd = mrvd;
2652	spa_config_exit(spa, SCL_ALL, FTAG);
2653
2654	/*
2655	 * We will use spa_config if we decide to reload the spa or if spa_load
2656	 * fails and we rewind. We must thus regenerate the config using the
2657	 * MOS information with the updated paths. Rewind policy is an import
2658	 * setting and is not in the MOS. We copy it over to our new, trusted
2659	 * config.
2660	 */
2661	mos_config_txg = fnvlist_lookup_uint64(mos_config,
2662	    ZPOOL_CONFIG_POOL_TXG);
2663	nvlist_free(mos_config);
2664	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
2665	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_REWIND_POLICY,
2666	    &policy) == 0)
2667		fnvlist_add_nvlist(mos_config, ZPOOL_REWIND_POLICY, policy);
2668	spa_config_set(spa, mos_config);
2669	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
2670
2671	/*
2672	 * Now that we got the config from the MOS, we should be more strict
2673	 * in checking blkptrs and can make assumptions about the consistency
2674	 * of the vdev tree. spa_trust_config must be set to true before opening
2675	 * vdevs in order for them to be writeable.
2676	 */
2677	spa->spa_trust_config = B_TRUE;
2678
2679	/*
2680	 * Open and validate the new vdev tree
2681	 */
2682	error = spa_ld_open_vdevs(spa);
2683	if (error != 0)
2684		return (error);
2685
2686	error = spa_ld_validate_vdevs(spa);
2687	if (error != 0)
2688		return (error);
2689
2690	if (copy_error != 0 || spa_load_print_vdev_tree) {
2691		spa_load_note(spa, "final vdev tree:");
2692		vdev_dbgmsg_print_tree(rvd, 2);
2693	}
2694
2695	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
2696	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
2697		/*
2698		 * Sanity check to make sure that we are indeed loading the
2699		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
2700		 * in the config provided and they happened to be the only ones
2701		 * to have the latest uberblock, we could involuntarily perform
2702		 * an extreme rewind.
2703		 */
2704		healthy_tvds_mos = spa_healthy_core_tvds(spa);
2705		if (healthy_tvds_mos - healthy_tvds >=
2706		    SPA_SYNC_MIN_VDEVS) {
2707			spa_load_note(spa, "config provided misses too many "
2708			    "top-level vdevs compared to MOS (%lld vs %lld). ",
2709			    (u_longlong_t)healthy_tvds,
2710			    (u_longlong_t)healthy_tvds_mos);
2711			spa_load_note(spa, "vdev tree:");
2712			vdev_dbgmsg_print_tree(rvd, 2);
2713			if (reloading) {
2714				spa_load_failed(spa, "config was already "
2715				    "provided from MOS. Aborting.");
2716				return (spa_vdev_err(rvd,
2717				    VDEV_AUX_CORRUPT_DATA, EIO));
2718			}
2719			spa_load_note(spa, "spa must be reloaded using MOS "
2720			    "config");
2721			return (SET_ERROR(EAGAIN));
2722		}
2723	}
2724
2725	error = spa_check_for_missing_logs(spa);
2726	if (error != 0)
2727		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2728
2729	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
2730		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
2731		    "guid sum (%llu != %llu)",
2732		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
2733		    (u_longlong_t)rvd->vdev_guid_sum);
2734		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2735		    ENXIO));
2736	}
2737
2738	return (0);
2739}
2740
2741static int
2742spa_ld_open_indirect_vdev_metadata(spa_t *spa)
2743{
2744	int error = 0;
2745	vdev_t *rvd = spa->spa_root_vdev;
2746
2747	/*
2748	 * Everything that we read before spa_remove_init() must be stored
2749	 * on concreted vdevs.  Therefore we do this as early as possible.
2750	 */
2751	error = spa_remove_init(spa);
2752	if (error != 0) {
2753		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
2754		    error);
2755		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2756	}
2757
2758	/*
2759	 * Retrieve information needed to condense indirect vdev mappings.
2760	 */
2761	error = spa_condense_init(spa);
2762	if (error != 0) {
2763		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
2764		    error);
2765		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
2766	}
2767
2768	return (0);
2769}
2770
2771static int
2772spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
2773{
2774	int error = 0;
2775	vdev_t *rvd = spa->spa_root_vdev;
2776
2777	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2778		boolean_t missing_feat_read = B_FALSE;
2779		nvlist_t *unsup_feat, *enabled_feat;
2780
2781		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2782		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
2783			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2784		}
2785
2786		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2787		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
2788			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2789		}
2790
2791		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2792		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
2793			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2794		}
2795
2796		enabled_feat = fnvlist_alloc();
2797		unsup_feat = fnvlist_alloc();
2798
2799		if (!spa_features_check(spa, B_FALSE,
2800		    unsup_feat, enabled_feat))
2801			missing_feat_read = B_TRUE;
2802
2803		if (spa_writeable(spa) ||
2804		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
2805			if (!spa_features_check(spa, B_TRUE,
2806			    unsup_feat, enabled_feat)) {
2807				*missing_feat_writep = B_TRUE;
2808			}
2809		}
2810
2811		fnvlist_add_nvlist(spa->spa_load_info,
2812		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2813
2814		if (!nvlist_empty(unsup_feat)) {
2815			fnvlist_add_nvlist(spa->spa_load_info,
2816			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2817		}
2818
2819		fnvlist_free(enabled_feat);
2820		fnvlist_free(unsup_feat);
2821
2822		if (!missing_feat_read) {
2823			fnvlist_add_boolean(spa->spa_load_info,
2824			    ZPOOL_CONFIG_CAN_RDONLY);
2825		}
2826
2827		/*
2828		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2829		 * twofold: to determine whether the pool is available for
2830		 * import in read-write mode and (if it is not) whether the
2831		 * pool is available for import in read-only mode. If the pool
2832		 * is available for import in read-write mode, it is displayed
2833		 * as available in userland; if it is not available for import
2834		 * in read-only mode, it is displayed as unavailable in
2835		 * userland. If the pool is available for import in read-only
2836		 * mode but not read-write mode, it is displayed as unavailable
2837		 * in userland with a special note that the pool is actually
2838		 * available for open in read-only mode.
2839		 *
2840		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2841		 * missing a feature for write, we must first determine whether
2842		 * the pool can be opened read-only before returning to
2843		 * userland in order to know whether to display the
2844		 * abovementioned note.
2845		 */
2846		if (missing_feat_read || (*missing_feat_writep &&
2847		    spa_writeable(spa))) {
2848			spa_load_failed(spa, "pool uses unsupported features");
2849			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2850			    ENOTSUP));
2851		}
2852
2853		/*
2854		 * Load refcounts for ZFS features from disk into an in-memory
2855		 * cache during SPA initialization.
2856		 */
2857		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
2858			uint64_t refcount;
2859
2860			error = feature_get_refcount_from_disk(spa,
2861			    &spa_feature_table[i], &refcount);
2862			if (error == 0) {
2863				spa->spa_feat_refcount_cache[i] = refcount;
2864			} else if (error == ENOTSUP) {
2865				spa->spa_feat_refcount_cache[i] =
2866				    SPA_FEATURE_DISABLED;
2867			} else {
2868				spa_load_failed(spa, "error getting refcount "
2869				    "for feature %s [error=%d]",
2870				    spa_feature_table[i].fi_guid, error);
2871				return (spa_vdev_err(rvd,
2872				    VDEV_AUX_CORRUPT_DATA, EIO));
2873			}
2874		}
2875	}
2876
2877	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
2878		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
2879		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
2880			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2881	}
2882
2883	return (0);
2884}
2885
2886static int
2887spa_ld_load_special_directories(spa_t *spa)
2888{
2889	int error = 0;
2890	vdev_t *rvd = spa->spa_root_vdev;
2891
2892	spa->spa_is_initializing = B_TRUE;
2893	error = dsl_pool_open(spa->spa_dsl_pool);
2894	spa->spa_is_initializing = B_FALSE;
2895	if (error != 0) {
2896		spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
2897		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2898	}
2899
2900	return (0);
2901}
2902
2903static int
2904spa_ld_get_props(spa_t *spa)
2905{
2906	int error = 0;
2907	uint64_t obj;
2908	vdev_t *rvd = spa->spa_root_vdev;
2909
2910	/* Grab the secret checksum salt from the MOS. */
2911	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2912	    DMU_POOL_CHECKSUM_SALT, 1,
2913	    sizeof (spa->spa_cksum_salt.zcs_bytes),
2914	    spa->spa_cksum_salt.zcs_bytes);
2915	if (error == ENOENT) {
2916		/* Generate a new salt for subsequent use */
2917		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
2918		    sizeof (spa->spa_cksum_salt.zcs_bytes));
2919	} else if (error != 0) {
2920		spa_load_failed(spa, "unable to retrieve checksum salt from "
2921		    "MOS [error=%d]", error);
2922		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2923	}
2924
2925	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
2926		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2927	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2928	if (error != 0) {
2929		spa_load_failed(spa, "error opening deferred-frees bpobj "
2930		    "[error=%d]", error);
2931		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2932	}
2933
2934	/*
2935	 * Load the bit that tells us to use the new accounting function
2936	 * (raid-z deflation).  If we have an older pool, this will not
2937	 * be present.
2938	 */
2939	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
2940	if (error != 0 && error != ENOENT)
2941		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2942
2943	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2944	    &spa->spa_creation_version, B_FALSE);
2945	if (error != 0 && error != ENOENT)
2946		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2947
2948	/*
2949	 * Load the persistent error log.  If we have an older pool, this will
2950	 * not be present.
2951	 */
2952	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
2953	    B_FALSE);
2954	if (error != 0 && error != ENOENT)
2955		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2956
2957	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2958	    &spa->spa_errlog_scrub, B_FALSE);
2959	if (error != 0 && error != ENOENT)
2960		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2961
2962	/*
2963	 * Load the history object.  If we have an older pool, this
2964	 * will not be present.
2965	 */
2966	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
2967	if (error != 0 && error != ENOENT)
2968		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2969
2970	/*
2971	 * Load the per-vdev ZAP map. If we have an older pool, this will not
2972	 * be present; in this case, defer its creation to a later time to
2973	 * avoid dirtying the MOS this early / out of sync context. See
2974	 * spa_sync_config_object.
2975	 */
2976
2977	/* The sentinel is only available in the MOS config. */
2978	nvlist_t *mos_config;
2979	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
2980		spa_load_failed(spa, "unable to retrieve MOS config");
2981		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2982	}
2983
2984	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
2985	    &spa->spa_all_vdev_zaps, B_FALSE);
2986
2987	if (error == ENOENT) {
2988		VERIFY(!nvlist_exists(mos_config,
2989		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
2990		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
2991		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
2992	} else if (error != 0) {
2993		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2994	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
2995		/*
2996		 * An older version of ZFS overwrote the sentinel value, so
2997		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
2998		 * destruction to later; see spa_sync_config_object.
2999		 */
3000		spa->spa_avz_action = AVZ_ACTION_DESTROY;
3001		/*
3002		 * We're assuming that no vdevs have had their ZAPs created
3003		 * before this. Better be sure of it.
3004		 */
3005		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3006	}
3007	nvlist_free(mos_config);
3008
3009	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3010
3011	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
3012	    B_FALSE);
3013	if (error && error != ENOENT)
3014		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3015
3016	if (error == 0) {
3017		uint64_t autoreplace;
3018
3019		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
3020		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
3021		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
3022		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
3023		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
3024		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
3025		    &spa->spa_dedup_ditto);
3026
3027		spa->spa_autoreplace = (autoreplace != 0);
3028	}
3029
3030	/*
3031	 * If we are importing a pool with missing top-level vdevs,
3032	 * we enforce that the pool doesn't panic or get suspended on
3033	 * error since the likelihood of missing data is extremely high.
3034	 */
3035	if (spa->spa_missing_tvds > 0 &&
3036	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
3037	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3038		spa_load_note(spa, "forcing failmode to 'continue' "
3039		    "as some top level vdevs are missing");
3040		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
3041	}
3042
3043	return (0);
3044}
3045
3046static int
3047spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
3048{
3049	int error = 0;
3050	vdev_t *rvd = spa->spa_root_vdev;
3051
3052	/*
3053	 * If we're assembling the pool from the split-off vdevs of
3054	 * an existing pool, we don't want to attach the spares & cache
3055	 * devices.
3056	 */
3057
3058	/*
3059	 * Load any hot spares for this pool.
3060	 */
3061	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
3062	    B_FALSE);
3063	if (error != 0 && error != ENOENT)
3064		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3065	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3066		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
3067		if (load_nvlist(spa, spa->spa_spares.sav_object,
3068		    &spa->spa_spares.sav_config) != 0) {
3069			spa_load_failed(spa, "error loading spares nvlist");
3070			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3071		}
3072
3073		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3074		spa_load_spares(spa);
3075		spa_config_exit(spa, SCL_ALL, FTAG);
3076	} else if (error == 0) {
3077		spa->spa_spares.sav_sync = B_TRUE;
3078	}
3079
3080	/*
3081	 * Load any level 2 ARC devices for this pool.
3082	 */
3083	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
3084	    &spa->spa_l2cache.sav_object, B_FALSE);
3085	if (error != 0 && error != ENOENT)
3086		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3087	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3088		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
3089		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
3090		    &spa->spa_l2cache.sav_config) != 0) {
3091			spa_load_failed(spa, "error loading l2cache nvlist");
3092			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3093		}
3094
3095		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3096		spa_load_l2cache(spa);
3097		spa_config_exit(spa, SCL_ALL, FTAG);
3098	} else if (error == 0) {
3099		spa->spa_l2cache.sav_sync = B_TRUE;
3100	}
3101
3102	return (0);
3103}
3104
3105static int
3106spa_ld_load_vdev_metadata(spa_t *spa)
3107{
3108	int error = 0;
3109	vdev_t *rvd = spa->spa_root_vdev;
3110
3111	/*
3112	 * If the 'autoreplace' property is set, then post a resource notifying
3113	 * the ZFS DE that it should not issue any faults for unopenable
3114	 * devices.  We also iterate over the vdevs, and post a sysevent for any
3115	 * unopenable vdevs so that the normal autoreplace handler can take
3116	 * over.
3117	 */
3118	if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3119		spa_check_removed(spa->spa_root_vdev);
3120		/*
3121		 * For the import case, this is done in spa_import(), because
3122		 * at this point we're using the spare definitions from
3123		 * the MOS config, not necessarily from the userland config.
3124		 */
3125		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
3126			spa_aux_check_removed(&spa->spa_spares);
3127			spa_aux_check_removed(&spa->spa_l2cache);
3128		}
3129	}
3130
3131	/*
3132	 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
3133	 */
3134	error = vdev_load(rvd);
3135	if (error != 0) {
3136		spa_load_failed(spa, "vdev_load failed [error=%d]", error);
3137		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3138	}
3139
3140	/*
3141	 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
3142	 */
3143	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3144	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
3145	spa_config_exit(spa, SCL_ALL, FTAG);
3146
3147	return (0);
3148}
3149
3150static int
3151spa_ld_load_dedup_tables(spa_t *spa)
3152{
3153	int error = 0;
3154	vdev_t *rvd = spa->spa_root_vdev;
3155
3156	error = ddt_load(spa);
3157	if (error != 0) {
3158		spa_load_failed(spa, "ddt_load failed [error=%d]", error);
3159		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3160	}
3161
3162	return (0);
3163}
3164
3165static int
3166spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
3167{
3168	vdev_t *rvd = spa->spa_root_vdev;
3169
3170	if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
3171		boolean_t missing = spa_check_logs(spa);
3172		if (missing) {
3173			if (spa->spa_missing_tvds != 0) {
3174				spa_load_note(spa, "spa_check_logs failed "
3175				    "so dropping the logs");
3176			} else {
3177				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
3178				spa_load_failed(spa, "spa_check_logs failed");
3179				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
3180				    ENXIO));
3181			}
3182		}
3183	}
3184
3185	return (0);
3186}
3187
3188static int
3189spa_ld_verify_pool_data(spa_t *spa)
3190{
3191	int error = 0;
3192	vdev_t *rvd = spa->spa_root_vdev;
3193
3194	/*
3195	 * We've successfully opened the pool, verify that we're ready
3196	 * to start pushing transactions.
3197	 */
3198	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3199		error = spa_load_verify(spa);
3200		if (error != 0) {
3201			spa_load_failed(spa, "spa_load_verify failed "
3202			    "[error=%d]", error);
3203			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3204			    error));
3205		}
3206	}
3207
3208	return (0);
3209}
3210
3211static void
3212spa_ld_claim_log_blocks(spa_t *spa)
3213{
3214	dmu_tx_t *tx;
3215	dsl_pool_t *dp = spa_get_dsl(spa);
3216
3217	/*
3218	 * Claim log blocks that haven't been committed yet.
3219	 * This must all happen in a single txg.
3220	 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
3221	 * invoked from zil_claim_log_block()'s i/o done callback.
3222	 * Price of rollback is that we abandon the log.
3223	 */
3224	spa->spa_claiming = B_TRUE;
3225
3226	tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
3227	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
3228	    zil_claim, tx, DS_FIND_CHILDREN);
3229	dmu_tx_commit(tx);
3230
3231	spa->spa_claiming = B_FALSE;
3232
3233	spa_set_log_state(spa, SPA_LOG_GOOD);
3234}
3235
3236static void
3237spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
3238    boolean_t reloading)
3239{
3240	vdev_t *rvd = spa->spa_root_vdev;
3241	int need_update = B_FALSE;
3242
3243	/*
3244	 * If the config cache is stale, or we have uninitialized
3245	 * metaslabs (see spa_vdev_add()), then update the config.
3246	 *
3247	 * If this is a verbatim import, trust the current
3248	 * in-core spa_config and update the disk labels.
3249	 */
3250	if (reloading || config_cache_txg != spa->spa_config_txg ||
3251	    spa->spa_load_state == SPA_LOAD_IMPORT ||
3252	    spa->spa_load_state == SPA_LOAD_RECOVER ||
3253	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
3254		need_update = B_TRUE;
3255
3256	for (int c = 0; c < rvd->vdev_children; c++)
3257		if (rvd->vdev_child[c]->vdev_ms_array == 0)
3258			need_update = B_TRUE;
3259
3260	/*
3261	 * Update the config cache asychronously in case we're the
3262	 * root pool, in which case the config cache isn't writable yet.
3263	 */
3264	if (need_update)
3265		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3266}
3267
3268static void
3269spa_ld_prepare_for_reload(spa_t *spa)
3270{
3271	int mode = spa->spa_mode;
3272	int async_suspended = spa->spa_async_suspended;
3273
3274	spa_unload(spa);
3275	spa_deactivate(spa);
3276	spa_activate(spa, mode);
3277
3278	/*
3279	 * We save the value of spa_async_suspended as it gets reset to 0 by
3280	 * spa_unload(). We want to restore it back to the original value before
3281	 * returning as we might be calling spa_async_resume() later.
3282	 */
3283	spa->spa_async_suspended = async_suspended;
3284}
3285
3286/*
3287 * Load an existing storage pool, using the config provided. This config
3288 * describes which vdevs are part of the pool and is later validated against
3289 * partial configs present in each vdev's label and an entire copy of the
3290 * config stored in the MOS.
3291 */
3292static int
3293spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport,
3294    boolean_t reloading)
3295{
3296	int error = 0;
3297	boolean_t missing_feat_write = B_FALSE;
3298
3299	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3300	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3301
3302	/*
3303	 * Never trust the config that is provided unless we are assembling
3304	 * a pool following a split.
3305	 * This means don't trust blkptrs and the vdev tree in general. This
3306	 * also effectively puts the spa in read-only mode since
3307	 * spa_writeable() checks for spa_trust_config to be true.
3308	 * We will later load a trusted config from the MOS.
3309	 */
3310	if (type != SPA_IMPORT_ASSEMBLE)
3311		spa->spa_trust_config = B_FALSE;
3312
3313	if (reloading)
3314		spa_load_note(spa, "RELOADING");
3315	else
3316		spa_load_note(spa, "LOADING");
3317
3318	/*
3319	 * Parse the config provided to create a vdev tree.
3320	 */
3321	error = spa_ld_parse_config(spa, type);
3322	if (error != 0)
3323		return (error);
3324
3325	/*
3326	 * Now that we have the vdev tree, try to open each vdev. This involves
3327	 * opening the underlying physical device, retrieving its geometry and
3328	 * probing the vdev with a dummy I/O. The state of each vdev will be set
3329	 * based on the success of those operations. After this we'll be ready
3330	 * to read from the vdevs.
3331	 */
3332	error = spa_ld_open_vdevs(spa);
3333	if (error != 0)
3334		return (error);
3335
3336	/*
3337	 * Read the label of each vdev and make sure that the GUIDs stored
3338	 * there match the GUIDs in the config provided.
3339	 * If we're assembling a new pool that's been split off from an
3340	 * existing pool, the labels haven't yet been updated so we skip
3341	 * validation for now.
3342	 */
3343	if (type != SPA_IMPORT_ASSEMBLE) {
3344		error = spa_ld_validate_vdevs(spa);
3345		if (error != 0)
3346			return (error);
3347	}
3348
3349	/*
3350	 * Read vdev labels to find the best uberblock (i.e. latest, unless
3351	 * spa_load_max_txg is set) and store it in spa_uberblock. We get the
3352	 * list of features required to read blkptrs in the MOS from the vdev
3353	 * label with the best uberblock and verify that our version of zfs
3354	 * supports them all.
3355	 */
3356	error = spa_ld_select_uberblock(spa, type);
3357	if (error != 0)
3358		return (error);
3359
3360	/*
3361	 * Pass that uberblock to the dsl_pool layer which will open the root
3362	 * blkptr. This blkptr points to the latest version of the MOS and will
3363	 * allow us to read its contents.
3364	 */
3365	error = spa_ld_open_rootbp(spa);
3366	if (error != 0)
3367		return (error);
3368
3369	/*
3370	 * Retrieve the trusted config stored in the MOS and use it to create
3371	 * a new, exact version of the vdev tree, then reopen all vdevs.
3372	 */
3373	error = spa_ld_load_trusted_config(spa, type, reloading);
3374	if (error == EAGAIN) {
3375		VERIFY(!reloading);
3376		/*
3377		 * Redo the loading process with the trusted config if it is
3378		 * too different from the untrusted config.
3379		 */
3380		spa_ld_prepare_for_reload(spa);
3381		return (spa_load_impl(spa, type, ereport, B_TRUE));
3382	} else if (error != 0) {
3383		return (error);
3384	}
3385
3386	/*
3387	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
3388	 * from the pool and their contents were re-mapped to other vdevs. Note
3389	 * that everything that we read before this step must have been
3390	 * rewritten on concrete vdevs after the last device removal was
3391	 * initiated. Otherwise we could be reading from indirect vdevs before
3392	 * we have loaded their mappings.
3393	 */
3394	error = spa_ld_open_indirect_vdev_metadata(spa);
3395	if (error != 0)
3396		return (error);
3397
3398	/*
3399	 * Retrieve the full list of active features from the MOS and check if
3400	 * they are all supported.
3401	 */
3402	error = spa_ld_check_features(spa, &missing_feat_write);
3403	if (error != 0)
3404		return (error);
3405
3406	/*
3407	 * Load several special directories from the MOS needed by the dsl_pool
3408	 * layer.
3409	 */
3410	error = spa_ld_load_special_directories(spa);
3411	if (error != 0)
3412		return (error);
3413
3414	/*
3415	 * Retrieve pool properties from the MOS.
3416	 */
3417	error = spa_ld_get_props(spa);
3418	if (error != 0)
3419		return (error);
3420
3421	/*
3422	 * Retrieve the list of auxiliary devices - cache devices and spares -
3423	 * and open them.
3424	 */
3425	error = spa_ld_open_aux_vdevs(spa, type);
3426	if (error != 0)
3427		return (error);
3428
3429	/*
3430	 * Load the metadata for all vdevs. Also check if unopenable devices
3431	 * should be autoreplaced.
3432	 */
3433	error = spa_ld_load_vdev_metadata(spa);
3434	if (error != 0)
3435		return (error);
3436
3437	error = spa_ld_load_dedup_tables(spa);
3438	if (error != 0)
3439		return (error);
3440
3441	/*
3442	 * Verify the logs now to make sure we don't have any unexpected errors
3443	 * when we claim log blocks later.
3444	 */
3445	error = spa_ld_verify_logs(spa, type, ereport);
3446	if (error != 0)
3447		return (error);
3448
3449	if (missing_feat_write) {
3450		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
3451
3452		/*
3453		 * At this point, we know that we can open the pool in
3454		 * read-only mode but not read-write mode. We now have enough
3455		 * information and can return to userland.
3456		 */
3457		return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
3458		    ENOTSUP));
3459	}
3460
3461	/*
3462	 * Traverse the last txgs to make sure the pool was left off in a safe
3463	 * state. When performing an extreme rewind, we verify the whole pool,
3464	 * which can take a very long time.
3465	 */
3466	error = spa_ld_verify_pool_data(spa);
3467	if (error != 0)
3468		return (error);
3469
3470	/*
3471	 * Calculate the deflated space for the pool. This must be done before
3472	 * we write anything to the pool because we'd need to update the space
3473	 * accounting using the deflated sizes.
3474	 */
3475	spa_update_dspace(spa);
3476
3477	/*
3478	 * We have now retrieved all the information we needed to open the
3479	 * pool. If we are importing the pool in read-write mode, a few
3480	 * additional steps must be performed to finish the import.
3481	 */
3482	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
3483	    spa->spa_load_max_txg == UINT64_MAX)) {
3484		uint64_t config_cache_txg = spa->spa_config_txg;
3485
3486		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
3487
3488		/*
3489		 * We must check this before we start the sync thread, because
3490		 * we only want to start a condense thread for condense
3491		 * operations that were in progress when the pool was
3492		 * imported.  Once we start syncing, spa_sync() could
3493		 * initiate a condense (and start a thread for it).  In
3494		 * that case it would be wrong to start a second
3495		 * condense thread.
3496		 */
3497		boolean_t condense_in_progress =
3498		    (spa->spa_condensing_indirect != NULL);
3499
3500		/*
3501		 * Traverse the ZIL and claim all blocks.
3502		 */
3503		spa_ld_claim_log_blocks(spa);
3504
3505		/*
3506		 * Kick-off the syncing thread.
3507		 */
3508		spa->spa_sync_on = B_TRUE;
3509		txg_sync_start(spa->spa_dsl_pool);
3510
3511		/*
3512		 * Wait for all claims to sync.  We sync up to the highest
3513		 * claimed log block birth time so that claimed log blocks
3514		 * don't appear to be from the future.  spa_claim_max_txg
3515		 * will have been set for us by ZIL traversal operations
3516		 * performed above.
3517		 */
3518		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
3519
3520		/*
3521		 * Check if we need to request an update of the config. On the
3522		 * next sync, we would update the config stored in vdev labels
3523		 * and the cachefile (by default /etc/zfs/zpool.cache).
3524		 */
3525		spa_ld_check_for_config_update(spa, config_cache_txg,
3526		    reloading);
3527
3528		/*
3529		 * Check all DTLs to see if anything needs resilvering.
3530		 */
3531		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
3532		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
3533			spa_async_request(spa, SPA_ASYNC_RESILVER);
3534
3535		/*
3536		 * Log the fact that we booted up (so that we can detect if
3537		 * we rebooted in the middle of an operation).
3538		 */
3539		spa_history_log_version(spa, "open");
3540
3541		/*
3542		 * Delete any inconsistent datasets.
3543		 */
3544		(void) dmu_objset_find(spa_name(spa),
3545		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
3546
3547		/*
3548		 * Clean up any stale temporary dataset userrefs.
3549		 */
3550		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
3551
3552		/*
3553		 * Note: unlike condensing, we don't need an analogous
3554		 * "removal_in_progress" dance because no other thread
3555		 * can start a removal while we hold the spa_namespace_lock.
3556		 */
3557		spa_restart_removal(spa);
3558
3559		if (condense_in_progress)
3560			spa_condense_indirect_restart(spa);
3561	}
3562
3563	spa_load_note(spa, "LOADED");
3564
3565	return (0);
3566}
3567
3568static int
3569spa_load_retry(spa_t *spa, spa_load_state_t state)
3570{
3571	int mode = spa->spa_mode;
3572
3573	spa_unload(spa);
3574	spa_deactivate(spa);
3575
3576	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
3577
3578	spa_activate(spa, mode);
3579	spa_async_suspend(spa);
3580
3581	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
3582	    (u_longlong_t)spa->spa_load_max_txg);
3583
3584	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
3585}
3586
3587/*
3588 * If spa_load() fails this function will try loading prior txg's. If
3589 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
3590 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
3591 * function will not rewind the pool and will return the same error as
3592 * spa_load().
3593 */
3594static int
3595spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
3596    int rewind_flags)
3597{
3598	nvlist_t *loadinfo = NULL;
3599	nvlist_t *config = NULL;
3600	int load_error, rewind_error;
3601	uint64_t safe_rewind_txg;
3602	uint64_t min_txg;
3603
3604	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
3605		spa->spa_load_max_txg = spa->spa_load_txg;
3606		spa_set_log_state(spa, SPA_LOG_CLEAR);
3607	} else {
3608		spa->spa_load_max_txg = max_request;
3609		if (max_request != UINT64_MAX)
3610			spa->spa_extreme_rewind = B_TRUE;
3611	}
3612
3613	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
3614	if (load_error == 0)
3615		return (0);
3616
3617	if (spa->spa_root_vdev != NULL)
3618		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3619
3620	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
3621	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
3622
3623	if (rewind_flags & ZPOOL_NEVER_REWIND) {
3624		nvlist_free(config);
3625		return (load_error);
3626	}
3627
3628	if (state == SPA_LOAD_RECOVER) {
3629		/* Price of rolling back is discarding txgs, including log */
3630		spa_set_log_state(spa, SPA_LOG_CLEAR);
3631	} else {
3632		/*
3633		 * If we aren't rolling back save the load info from our first
3634		 * import attempt so that we can restore it after attempting
3635		 * to rewind.
3636		 */
3637		loadinfo = spa->spa_load_info;
3638		spa->spa_load_info = fnvlist_alloc();
3639	}
3640
3641	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
3642	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
3643	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
3644	    TXG_INITIAL : safe_rewind_txg;
3645
3646	/*
3647	 * Continue as long as we're finding errors, we're still within
3648	 * the acceptable rewind range, and we're still finding uberblocks
3649	 */
3650	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
3651	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
3652		if (spa->spa_load_max_txg < safe_rewind_txg)
3653			spa->spa_extreme_rewind = B_TRUE;
3654		rewind_error = spa_load_retry(spa, state);
3655	}
3656
3657	spa->spa_extreme_rewind = B_FALSE;
3658	spa->spa_load_max_txg = UINT64_MAX;
3659
3660	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
3661		spa_config_set(spa, config);
3662	else
3663		nvlist_free(config);
3664
3665	if (state == SPA_LOAD_RECOVER) {
3666		ASSERT3P(loadinfo, ==, NULL);
3667		return (rewind_error);
3668	} else {
3669		/* Store the rewind info as part of the initial load info */
3670		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
3671		    spa->spa_load_info);
3672
3673		/* Restore the initial load info */
3674		fnvlist_free(spa->spa_load_info);
3675		spa->spa_load_info = loadinfo;
3676
3677		return (load_error);
3678	}
3679}
3680
3681/*
3682 * Pool Open/Import
3683 *
3684 * The import case is identical to an open except that the configuration is sent
3685 * down from userland, instead of grabbed from the configuration cache.  For the
3686 * case of an open, the pool configuration will exist in the
3687 * POOL_STATE_UNINITIALIZED state.
3688 *
3689 * The stats information (gen/count/ustats) is used to gather vdev statistics at
3690 * the same time open the pool, without having to keep around the spa_t in some
3691 * ambiguous state.
3692 */
3693static int
3694spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
3695    nvlist_t **config)
3696{
3697	spa_t *spa;
3698	spa_load_state_t state = SPA_LOAD_OPEN;
3699	int error;
3700	int locked = B_FALSE;
3701
3702	*spapp = NULL;
3703
3704	/*
3705	 * As disgusting as this is, we need to support recursive calls to this
3706	 * function because dsl_dir_open() is called during spa_load(), and ends
3707	 * up calling spa_open() again.  The real fix is to figure out how to
3708	 * avoid dsl_dir_open() calling this in the first place.
3709	 */
3710	if (mutex_owner(&spa_namespace_lock) != curthread) {
3711		mutex_enter(&spa_namespace_lock);
3712		locked = B_TRUE;
3713	}
3714
3715	if ((spa = spa_lookup(pool)) == NULL) {
3716		if (locked)
3717			mutex_exit(&spa_namespace_lock);
3718		return (SET_ERROR(ENOENT));
3719	}
3720
3721	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
3722		zpool_rewind_policy_t policy;
3723
3724		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
3725		    &policy);
3726		if (policy.zrp_request & ZPOOL_DO_REWIND)
3727			state = SPA_LOAD_RECOVER;
3728
3729		spa_activate(spa, spa_mode_global);
3730
3731		if (state != SPA_LOAD_RECOVER)
3732			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3733		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
3734
3735		zfs_dbgmsg("spa_open_common: opening %s", pool);
3736		error = spa_load_best(spa, state, policy.zrp_txg,
3737		    policy.zrp_request);
3738
3739		if (error == EBADF) {
3740			/*
3741			 * If vdev_validate() returns failure (indicated by
3742			 * EBADF), it indicates that one of the vdevs indicates
3743			 * that the pool has been exported or destroyed.  If
3744			 * this is the case, the config cache is out of sync and
3745			 * we should remove the pool from the namespace.
3746			 */
3747			spa_unload(spa);
3748			spa_deactivate(spa);
3749			spa_write_cachefile(spa, B_TRUE, B_TRUE);
3750			spa_remove(spa);
3751			if (locked)
3752				mutex_exit(&spa_namespace_lock);
3753			return (SET_ERROR(ENOENT));
3754		}
3755
3756		if (error) {
3757			/*
3758			 * We can't open the pool, but we still have useful
3759			 * information: the state of each vdev after the
3760			 * attempted vdev_open().  Return this to the user.
3761			 */
3762			if (config != NULL && spa->spa_config) {
3763				VERIFY(nvlist_dup(spa->spa_config, config,
3764				    KM_SLEEP) == 0);
3765				VERIFY(nvlist_add_nvlist(*config,
3766				    ZPOOL_CONFIG_LOAD_INFO,
3767				    spa->spa_load_info) == 0);
3768			}
3769			spa_unload(spa);
3770			spa_deactivate(spa);
3771			spa->spa_last_open_failed = error;
3772			if (locked)
3773				mutex_exit(&spa_namespace_lock);
3774			*spapp = NULL;
3775			return (error);
3776		}
3777	}
3778
3779	spa_open_ref(spa, tag);
3780
3781	if (config != NULL)
3782		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3783
3784	/*
3785	 * If we've recovered the pool, pass back any information we
3786	 * gathered while doing the load.
3787	 */
3788	if (state == SPA_LOAD_RECOVER) {
3789		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
3790		    spa->spa_load_info) == 0);
3791	}
3792
3793	if (locked) {
3794		spa->spa_last_open_failed = 0;
3795		spa->spa_last_ubsync_txg = 0;
3796		spa->spa_load_txg = 0;
3797		mutex_exit(&spa_namespace_lock);
3798	}
3799
3800	*spapp = spa;
3801
3802	return (0);
3803}
3804
3805int
3806spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
3807    nvlist_t **config)
3808{
3809	return (spa_open_common(name, spapp, tag, policy, config));
3810}
3811
3812int
3813spa_open(const char *name, spa_t **spapp, void *tag)
3814{
3815	return (spa_open_common(name, spapp, tag, NULL, NULL));
3816}
3817
3818/*
3819 * Lookup the given spa_t, incrementing the inject count in the process,
3820 * preventing it from being exported or destroyed.
3821 */
3822spa_t *
3823spa_inject_addref(char *name)
3824{
3825	spa_t *spa;
3826
3827	mutex_enter(&spa_namespace_lock);
3828	if ((spa = spa_lookup(name)) == NULL) {
3829		mutex_exit(&spa_namespace_lock);
3830		return (NULL);
3831	}
3832	spa->spa_inject_ref++;
3833	mutex_exit(&spa_namespace_lock);
3834
3835	return (spa);
3836}
3837
3838void
3839spa_inject_delref(spa_t *spa)
3840{
3841	mutex_enter(&spa_namespace_lock);
3842	spa->spa_inject_ref--;
3843	mutex_exit(&spa_namespace_lock);
3844}
3845
3846/*
3847 * Add spares device information to the nvlist.
3848 */
3849static void
3850spa_add_spares(spa_t *spa, nvlist_t *config)
3851{
3852	nvlist_t **spares;
3853	uint_t i, nspares;
3854	nvlist_t *nvroot;
3855	uint64_t guid;
3856	vdev_stat_t *vs;
3857	uint_t vsc;
3858	uint64_t pool;
3859
3860	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3861
3862	if (spa->spa_spares.sav_count == 0)
3863		return;
3864
3865	VERIFY(nvlist_lookup_nvlist(config,
3866	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3867	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3868	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3869	if (nspares != 0) {
3870		VERIFY(nvlist_add_nvlist_array(nvroot,
3871		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3872		VERIFY(nvlist_lookup_nvlist_array(nvroot,
3873		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
3874
3875		/*
3876		 * Go through and find any spares which have since been
3877		 * repurposed as an active spare.  If this is the case, update
3878		 * their status appropriately.
3879		 */
3880		for (i = 0; i < nspares; i++) {
3881			VERIFY(nvlist_lookup_uint64(spares[i],
3882			    ZPOOL_CONFIG_GUID, &guid) == 0);
3883			if (spa_spare_exists(guid, &pool, NULL) &&
3884			    pool != 0ULL) {
3885				VERIFY(nvlist_lookup_uint64_array(
3886				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
3887				    (uint64_t **)&vs, &vsc) == 0);
3888				vs->vs_state = VDEV_STATE_CANT_OPEN;
3889				vs->vs_aux = VDEV_AUX_SPARED;
3890			}
3891		}
3892	}
3893}
3894
3895/*
3896 * Add l2cache device information to the nvlist, including vdev stats.
3897 */
3898static void
3899spa_add_l2cache(spa_t *spa, nvlist_t *config)
3900{
3901	nvlist_t **l2cache;
3902	uint_t i, j, nl2cache;
3903	nvlist_t *nvroot;
3904	uint64_t guid;
3905	vdev_t *vd;
3906	vdev_stat_t *vs;
3907	uint_t vsc;
3908
3909	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3910
3911	if (spa->spa_l2cache.sav_count == 0)
3912		return;
3913
3914	VERIFY(nvlist_lookup_nvlist(config,
3915	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3916	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3917	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3918	if (nl2cache != 0) {
3919		VERIFY(nvlist_add_nvlist_array(nvroot,
3920		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3921		VERIFY(nvlist_lookup_nvlist_array(nvroot,
3922		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3923
3924		/*
3925		 * Update level 2 cache device stats.
3926		 */
3927
3928		for (i = 0; i < nl2cache; i++) {
3929			VERIFY(nvlist_lookup_uint64(l2cache[i],
3930			    ZPOOL_CONFIG_GUID, &guid) == 0);
3931
3932			vd = NULL;
3933			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3934				if (guid ==
3935				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3936					vd = spa->spa_l2cache.sav_vdevs[j];
3937					break;
3938				}
3939			}
3940			ASSERT(vd != NULL);
3941
3942			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3943			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3944			    == 0);
3945			vdev_get_stats(vd, vs);
3946		}
3947	}
3948}
3949
3950static void
3951spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3952{
3953	nvlist_t *features;
3954	zap_cursor_t zc;
3955	zap_attribute_t za;
3956
3957	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3958	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3959
3960	if (spa->spa_feat_for_read_obj != 0) {
3961		for (zap_cursor_init(&zc, spa->spa_meta_objset,
3962		    spa->spa_feat_for_read_obj);
3963		    zap_cursor_retrieve(&zc, &za) == 0;
3964		    zap_cursor_advance(&zc)) {
3965			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3966			    za.za_num_integers == 1);
3967			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3968			    za.za_first_integer));
3969		}
3970		zap_cursor_fini(&zc);
3971	}
3972
3973	if (spa->spa_feat_for_write_obj != 0) {
3974		for (zap_cursor_init(&zc, spa->spa_meta_objset,
3975		    spa->spa_feat_for_write_obj);
3976		    zap_cursor_retrieve(&zc, &za) == 0;
3977		    zap_cursor_advance(&zc)) {
3978			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3979			    za.za_num_integers == 1);
3980			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3981			    za.za_first_integer));
3982		}
3983		zap_cursor_fini(&zc);
3984	}
3985
3986	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3987	    features) == 0);
3988	nvlist_free(features);
3989}
3990
3991int
3992spa_get_stats(const char *name, nvlist_t **config,
3993    char *altroot, size_t buflen)
3994{
3995	int error;
3996	spa_t *spa;
3997
3998	*config = NULL;
3999	error = spa_open_common(name, &spa, FTAG, NULL, config);
4000
4001	if (spa != NULL) {
4002		/*
4003		 * This still leaves a window of inconsistency where the spares
4004		 * or l2cache devices could change and the config would be
4005		 * self-inconsistent.
4006		 */
4007		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4008
4009		if (*config != NULL) {
4010			uint64_t loadtimes[2];
4011
4012			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
4013			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
4014			VERIFY(nvlist_add_uint64_array(*config,
4015			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
4016
4017			VERIFY(nvlist_add_uint64(*config,
4018			    ZPOOL_CONFIG_ERRCOUNT,
4019			    spa_get_errlog_size(spa)) == 0);
4020
4021			if (spa_suspended(spa))
4022				VERIFY(nvlist_add_uint64(*config,
4023				    ZPOOL_CONFIG_SUSPENDED,
4024				    spa->spa_failmode) == 0);
4025
4026			spa_add_spares(spa, *config);
4027			spa_add_l2cache(spa, *config);
4028			spa_add_feature_stats(spa, *config);
4029		}
4030	}
4031
4032	/*
4033	 * We want to get the alternate root even for faulted pools, so we cheat
4034	 * and call spa_lookup() directly.
4035	 */
4036	if (altroot) {
4037		if (spa == NULL) {
4038			mutex_enter(&spa_namespace_lock);
4039			spa = spa_lookup(name);
4040			if (spa)
4041				spa_altroot(spa, altroot, buflen);
4042			else
4043				altroot[0] = '\0';
4044			spa = NULL;
4045			mutex_exit(&spa_namespace_lock);
4046		} else {
4047			spa_altroot(spa, altroot, buflen);
4048		}
4049	}
4050
4051	if (spa != NULL) {
4052		spa_config_exit(spa, SCL_CONFIG, FTAG);
4053		spa_close(spa, FTAG);
4054	}
4055
4056	return (error);
4057}
4058
4059/*
4060 * Validate that the auxiliary device array is well formed.  We must have an
4061 * array of nvlists, each which describes a valid leaf vdev.  If this is an
4062 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
4063 * specified, as long as they are well-formed.
4064 */
4065static int
4066spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
4067    spa_aux_vdev_t *sav, const char *config, uint64_t version,
4068    vdev_labeltype_t label)
4069{
4070	nvlist_t **dev;
4071	uint_t i, ndev;
4072	vdev_t *vd;
4073	int error;
4074
4075	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4076
4077	/*
4078	 * It's acceptable to have no devs specified.
4079	 */
4080	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
4081		return (0);
4082
4083	if (ndev == 0)
4084		return (SET_ERROR(EINVAL));
4085
4086	/*
4087	 * Make sure the pool is formatted with a version that supports this
4088	 * device type.
4089	 */
4090	if (spa_version(spa) < version)
4091		return (SET_ERROR(ENOTSUP));
4092
4093	/*
4094	 * Set the pending device list so we correctly handle device in-use
4095	 * checking.
4096	 */
4097	sav->sav_pending = dev;
4098	sav->sav_npending = ndev;
4099
4100	for (i = 0; i < ndev; i++) {
4101		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
4102		    mode)) != 0)
4103			goto out;
4104
4105		if (!vd->vdev_ops->vdev_op_leaf) {
4106			vdev_free(vd);
4107			error = SET_ERROR(EINVAL);
4108			goto out;
4109		}
4110
4111		/*
4112		 * The L2ARC currently only supports disk devices in
4113		 * kernel context.  For user-level testing, we allow it.
4114		 */
4115#ifdef _KERNEL
4116		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
4117		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
4118			error = SET_ERROR(ENOTBLK);
4119			vdev_free(vd);
4120			goto out;
4121		}
4122#endif
4123		vd->vdev_top = vd;
4124
4125		if ((error = vdev_open(vd)) == 0 &&
4126		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
4127			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
4128			    vd->vdev_guid) == 0);
4129		}
4130
4131		vdev_free(vd);
4132
4133		if (error &&
4134		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
4135			goto out;
4136		else
4137			error = 0;
4138	}
4139
4140out:
4141	sav->sav_pending = NULL;
4142	sav->sav_npending = 0;
4143	return (error);
4144}
4145
4146static int
4147spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
4148{
4149	int error;
4150
4151	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4152
4153	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4154	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
4155	    VDEV_LABEL_SPARE)) != 0) {
4156		return (error);
4157	}
4158
4159	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4160	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
4161	    VDEV_LABEL_L2CACHE));
4162}
4163
4164static void
4165spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
4166    const char *config)
4167{
4168	int i;
4169
4170	if (sav->sav_config != NULL) {
4171		nvlist_t **olddevs;
4172		uint_t oldndevs;
4173		nvlist_t **newdevs;
4174
4175		/*
4176		 * Generate new dev list by concatentating with the
4177		 * current dev list.
4178		 */
4179		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
4180		    &olddevs, &oldndevs) == 0);
4181
4182		newdevs = kmem_alloc(sizeof (void *) *
4183		    (ndevs + oldndevs), KM_SLEEP);
4184		for (i = 0; i < oldndevs; i++)
4185			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
4186			    KM_SLEEP) == 0);
4187		for (i = 0; i < ndevs; i++)
4188			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
4189			    KM_SLEEP) == 0);
4190
4191		VERIFY(nvlist_remove(sav->sav_config, config,
4192		    DATA_TYPE_NVLIST_ARRAY) == 0);
4193
4194		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
4195		    config, newdevs, ndevs + oldndevs) == 0);
4196		for (i = 0; i < oldndevs + ndevs; i++)
4197			nvlist_free(newdevs[i]);
4198		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
4199	} else {
4200		/*
4201		 * Generate a new dev list.
4202		 */
4203		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
4204		    KM_SLEEP) == 0);
4205		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
4206		    devs, ndevs) == 0);
4207	}
4208}
4209
4210/*
4211 * Stop and drop level 2 ARC devices
4212 */
4213void
4214spa_l2cache_drop(spa_t *spa)
4215{
4216	vdev_t *vd;
4217	int i;
4218	spa_aux_vdev_t *sav = &spa->spa_l2cache;
4219
4220	for (i = 0; i < sav->sav_count; i++) {
4221		uint64_t pool;
4222
4223		vd = sav->sav_vdevs[i];
4224		ASSERT(vd != NULL);
4225
4226		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
4227		    pool != 0ULL && l2arc_vdev_present(vd))
4228			l2arc_remove_vdev(vd);
4229	}
4230}
4231
4232/*
4233 * Pool Creation
4234 */
4235int
4236spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
4237    nvlist_t *zplprops)
4238{
4239	spa_t *spa;
4240	char *altroot = NULL;
4241	vdev_t *rvd;
4242	dsl_pool_t *dp;
4243	dmu_tx_t *tx;
4244	int error = 0;
4245	uint64_t txg = TXG_INITIAL;
4246	nvlist_t **spares, **l2cache;
4247	uint_t nspares, nl2cache;
4248	uint64_t version, obj;
4249	boolean_t has_features;
4250
4251	/*
4252	 * If this pool already exists, return failure.
4253	 */
4254	mutex_enter(&spa_namespace_lock);
4255	if (spa_lookup(pool) != NULL) {
4256		mutex_exit(&spa_namespace_lock);
4257		return (SET_ERROR(EEXIST));
4258	}
4259
4260	/*
4261	 * Allocate a new spa_t structure.
4262	 */
4263	(void) nvlist_lookup_string(props,
4264	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4265	spa = spa_add(pool, NULL, altroot);
4266	spa_activate(spa, spa_mode_global);
4267
4268	if (props && (error = spa_prop_validate(spa, props))) {
4269		spa_deactivate(spa);
4270		spa_remove(spa);
4271		mutex_exit(&spa_namespace_lock);
4272		return (error);
4273	}
4274
4275	has_features = B_FALSE;
4276	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
4277	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
4278		if (zpool_prop_feature(nvpair_name(elem)))
4279			has_features = B_TRUE;
4280	}
4281
4282	if (has_features || nvlist_lookup_uint64(props,
4283	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
4284		version = SPA_VERSION;
4285	}
4286	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
4287
4288	spa->spa_first_txg = txg;
4289	spa->spa_uberblock.ub_txg = txg - 1;
4290	spa->spa_uberblock.ub_version = version;
4291	spa->spa_ubsync = spa->spa_uberblock;
4292	spa->spa_load_state = SPA_LOAD_CREATE;
4293	spa->spa_removing_phys.sr_state = DSS_NONE;
4294	spa->spa_removing_phys.sr_removing_vdev = -1;
4295	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
4296
4297	/*
4298	 * Create "The Godfather" zio to hold all async IOs
4299	 */
4300	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
4301	    KM_SLEEP);
4302	for (int i = 0; i < max_ncpus; i++) {
4303		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
4304		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
4305		    ZIO_FLAG_GODFATHER);
4306	}
4307
4308	/*
4309	 * Create the root vdev.
4310	 */
4311	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4312
4313	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
4314
4315	ASSERT(error != 0 || rvd != NULL);
4316	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
4317
4318	if (error == 0 && !zfs_allocatable_devs(nvroot))
4319		error = SET_ERROR(EINVAL);
4320
4321	if (error == 0 &&
4322	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
4323	    (error = spa_validate_aux(spa, nvroot, txg,
4324	    VDEV_ALLOC_ADD)) == 0) {
4325		for (int c = 0; c < rvd->vdev_children; c++) {
4326			vdev_metaslab_set_size(rvd->vdev_child[c]);
4327			vdev_expand(rvd->vdev_child[c], txg);
4328		}
4329	}
4330
4331	spa_config_exit(spa, SCL_ALL, FTAG);
4332
4333	if (error != 0) {
4334		spa_unload(spa);
4335		spa_deactivate(spa);
4336		spa_remove(spa);
4337		mutex_exit(&spa_namespace_lock);
4338		return (error);
4339	}
4340
4341	/*
4342	 * Get the list of spares, if specified.
4343	 */
4344	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4345	    &spares, &nspares) == 0) {
4346		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
4347		    KM_SLEEP) == 0);
4348		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4349		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4350		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4351		spa_load_spares(spa);
4352		spa_config_exit(spa, SCL_ALL, FTAG);
4353		spa->spa_spares.sav_sync = B_TRUE;
4354	}
4355
4356	/*
4357	 * Get the list of level 2 cache devices, if specified.
4358	 */
4359	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4360	    &l2cache, &nl2cache) == 0) {
4361		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4362		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
4363		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4364		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4365		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4366		spa_load_l2cache(spa);
4367		spa_config_exit(spa, SCL_ALL, FTAG);
4368		spa->spa_l2cache.sav_sync = B_TRUE;
4369	}
4370
4371	spa->spa_is_initializing = B_TRUE;
4372	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
4373	spa->spa_meta_objset = dp->dp_meta_objset;
4374	spa->spa_is_initializing = B_FALSE;
4375
4376	/*
4377	 * Create DDTs (dedup tables).
4378	 */
4379	ddt_create(spa);
4380
4381	spa_update_dspace(spa);
4382
4383	tx = dmu_tx_create_assigned(dp, txg);
4384
4385	/*
4386	 * Create the pool config object.
4387	 */
4388	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
4389	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
4390	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
4391
4392	if (zap_add(spa->spa_meta_objset,
4393	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
4394	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
4395		cmn_err(CE_PANIC, "failed to add pool config");
4396	}
4397
4398	if (spa_version(spa) >= SPA_VERSION_FEATURES)
4399		spa_feature_create_zap_objects(spa, tx);
4400
4401	if (zap_add(spa->spa_meta_objset,
4402	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
4403	    sizeof (uint64_t), 1, &version, tx) != 0) {
4404		cmn_err(CE_PANIC, "failed to add pool version");
4405	}
4406
4407	/* Newly created pools with the right version are always deflated. */
4408	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
4409		spa->spa_deflate = TRUE;
4410		if (zap_add(spa->spa_meta_objset,
4411		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4412		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
4413			cmn_err(CE_PANIC, "failed to add deflate");
4414		}
4415	}
4416
4417	/*
4418	 * Create the deferred-free bpobj.  Turn off compression
4419	 * because sync-to-convergence takes longer if the blocksize
4420	 * keeps changing.
4421	 */
4422	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
4423	dmu_object_set_compress(spa->spa_meta_objset, obj,
4424	    ZIO_COMPRESS_OFF, tx);
4425	if (zap_add(spa->spa_meta_objset,
4426	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
4427	    sizeof (uint64_t), 1, &obj, tx) != 0) {
4428		cmn_err(CE_PANIC, "failed to add bpobj");
4429	}
4430	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
4431	    spa->spa_meta_objset, obj));
4432
4433	/*
4434	 * Create the pool's history object.
4435	 */
4436	if (version >= SPA_VERSION_ZPOOL_HISTORY)
4437		spa_history_create_obj(spa, tx);
4438
4439	/*
4440	 * Generate some random noise for salted checksums to operate on.
4441	 */
4442	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
4443	    sizeof (spa->spa_cksum_salt.zcs_bytes));
4444
4445	/*
4446	 * Set pool properties.
4447	 */
4448	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
4449	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
4450	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
4451	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
4452
4453	if (props != NULL) {
4454		spa_configfile_set(spa, props, B_FALSE);
4455		spa_sync_props(props, tx);
4456	}
4457
4458	dmu_tx_commit(tx);
4459
4460	spa->spa_sync_on = B_TRUE;
4461	txg_sync_start(spa->spa_dsl_pool);
4462
4463	/*
4464	 * We explicitly wait for the first transaction to complete so that our
4465	 * bean counters are appropriately updated.
4466	 */
4467	txg_wait_synced(spa->spa_dsl_pool, txg);
4468
4469	spa_write_cachefile(spa, B_FALSE, B_TRUE);
4470	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
4471
4472	spa_history_log_version(spa, "create");
4473
4474	/*
4475	 * Don't count references from objsets that are already closed
4476	 * and are making their way through the eviction process.
4477	 */
4478	spa_evicting_os_wait(spa);
4479	spa->spa_minref = refcount_count(&spa->spa_refcount);
4480	spa->spa_load_state = SPA_LOAD_NONE;
4481
4482	mutex_exit(&spa_namespace_lock);
4483
4484	return (0);
4485}
4486
4487#ifdef _KERNEL
4488/*
4489 * Get the root pool information from the root disk, then import the root pool
4490 * during the system boot up time.
4491 */
4492extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
4493
4494static nvlist_t *
4495spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
4496{
4497	nvlist_t *config;
4498	nvlist_t *nvtop, *nvroot;
4499	uint64_t pgid;
4500
4501	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
4502		return (NULL);
4503
4504	/*
4505	 * Add this top-level vdev to the child array.
4506	 */
4507	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4508	    &nvtop) == 0);
4509	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4510	    &pgid) == 0);
4511	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
4512
4513	/*
4514	 * Put this pool's top-level vdevs into a root vdev.
4515	 */
4516	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4517	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
4518	    VDEV_TYPE_ROOT) == 0);
4519	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
4520	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
4521	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
4522	    &nvtop, 1) == 0);
4523
4524	/*
4525	 * Replace the existing vdev_tree with the new root vdev in
4526	 * this pool's configuration (remove the old, add the new).
4527	 */
4528	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
4529	nvlist_free(nvroot);
4530	return (config);
4531}
4532
4533/*
4534 * Walk the vdev tree and see if we can find a device with "better"
4535 * configuration. A configuration is "better" if the label on that
4536 * device has a more recent txg.
4537 */
4538static void
4539spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
4540{
4541	for (int c = 0; c < vd->vdev_children; c++)
4542		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
4543
4544	if (vd->vdev_ops->vdev_op_leaf) {
4545		nvlist_t *label;
4546		uint64_t label_txg;
4547
4548		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
4549		    &label) != 0)
4550			return;
4551
4552		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
4553		    &label_txg) == 0);
4554
4555		/*
4556		 * Do we have a better boot device?
4557		 */
4558		if (label_txg > *txg) {
4559			*txg = label_txg;
4560			*avd = vd;
4561		}
4562		nvlist_free(label);
4563	}
4564}
4565
4566/*
4567 * Import a root pool.
4568 *
4569 * For x86. devpath_list will consist of devid and/or physpath name of
4570 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
4571 * The GRUB "findroot" command will return the vdev we should boot.
4572 *
4573 * For Sparc, devpath_list consists the physpath name of the booting device
4574 * no matter the rootpool is a single device pool or a mirrored pool.
4575 * e.g.
4576 *	"/pci@1f,0/ide@d/disk@0,0:a"
4577 */
4578int
4579spa_import_rootpool(char *devpath, char *devid)
4580{
4581	spa_t *spa;
4582	vdev_t *rvd, *bvd, *avd = NULL;
4583	nvlist_t *config, *nvtop;
4584	uint64_t guid, txg;
4585	char *pname;
4586	int error;
4587
4588	/*
4589	 * Read the label from the boot device and generate a configuration.
4590	 */
4591	config = spa_generate_rootconf(devpath, devid, &guid);
4592#if defined(_OBP) && defined(_KERNEL)
4593	if (config == NULL) {
4594		if (strstr(devpath, "/iscsi/ssd") != NULL) {
4595			/* iscsi boot */
4596			get_iscsi_bootpath_phy(devpath);
4597			config = spa_generate_rootconf(devpath, devid, &guid);
4598		}
4599	}
4600#endif
4601	if (config == NULL) {
4602		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
4603		    devpath);
4604		return (SET_ERROR(EIO));
4605	}
4606
4607	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
4608	    &pname) == 0);
4609	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
4610
4611	mutex_enter(&spa_namespace_lock);
4612	if ((spa = spa_lookup(pname)) != NULL) {
4613		/*
4614		 * Remove the existing root pool from the namespace so that we
4615		 * can replace it with the correct config we just read in.
4616		 */
4617		spa_remove(spa);
4618	}
4619
4620	spa = spa_add(pname, config, NULL);
4621	spa->spa_is_root = B_TRUE;
4622	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
4623	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
4624	    &spa->spa_ubsync.ub_version) != 0)
4625		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
4626
4627	/*
4628	 * Build up a vdev tree based on the boot device's label config.
4629	 */
4630	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4631	    &nvtop) == 0);
4632	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4633	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
4634	    VDEV_ALLOC_ROOTPOOL);
4635	spa_config_exit(spa, SCL_ALL, FTAG);
4636	if (error) {
4637		mutex_exit(&spa_namespace_lock);
4638		nvlist_free(config);
4639		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
4640		    pname);
4641		return (error);
4642	}
4643
4644	/*
4645	 * Get the boot vdev.
4646	 */
4647	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
4648		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
4649		    (u_longlong_t)guid);
4650		error = SET_ERROR(ENOENT);
4651		goto out;
4652	}
4653
4654	/*
4655	 * Determine if there is a better boot device.
4656	 */
4657	avd = bvd;
4658	spa_alt_rootvdev(rvd, &avd, &txg);
4659	if (avd != bvd) {
4660		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
4661		    "try booting from '%s'", avd->vdev_path);
4662		error = SET_ERROR(EINVAL);
4663		goto out;
4664	}
4665
4666	/*
4667	 * If the boot device is part of a spare vdev then ensure that
4668	 * we're booting off the active spare.
4669	 */
4670	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4671	    !bvd->vdev_isspare) {
4672		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
4673		    "try booting from '%s'",
4674		    bvd->vdev_parent->
4675		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
4676		error = SET_ERROR(EINVAL);
4677		goto out;
4678	}
4679
4680	error = 0;
4681out:
4682	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4683	vdev_free(rvd);
4684	spa_config_exit(spa, SCL_ALL, FTAG);
4685	mutex_exit(&spa_namespace_lock);
4686
4687	nvlist_free(config);
4688	return (error);
4689}
4690
4691#endif
4692
4693/*
4694 * Import a non-root pool into the system.
4695 */
4696int
4697spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
4698{
4699	spa_t *spa;
4700	char *altroot = NULL;
4701	spa_load_state_t state = SPA_LOAD_IMPORT;
4702	zpool_rewind_policy_t policy;
4703	uint64_t mode = spa_mode_global;
4704	uint64_t readonly = B_FALSE;
4705	int error;
4706	nvlist_t *nvroot;
4707	nvlist_t **spares, **l2cache;
4708	uint_t nspares, nl2cache;
4709
4710	/*
4711	 * If a pool with this name exists, return failure.
4712	 */
4713	mutex_enter(&spa_namespace_lock);
4714	if (spa_lookup(pool) != NULL) {
4715		mutex_exit(&spa_namespace_lock);
4716		return (SET_ERROR(EEXIST));
4717	}
4718
4719	/*
4720	 * Create and initialize the spa structure.
4721	 */
4722	(void) nvlist_lookup_string(props,
4723	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4724	(void) nvlist_lookup_uint64(props,
4725	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
4726	if (readonly)
4727		mode = FREAD;
4728	spa = spa_add(pool, config, altroot);
4729	spa->spa_import_flags = flags;
4730
4731	/*
4732	 * Verbatim import - Take a pool and insert it into the namespace
4733	 * as if it had been loaded at boot.
4734	 */
4735	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
4736		if (props != NULL)
4737			spa_configfile_set(spa, props, B_FALSE);
4738
4739		spa_write_cachefile(spa, B_FALSE, B_TRUE);
4740		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
4741		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
4742		mutex_exit(&spa_namespace_lock);
4743		return (0);
4744	}
4745
4746	spa_activate(spa, mode);
4747
4748	/*
4749	 * Don't start async tasks until we know everything is healthy.
4750	 */
4751	spa_async_suspend(spa);
4752
4753	zpool_get_rewind_policy(config, &policy);
4754	if (policy.zrp_request & ZPOOL_DO_REWIND)
4755		state = SPA_LOAD_RECOVER;
4756
4757	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
4758
4759	if (state != SPA_LOAD_RECOVER) {
4760		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4761		zfs_dbgmsg("spa_import: importing %s", pool);
4762	} else {
4763		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
4764		    "(RECOVERY MODE)", pool, (longlong_t)policy.zrp_txg);
4765	}
4766	error = spa_load_best(spa, state, policy.zrp_txg, policy.zrp_request);
4767
4768	/*
4769	 * Propagate anything learned while loading the pool and pass it
4770	 * back to caller (i.e. rewind info, missing devices, etc).
4771	 */
4772	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4773	    spa->spa_load_info) == 0);
4774
4775	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4776	/*
4777	 * Toss any existing sparelist, as it doesn't have any validity
4778	 * anymore, and conflicts with spa_has_spare().
4779	 */
4780	if (spa->spa_spares.sav_config) {
4781		nvlist_free(spa->spa_spares.sav_config);
4782		spa->spa_spares.sav_config = NULL;
4783		spa_load_spares(spa);
4784	}
4785	if (spa->spa_l2cache.sav_config) {
4786		nvlist_free(spa->spa_l2cache.sav_config);
4787		spa->spa_l2cache.sav_config = NULL;
4788		spa_load_l2cache(spa);
4789	}
4790
4791	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4792	    &nvroot) == 0);
4793	if (error == 0)
4794		error = spa_validate_aux(spa, nvroot, -1ULL,
4795		    VDEV_ALLOC_SPARE);
4796	if (error == 0)
4797		error = spa_validate_aux(spa, nvroot, -1ULL,
4798		    VDEV_ALLOC_L2CACHE);
4799	spa_config_exit(spa, SCL_ALL, FTAG);
4800
4801	if (props != NULL)
4802		spa_configfile_set(spa, props, B_FALSE);
4803
4804	if (error != 0 || (props && spa_writeable(spa) &&
4805	    (error = spa_prop_set(spa, props)))) {
4806		spa_unload(spa);
4807		spa_deactivate(spa);
4808		spa_remove(spa);
4809		mutex_exit(&spa_namespace_lock);
4810		return (error);
4811	}
4812
4813	spa_async_resume(spa);
4814
4815	/*
4816	 * Override any spares and level 2 cache devices as specified by
4817	 * the user, as these may have correct device names/devids, etc.
4818	 */
4819	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4820	    &spares, &nspares) == 0) {
4821		if (spa->spa_spares.sav_config)
4822			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
4823			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
4824		else
4825			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
4826			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
4827		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4828		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4829		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4830		spa_load_spares(spa);
4831		spa_config_exit(spa, SCL_ALL, FTAG);
4832		spa->spa_spares.sav_sync = B_TRUE;
4833	}
4834	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4835	    &l2cache, &nl2cache) == 0) {
4836		if (spa->spa_l2cache.sav_config)
4837			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
4838			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
4839		else
4840			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4841			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
4842		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4843		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4844		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4845		spa_load_l2cache(spa);
4846		spa_config_exit(spa, SCL_ALL, FTAG);
4847		spa->spa_l2cache.sav_sync = B_TRUE;
4848	}
4849
4850	/*
4851	 * Check for any removed devices.
4852	 */
4853	if (spa->spa_autoreplace) {
4854		spa_aux_check_removed(&spa->spa_spares);
4855		spa_aux_check_removed(&spa->spa_l2cache);
4856	}
4857
4858	if (spa_writeable(spa)) {
4859		/*
4860		 * Update the config cache to include the newly-imported pool.
4861		 */
4862		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4863	}
4864
4865	/*
4866	 * It's possible that the pool was expanded while it was exported.
4867	 * We kick off an async task to handle this for us.
4868	 */
4869	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
4870
4871	spa_history_log_version(spa, "import");
4872
4873	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
4874
4875	mutex_exit(&spa_namespace_lock);
4876
4877	return (0);
4878}
4879
4880nvlist_t *
4881spa_tryimport(nvlist_t *tryconfig)
4882{
4883	nvlist_t *config = NULL;
4884	char *poolname, *cachefile;
4885	spa_t *spa;
4886	uint64_t state;
4887	int error;
4888	zpool_rewind_policy_t policy;
4889
4890	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
4891		return (NULL);
4892
4893	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
4894		return (NULL);
4895
4896	/*
4897	 * Create and initialize the spa structure.
4898	 */
4899	mutex_enter(&spa_namespace_lock);
4900	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
4901	spa_activate(spa, FREAD);
4902
4903	/*
4904	 * Rewind pool if a max txg was provided. Note that even though we
4905	 * retrieve the complete rewind policy, only the rewind txg is relevant
4906	 * for tryimport.
4907	 */
4908	zpool_get_rewind_policy(spa->spa_config, &policy);
4909	if (policy.zrp_txg != UINT64_MAX) {
4910		spa->spa_load_max_txg = policy.zrp_txg;
4911		spa->spa_extreme_rewind = B_TRUE;
4912		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
4913		    poolname, (longlong_t)policy.zrp_txg);
4914	} else {
4915		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
4916	}
4917
4918	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
4919	    == 0) {
4920		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
4921		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
4922	} else {
4923		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
4924	}
4925
4926	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
4927
4928	/*
4929	 * If 'tryconfig' was at least parsable, return the current config.
4930	 */
4931	if (spa->spa_root_vdev != NULL) {
4932		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4933		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4934		    poolname) == 0);
4935		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4936		    state) == 0);
4937		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4938		    spa->spa_uberblock.ub_timestamp) == 0);
4939		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4940		    spa->spa_load_info) == 0);
4941
4942		/*
4943		 * If the bootfs property exists on this pool then we
4944		 * copy it out so that external consumers can tell which
4945		 * pools are bootable.
4946		 */
4947		if ((!error || error == EEXIST) && spa->spa_bootfs) {
4948			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4949
4950			/*
4951			 * We have to play games with the name since the
4952			 * pool was opened as TRYIMPORT_NAME.
4953			 */
4954			if (dsl_dsobj_to_dsname(spa_name(spa),
4955			    spa->spa_bootfs, tmpname) == 0) {
4956				char *cp;
4957				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4958
4959				cp = strchr(tmpname, '/');
4960				if (cp == NULL) {
4961					(void) strlcpy(dsname, tmpname,
4962					    MAXPATHLEN);
4963				} else {
4964					(void) snprintf(dsname, MAXPATHLEN,
4965					    "%s/%s", poolname, ++cp);
4966				}
4967				VERIFY(nvlist_add_string(config,
4968				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4969				kmem_free(dsname, MAXPATHLEN);
4970			}
4971			kmem_free(tmpname, MAXPATHLEN);
4972		}
4973
4974		/*
4975		 * Add the list of hot spares and level 2 cache devices.
4976		 */
4977		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4978		spa_add_spares(spa, config);
4979		spa_add_l2cache(spa, config);
4980		spa_config_exit(spa, SCL_CONFIG, FTAG);
4981	}
4982
4983	spa_unload(spa);
4984	spa_deactivate(spa);
4985	spa_remove(spa);
4986	mutex_exit(&spa_namespace_lock);
4987
4988	return (config);
4989}
4990
4991/*
4992 * Pool export/destroy
4993 *
4994 * The act of destroying or exporting a pool is very simple.  We make sure there
4995 * is no more pending I/O and any references to the pool are gone.  Then, we
4996 * update the pool state and sync all the labels to disk, removing the
4997 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4998 * we don't sync the labels or remove the configuration cache.
4999 */
5000static int
5001spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
5002    boolean_t force, boolean_t hardforce)
5003{
5004	spa_t *spa;
5005
5006	if (oldconfig)
5007		*oldconfig = NULL;
5008
5009	if (!(spa_mode_global & FWRITE))
5010		return (SET_ERROR(EROFS));
5011
5012	mutex_enter(&spa_namespace_lock);
5013	if ((spa = spa_lookup(pool)) == NULL) {
5014		mutex_exit(&spa_namespace_lock);
5015		return (SET_ERROR(ENOENT));
5016	}
5017
5018	/*
5019	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
5020	 * reacquire the namespace lock, and see if we can export.
5021	 */
5022	spa_open_ref(spa, FTAG);
5023	mutex_exit(&spa_namespace_lock);
5024	spa_async_suspend(spa);
5025	mutex_enter(&spa_namespace_lock);
5026	spa_close(spa, FTAG);
5027
5028	/*
5029	 * The pool will be in core if it's openable,
5030	 * in which case we can modify its state.
5031	 */
5032	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
5033		/*
5034		 * Objsets may be open only because they're dirty, so we
5035		 * have to force it to sync before checking spa_refcnt.
5036		 */
5037		txg_wait_synced(spa->spa_dsl_pool, 0);
5038		spa_evicting_os_wait(spa);
5039
5040		/*
5041		 * A pool cannot be exported or destroyed if there are active
5042		 * references.  If we are resetting a pool, allow references by
5043		 * fault injection handlers.
5044		 */
5045		if (!spa_refcount_zero(spa) ||
5046		    (spa->spa_inject_ref != 0 &&
5047		    new_state != POOL_STATE_UNINITIALIZED)) {
5048			spa_async_resume(spa);
5049			mutex_exit(&spa_namespace_lock);
5050			return (SET_ERROR(EBUSY));
5051		}
5052
5053		/*
5054		 * A pool cannot be exported if it has an active shared spare.
5055		 * This is to prevent other pools stealing the active spare
5056		 * from an exported pool. At user's own will, such pool can
5057		 * be forcedly exported.
5058		 */
5059		if (!force && new_state == POOL_STATE_EXPORTED &&
5060		    spa_has_active_shared_spare(spa)) {
5061			spa_async_resume(spa);
5062			mutex_exit(&spa_namespace_lock);
5063			return (SET_ERROR(EXDEV));
5064		}
5065
5066		/*
5067		 * We want this to be reflected on every label,
5068		 * so mark them all dirty.  spa_unload() will do the
5069		 * final sync that pushes these changes out.
5070		 */
5071		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
5072			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5073			spa->spa_state = new_state;
5074			spa->spa_final_txg = spa_last_synced_txg(spa) +
5075			    TXG_DEFER_SIZE + 1;
5076			vdev_config_dirty(spa->spa_root_vdev);
5077			spa_config_exit(spa, SCL_ALL, FTAG);
5078		}
5079	}
5080
5081	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
5082
5083	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
5084		spa_unload(spa);
5085		spa_deactivate(spa);
5086	}
5087
5088	if (oldconfig && spa->spa_config)
5089		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
5090
5091	if (new_state != POOL_STATE_UNINITIALIZED) {
5092		if (!hardforce)
5093			spa_write_cachefile(spa, B_TRUE, B_TRUE);
5094		spa_remove(spa);
5095	}
5096	mutex_exit(&spa_namespace_lock);
5097
5098	return (0);
5099}
5100
5101/*
5102 * Destroy a storage pool.
5103 */
5104int
5105spa_destroy(char *pool)
5106{
5107	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
5108	    B_FALSE, B_FALSE));
5109}
5110
5111/*
5112 * Export a storage pool.
5113 */
5114int
5115spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
5116    boolean_t hardforce)
5117{
5118	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
5119	    force, hardforce));
5120}
5121
5122/*
5123 * Similar to spa_export(), this unloads the spa_t without actually removing it
5124 * from the namespace in any way.
5125 */
5126int
5127spa_reset(char *pool)
5128{
5129	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
5130	    B_FALSE, B_FALSE));
5131}
5132
5133/*
5134 * ==========================================================================
5135 * Device manipulation
5136 * ==========================================================================
5137 */
5138
5139/*
5140 * Add a device to a storage pool.
5141 */
5142int
5143spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
5144{
5145	uint64_t txg, id;
5146	int error;
5147	vdev_t *rvd = spa->spa_root_vdev;
5148	vdev_t *vd, *tvd;
5149	nvlist_t **spares, **l2cache;
5150	uint_t nspares, nl2cache;
5151
5152	ASSERT(spa_writeable(spa));
5153
5154	txg = spa_vdev_enter(spa);
5155
5156	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
5157	    VDEV_ALLOC_ADD)) != 0)
5158		return (spa_vdev_exit(spa, NULL, txg, error));
5159
5160	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
5161
5162	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
5163	    &nspares) != 0)
5164		nspares = 0;
5165
5166	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
5167	    &nl2cache) != 0)
5168		nl2cache = 0;
5169
5170	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
5171		return (spa_vdev_exit(spa, vd, txg, EINVAL));
5172
5173	if (vd->vdev_children != 0 &&
5174	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
5175		return (spa_vdev_exit(spa, vd, txg, error));
5176
5177	/*
5178	 * We must validate the spares and l2cache devices after checking the
5179	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
5180	 */
5181	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
5182		return (spa_vdev_exit(spa, vd, txg, error));
5183
5184	/*
5185	 * If we are in the middle of a device removal, we can only add
5186	 * devices which match the existing devices in the pool.
5187	 * If we are in the middle of a removal, or have some indirect
5188	 * vdevs, we can not add raidz toplevels.
5189	 */
5190	if (spa->spa_vdev_removal != NULL ||
5191	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
5192		for (int c = 0; c < vd->vdev_children; c++) {
5193			tvd = vd->vdev_child[c];
5194			if (spa->spa_vdev_removal != NULL &&
5195			    tvd->vdev_ashift !=
5196			    spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
5197				return (spa_vdev_exit(spa, vd, txg, EINVAL));
5198			}
5199			/* Fail if top level vdev is raidz */
5200			if (tvd->vdev_ops == &vdev_raidz_ops) {
5201				return (spa_vdev_exit(spa, vd, txg, EINVAL));
5202			}
5203			/*
5204			 * Need the top level mirror to be
5205			 * a mirror of leaf vdevs only
5206			 */
5207			if (tvd->vdev_ops == &vdev_mirror_ops) {
5208				for (uint64_t cid = 0;
5209				    cid < tvd->vdev_children; cid++) {
5210					vdev_t *cvd = tvd->vdev_child[cid];
5211					if (!cvd->vdev_ops->vdev_op_leaf) {
5212						return (spa_vdev_exit(spa, vd,
5213						    txg, EINVAL));
5214					}
5215				}
5216			}
5217		}
5218	}
5219
5220	for (int c = 0; c < vd->vdev_children; c++) {
5221
5222		/*
5223		 * Set the vdev id to the first hole, if one exists.
5224		 */
5225		for (id = 0; id < rvd->vdev_children; id++) {
5226			if (rvd->vdev_child[id]->vdev_ishole) {
5227				vdev_free(rvd->vdev_child[id]);
5228				break;
5229			}
5230		}
5231		tvd = vd->vdev_child[c];
5232		vdev_remove_child(vd, tvd);
5233		tvd->vdev_id = id;
5234		vdev_add_child(rvd, tvd);
5235		vdev_config_dirty(tvd);
5236	}
5237
5238	if (nspares != 0) {
5239		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
5240		    ZPOOL_CONFIG_SPARES);
5241		spa_load_spares(spa);
5242		spa->spa_spares.sav_sync = B_TRUE;
5243	}
5244
5245	if (nl2cache != 0) {
5246		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
5247		    ZPOOL_CONFIG_L2CACHE);
5248		spa_load_l2cache(spa);
5249		spa->spa_l2cache.sav_sync = B_TRUE;
5250	}
5251
5252	/*
5253	 * We have to be careful when adding new vdevs to an existing pool.
5254	 * If other threads start allocating from these vdevs before we
5255	 * sync the config cache, and we lose power, then upon reboot we may
5256	 * fail to open the pool because there are DVAs that the config cache
5257	 * can't translate.  Therefore, we first add the vdevs without
5258	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
5259	 * and then let spa_config_update() initialize the new metaslabs.
5260	 *
5261	 * spa_load() checks for added-but-not-initialized vdevs, so that
5262	 * if we lose power at any point in this sequence, the remaining
5263	 * steps will be completed the next time we load the pool.
5264	 */
5265	(void) spa_vdev_exit(spa, vd, txg, 0);
5266
5267	mutex_enter(&spa_namespace_lock);
5268	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5269	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
5270	mutex_exit(&spa_namespace_lock);
5271
5272	return (0);
5273}
5274
5275/*
5276 * Attach a device to a mirror.  The arguments are the path to any device
5277 * in the mirror, and the nvroot for the new device.  If the path specifies
5278 * a device that is not mirrored, we automatically insert the mirror vdev.
5279 *
5280 * If 'replacing' is specified, the new device is intended to replace the
5281 * existing device; in this case the two devices are made into their own
5282 * mirror using the 'replacing' vdev, which is functionally identical to
5283 * the mirror vdev (it actually reuses all the same ops) but has a few
5284 * extra rules: you can't attach to it after it's been created, and upon
5285 * completion of resilvering, the first disk (the one being replaced)
5286 * is automatically detached.
5287 */
5288int
5289spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
5290{
5291	uint64_t txg, dtl_max_txg;
5292	vdev_t *rvd = spa->spa_root_vdev;
5293	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
5294	vdev_ops_t *pvops;
5295	char *oldvdpath, *newvdpath;
5296	int newvd_isspare;
5297	int error;
5298
5299	ASSERT(spa_writeable(spa));
5300
5301	txg = spa_vdev_enter(spa);
5302
5303	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
5304
5305	if (spa->spa_vdev_removal != NULL ||
5306	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
5307		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5308	}
5309
5310	if (oldvd == NULL)
5311		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
5312
5313	if (!oldvd->vdev_ops->vdev_op_leaf)
5314		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5315
5316	pvd = oldvd->vdev_parent;
5317
5318	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
5319	    VDEV_ALLOC_ATTACH)) != 0)
5320		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5321
5322	if (newrootvd->vdev_children != 1)
5323		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
5324
5325	newvd = newrootvd->vdev_child[0];
5326
5327	if (!newvd->vdev_ops->vdev_op_leaf)
5328		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
5329
5330	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
5331		return (spa_vdev_exit(spa, newrootvd, txg, error));
5332
5333	/*
5334	 * Spares can't replace logs
5335	 */
5336	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
5337		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5338
5339	if (!replacing) {
5340		/*
5341		 * For attach, the only allowable parent is a mirror or the root
5342		 * vdev.
5343		 */
5344		if (pvd->vdev_ops != &vdev_mirror_ops &&
5345		    pvd->vdev_ops != &vdev_root_ops)
5346			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5347
5348		pvops = &vdev_mirror_ops;
5349	} else {
5350		/*
5351		 * Active hot spares can only be replaced by inactive hot
5352		 * spares.
5353		 */
5354		if (pvd->vdev_ops == &vdev_spare_ops &&
5355		    oldvd->vdev_isspare &&
5356		    !spa_has_spare(spa, newvd->vdev_guid))
5357			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5358
5359		/*
5360		 * If the source is a hot spare, and the parent isn't already a
5361		 * spare, then we want to create a new hot spare.  Otherwise, we
5362		 * want to create a replacing vdev.  The user is not allowed to
5363		 * attach to a spared vdev child unless the 'isspare' state is
5364		 * the same (spare replaces spare, non-spare replaces
5365		 * non-spare).
5366		 */
5367		if (pvd->vdev_ops == &vdev_replacing_ops &&
5368		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
5369			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5370		} else if (pvd->vdev_ops == &vdev_spare_ops &&
5371		    newvd->vdev_isspare != oldvd->vdev_isspare) {
5372			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5373		}
5374
5375		if (newvd->vdev_isspare)
5376			pvops = &vdev_spare_ops;
5377		else
5378			pvops = &vdev_replacing_ops;
5379	}
5380
5381	/*
5382	 * Make sure the new device is big enough.
5383	 */
5384	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
5385		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
5386
5387	/*
5388	 * The new device cannot have a higher alignment requirement
5389	 * than the top-level vdev.
5390	 */
5391	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
5392		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
5393
5394	/*
5395	 * If this is an in-place replacement, update oldvd's path and devid
5396	 * to make it distinguishable from newvd, and unopenable from now on.
5397	 */
5398	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
5399		spa_strfree(oldvd->vdev_path);
5400		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
5401		    KM_SLEEP);
5402		(void) sprintf(oldvd->vdev_path, "%s/%s",
5403		    newvd->vdev_path, "old");
5404		if (oldvd->vdev_devid != NULL) {
5405			spa_strfree(oldvd->vdev_devid);
5406			oldvd->vdev_devid = NULL;
5407		}
5408	}
5409
5410	/* mark the device being resilvered */
5411	newvd->vdev_resilver_txg = txg;
5412
5413	/*
5414	 * If the parent is not a mirror, or if we're replacing, insert the new
5415	 * mirror/replacing/spare vdev above oldvd.
5416	 */
5417	if (pvd->vdev_ops != pvops)
5418		pvd = vdev_add_parent(oldvd, pvops);
5419
5420	ASSERT(pvd->vdev_top->vdev_parent == rvd);
5421	ASSERT(pvd->vdev_ops == pvops);
5422	ASSERT(oldvd->vdev_parent == pvd);
5423
5424	/*
5425	 * Extract the new device from its root and add it to pvd.
5426	 */
5427	vdev_remove_child(newrootvd, newvd);
5428	newvd->vdev_id = pvd->vdev_children;
5429	newvd->vdev_crtxg = oldvd->vdev_crtxg;
5430	vdev_add_child(pvd, newvd);
5431
5432	tvd = newvd->vdev_top;
5433	ASSERT(pvd->vdev_top == tvd);
5434	ASSERT(tvd->vdev_parent == rvd);
5435
5436	vdev_config_dirty(tvd);
5437
5438	/*
5439	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
5440	 * for any dmu_sync-ed blocks.  It will propagate upward when
5441	 * spa_vdev_exit() calls vdev_dtl_reassess().
5442	 */
5443	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
5444
5445	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
5446	    dtl_max_txg - TXG_INITIAL);
5447
5448	if (newvd->vdev_isspare) {
5449		spa_spare_activate(newvd);
5450		spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
5451	}
5452
5453	oldvdpath = spa_strdup(oldvd->vdev_path);
5454	newvdpath = spa_strdup(newvd->vdev_path);
5455	newvd_isspare = newvd->vdev_isspare;
5456
5457	/*
5458	 * Mark newvd's DTL dirty in this txg.
5459	 */
5460	vdev_dirty(tvd, VDD_DTL, newvd, txg);
5461
5462	/*
5463	 * Schedule the resilver to restart in the future. We do this to
5464	 * ensure that dmu_sync-ed blocks have been stitched into the
5465	 * respective datasets.
5466	 */
5467	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
5468
5469	if (spa->spa_bootfs)
5470		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
5471
5472	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
5473
5474	/*
5475	 * Commit the config
5476	 */
5477	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
5478
5479	spa_history_log_internal(spa, "vdev attach", NULL,
5480	    "%s vdev=%s %s vdev=%s",
5481	    replacing && newvd_isspare ? "spare in" :
5482	    replacing ? "replace" : "attach", newvdpath,
5483	    replacing ? "for" : "to", oldvdpath);
5484
5485	spa_strfree(oldvdpath);
5486	spa_strfree(newvdpath);
5487
5488	return (0);
5489}
5490
5491/*
5492 * Detach a device from a mirror or replacing vdev.
5493 *
5494 * If 'replace_done' is specified, only detach if the parent
5495 * is a replacing vdev.
5496 */
5497int
5498spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
5499{
5500	uint64_t txg;
5501	int error;
5502	vdev_t *rvd = spa->spa_root_vdev;
5503	vdev_t *vd, *pvd, *cvd, *tvd;
5504	boolean_t unspare = B_FALSE;
5505	uint64_t unspare_guid = 0;
5506	char *vdpath;
5507
5508	ASSERT(spa_writeable(spa));
5509
5510	txg = spa_vdev_enter(spa);
5511
5512	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5513
5514	if (vd == NULL)
5515		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
5516
5517	if (!vd->vdev_ops->vdev_op_leaf)
5518		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5519
5520	pvd = vd->vdev_parent;
5521
5522	/*
5523	 * If the parent/child relationship is not as expected, don't do it.
5524	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
5525	 * vdev that's replacing B with C.  The user's intent in replacing
5526	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
5527	 * the replace by detaching C, the expected behavior is to end up
5528	 * M(A,B).  But suppose that right after deciding to detach C,
5529	 * the replacement of B completes.  We would have M(A,C), and then
5530	 * ask to detach C, which would leave us with just A -- not what
5531	 * the user wanted.  To prevent this, we make sure that the
5532	 * parent/child relationship hasn't changed -- in this example,
5533	 * that C's parent is still the replacing vdev R.
5534	 */
5535	if (pvd->vdev_guid != pguid && pguid != 0)
5536		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5537
5538	/*
5539	 * Only 'replacing' or 'spare' vdevs can be replaced.
5540	 */
5541	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
5542	    pvd->vdev_ops != &vdev_spare_ops)
5543		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5544
5545	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
5546	    spa_version(spa) >= SPA_VERSION_SPARES);
5547
5548	/*
5549	 * Only mirror, replacing, and spare vdevs support detach.
5550	 */
5551	if (pvd->vdev_ops != &vdev_replacing_ops &&
5552	    pvd->vdev_ops != &vdev_mirror_ops &&
5553	    pvd->vdev_ops != &vdev_spare_ops)
5554		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5555
5556	/*
5557	 * If this device has the only valid copy of some data,
5558	 * we cannot safely detach it.
5559	 */
5560	if (vdev_dtl_required(vd))
5561		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5562
5563	ASSERT(pvd->vdev_children >= 2);
5564
5565	/*
5566	 * If we are detaching the second disk from a replacing vdev, then
5567	 * check to see if we changed the original vdev's path to have "/old"
5568	 * at the end in spa_vdev_attach().  If so, undo that change now.
5569	 */
5570	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
5571	    vd->vdev_path != NULL) {
5572		size_t len = strlen(vd->vdev_path);
5573
5574		for (int c = 0; c < pvd->vdev_children; c++) {
5575			cvd = pvd->vdev_child[c];
5576
5577			if (cvd == vd || cvd->vdev_path == NULL)
5578				continue;
5579
5580			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
5581			    strcmp(cvd->vdev_path + len, "/old") == 0) {
5582				spa_strfree(cvd->vdev_path);
5583				cvd->vdev_path = spa_strdup(vd->vdev_path);
5584				break;
5585			}
5586		}
5587	}
5588
5589	/*
5590	 * If we are detaching the original disk from a spare, then it implies
5591	 * that the spare should become a real disk, and be removed from the
5592	 * active spare list for the pool.
5593	 */
5594	if (pvd->vdev_ops == &vdev_spare_ops &&
5595	    vd->vdev_id == 0 &&
5596	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
5597		unspare = B_TRUE;
5598
5599	/*
5600	 * Erase the disk labels so the disk can be used for other things.
5601	 * This must be done after all other error cases are handled,
5602	 * but before we disembowel vd (so we can still do I/O to it).
5603	 * But if we can't do it, don't treat the error as fatal --
5604	 * it may be that the unwritability of the disk is the reason
5605	 * it's being detached!
5606	 */
5607	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5608
5609	/*
5610	 * Remove vd from its parent and compact the parent's children.
5611	 */
5612	vdev_remove_child(pvd, vd);
5613	vdev_compact_children(pvd);
5614
5615	/*
5616	 * Remember one of the remaining children so we can get tvd below.
5617	 */
5618	cvd = pvd->vdev_child[pvd->vdev_children - 1];
5619
5620	/*
5621	 * If we need to remove the remaining child from the list of hot spares,
5622	 * do it now, marking the vdev as no longer a spare in the process.
5623	 * We must do this before vdev_remove_parent(), because that can
5624	 * change the GUID if it creates a new toplevel GUID.  For a similar
5625	 * reason, we must remove the spare now, in the same txg as the detach;
5626	 * otherwise someone could attach a new sibling, change the GUID, and
5627	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
5628	 */
5629	if (unspare) {
5630		ASSERT(cvd->vdev_isspare);
5631		spa_spare_remove(cvd);
5632		unspare_guid = cvd->vdev_guid;
5633		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
5634		cvd->vdev_unspare = B_TRUE;
5635	}
5636
5637	/*
5638	 * If the parent mirror/replacing vdev only has one child,
5639	 * the parent is no longer needed.  Remove it from the tree.
5640	 */
5641	if (pvd->vdev_children == 1) {
5642		if (pvd->vdev_ops == &vdev_spare_ops)
5643			cvd->vdev_unspare = B_FALSE;
5644		vdev_remove_parent(cvd);
5645	}
5646
5647
5648	/*
5649	 * We don't set tvd until now because the parent we just removed
5650	 * may have been the previous top-level vdev.
5651	 */
5652	tvd = cvd->vdev_top;
5653	ASSERT(tvd->vdev_parent == rvd);
5654
5655	/*
5656	 * Reevaluate the parent vdev state.
5657	 */
5658	vdev_propagate_state(cvd);
5659
5660	/*
5661	 * If the 'autoexpand' property is set on the pool then automatically
5662	 * try to expand the size of the pool. For example if the device we
5663	 * just detached was smaller than the others, it may be possible to
5664	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
5665	 * first so that we can obtain the updated sizes of the leaf vdevs.
5666	 */
5667	if (spa->spa_autoexpand) {
5668		vdev_reopen(tvd);
5669		vdev_expand(tvd, txg);
5670	}
5671
5672	vdev_config_dirty(tvd);
5673
5674	/*
5675	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
5676	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
5677	 * But first make sure we're not on any *other* txg's DTL list, to
5678	 * prevent vd from being accessed after it's freed.
5679	 */
5680	vdpath = spa_strdup(vd->vdev_path);
5681	for (int t = 0; t < TXG_SIZE; t++)
5682		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
5683	vd->vdev_detached = B_TRUE;
5684	vdev_dirty(tvd, VDD_DTL, vd, txg);
5685
5686	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
5687
5688	/* hang on to the spa before we release the lock */
5689	spa_open_ref(spa, FTAG);
5690
5691	error = spa_vdev_exit(spa, vd, txg, 0);
5692
5693	spa_history_log_internal(spa, "detach", NULL,
5694	    "vdev=%s", vdpath);
5695	spa_strfree(vdpath);
5696
5697	/*
5698	 * If this was the removal of the original device in a hot spare vdev,
5699	 * then we want to go through and remove the device from the hot spare
5700	 * list of every other pool.
5701	 */
5702	if (unspare) {
5703		spa_t *altspa = NULL;
5704
5705		mutex_enter(&spa_namespace_lock);
5706		while ((altspa = spa_next(altspa)) != NULL) {
5707			if (altspa->spa_state != POOL_STATE_ACTIVE ||
5708			    altspa == spa)
5709				continue;
5710
5711			spa_open_ref(altspa, FTAG);
5712			mutex_exit(&spa_namespace_lock);
5713			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
5714			mutex_enter(&spa_namespace_lock);
5715			spa_close(altspa, FTAG);
5716		}
5717		mutex_exit(&spa_namespace_lock);
5718
5719		/* search the rest of the vdevs for spares to remove */
5720		spa_vdev_resilver_done(spa);
5721	}
5722
5723	/* all done with the spa; OK to release */
5724	mutex_enter(&spa_namespace_lock);
5725	spa_close(spa, FTAG);
5726	mutex_exit(&spa_namespace_lock);
5727
5728	return (error);
5729}
5730
5731/*
5732 * Split a set of devices from their mirrors, and create a new pool from them.
5733 */
5734int
5735spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
5736    nvlist_t *props, boolean_t exp)
5737{
5738	int error = 0;
5739	uint64_t txg, *glist;
5740	spa_t *newspa;
5741	uint_t c, children, lastlog;
5742	nvlist_t **child, *nvl, *tmp;
5743	dmu_tx_t *tx;
5744	char *altroot = NULL;
5745	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
5746	boolean_t activate_slog;
5747
5748	ASSERT(spa_writeable(spa));
5749
5750	txg = spa_vdev_enter(spa);
5751
5752	/* clear the log and flush everything up to now */
5753	activate_slog = spa_passivate_log(spa);
5754	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5755	error = spa_reset_logs(spa);
5756	txg = spa_vdev_config_enter(spa);
5757
5758	if (activate_slog)
5759		spa_activate_log(spa);
5760
5761	if (error != 0)
5762		return (spa_vdev_exit(spa, NULL, txg, error));
5763
5764	/* check new spa name before going any further */
5765	if (spa_lookup(newname) != NULL)
5766		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
5767
5768	/*
5769	 * scan through all the children to ensure they're all mirrors
5770	 */
5771	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
5772	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
5773	    &children) != 0)
5774		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5775
5776	/* first, check to ensure we've got the right child count */
5777	rvd = spa->spa_root_vdev;
5778	lastlog = 0;
5779	for (c = 0; c < rvd->vdev_children; c++) {
5780		vdev_t *vd = rvd->vdev_child[c];
5781
5782		/* don't count the holes & logs as children */
5783		if (vd->vdev_islog || !vdev_is_concrete(vd)) {
5784			if (lastlog == 0)
5785				lastlog = c;
5786			continue;
5787		}
5788
5789		lastlog = 0;
5790	}
5791	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
5792		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5793
5794	/* next, ensure no spare or cache devices are part of the split */
5795	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
5796	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
5797		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5798
5799	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
5800	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
5801
5802	/* then, loop over each vdev and validate it */
5803	for (c = 0; c < children; c++) {
5804		uint64_t is_hole = 0;
5805
5806		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
5807		    &is_hole);
5808
5809		if (is_hole != 0) {
5810			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
5811			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
5812				continue;
5813			} else {
5814				error = SET_ERROR(EINVAL);
5815				break;
5816			}
5817		}
5818
5819		/* which disk is going to be split? */
5820		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
5821		    &glist[c]) != 0) {
5822			error = SET_ERROR(EINVAL);
5823			break;
5824		}
5825
5826		/* look it up in the spa */
5827		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
5828		if (vml[c] == NULL) {
5829			error = SET_ERROR(ENODEV);
5830			break;
5831		}
5832
5833		/* make sure there's nothing stopping the split */
5834		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
5835		    vml[c]->vdev_islog ||
5836		    !vdev_is_concrete(vml[c]) ||
5837		    vml[c]->vdev_isspare ||
5838		    vml[c]->vdev_isl2cache ||
5839		    !vdev_writeable(vml[c]) ||
5840		    vml[c]->vdev_children != 0 ||
5841		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
5842		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
5843			error = SET_ERROR(EINVAL);
5844			break;
5845		}
5846
5847		if (vdev_dtl_required(vml[c])) {
5848			error = SET_ERROR(EBUSY);
5849			break;
5850		}
5851
5852		/* we need certain info from the top level */
5853		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
5854		    vml[c]->vdev_top->vdev_ms_array) == 0);
5855		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
5856		    vml[c]->vdev_top->vdev_ms_shift) == 0);
5857		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
5858		    vml[c]->vdev_top->vdev_asize) == 0);
5859		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
5860		    vml[c]->vdev_top->vdev_ashift) == 0);
5861
5862		/* transfer per-vdev ZAPs */
5863		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
5864		VERIFY0(nvlist_add_uint64(child[c],
5865		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
5866
5867		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
5868		VERIFY0(nvlist_add_uint64(child[c],
5869		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
5870		    vml[c]->vdev_parent->vdev_top_zap));
5871	}
5872
5873	if (error != 0) {
5874		kmem_free(vml, children * sizeof (vdev_t *));
5875		kmem_free(glist, children * sizeof (uint64_t));
5876		return (spa_vdev_exit(spa, NULL, txg, error));
5877	}
5878
5879	/* stop writers from using the disks */
5880	for (c = 0; c < children; c++) {
5881		if (vml[c] != NULL)
5882			vml[c]->vdev_offline = B_TRUE;
5883	}
5884	vdev_reopen(spa->spa_root_vdev);
5885
5886	/*
5887	 * Temporarily record the splitting vdevs in the spa config.  This
5888	 * will disappear once the config is regenerated.
5889	 */
5890	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5891	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
5892	    glist, children) == 0);
5893	kmem_free(glist, children * sizeof (uint64_t));
5894
5895	mutex_enter(&spa->spa_props_lock);
5896	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
5897	    nvl) == 0);
5898	mutex_exit(&spa->spa_props_lock);
5899	spa->spa_config_splitting = nvl;
5900	vdev_config_dirty(spa->spa_root_vdev);
5901
5902	/* configure and create the new pool */
5903	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
5904	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5905	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
5906	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5907	    spa_version(spa)) == 0);
5908	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
5909	    spa->spa_config_txg) == 0);
5910	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5911	    spa_generate_guid(NULL)) == 0);
5912	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
5913	(void) nvlist_lookup_string(props,
5914	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5915
5916	/* add the new pool to the namespace */
5917	newspa = spa_add(newname, config, altroot);
5918	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
5919	newspa->spa_config_txg = spa->spa_config_txg;
5920	spa_set_log_state(newspa, SPA_LOG_CLEAR);
5921
5922	/* release the spa config lock, retaining the namespace lock */
5923	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5924
5925	if (zio_injection_enabled)
5926		zio_handle_panic_injection(spa, FTAG, 1);
5927
5928	spa_activate(newspa, spa_mode_global);
5929	spa_async_suspend(newspa);
5930
5931	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
5932
5933	/* create the new pool from the disks of the original pool */
5934	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
5935	if (error)
5936		goto out;
5937
5938	/* if that worked, generate a real config for the new pool */
5939	if (newspa->spa_root_vdev != NULL) {
5940		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
5941		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5942		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
5943		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
5944		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
5945		    B_TRUE));
5946	}
5947
5948	/* set the props */
5949	if (props != NULL) {
5950		spa_configfile_set(newspa, props, B_FALSE);
5951		error = spa_prop_set(newspa, props);
5952		if (error)
5953			goto out;
5954	}
5955
5956	/* flush everything */
5957	txg = spa_vdev_config_enter(newspa);
5958	vdev_config_dirty(newspa->spa_root_vdev);
5959	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
5960
5961	if (zio_injection_enabled)
5962		zio_handle_panic_injection(spa, FTAG, 2);
5963
5964	spa_async_resume(newspa);
5965
5966	/* finally, update the original pool's config */
5967	txg = spa_vdev_config_enter(spa);
5968	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5969	error = dmu_tx_assign(tx, TXG_WAIT);
5970	if (error != 0)
5971		dmu_tx_abort(tx);
5972	for (c = 0; c < children; c++) {
5973		if (vml[c] != NULL) {
5974			vdev_split(vml[c]);
5975			if (error == 0)
5976				spa_history_log_internal(spa, "detach", tx,
5977				    "vdev=%s", vml[c]->vdev_path);
5978
5979			vdev_free(vml[c]);
5980		}
5981	}
5982	spa->spa_avz_action = AVZ_ACTION_REBUILD;
5983	vdev_config_dirty(spa->spa_root_vdev);
5984	spa->spa_config_splitting = NULL;
5985	nvlist_free(nvl);
5986	if (error == 0)
5987		dmu_tx_commit(tx);
5988	(void) spa_vdev_exit(spa, NULL, txg, 0);
5989
5990	if (zio_injection_enabled)
5991		zio_handle_panic_injection(spa, FTAG, 3);
5992
5993	/* split is complete; log a history record */
5994	spa_history_log_internal(newspa, "split", NULL,
5995	    "from pool %s", spa_name(spa));
5996
5997	kmem_free(vml, children * sizeof (vdev_t *));
5998
5999	/* if we're not going to mount the filesystems in userland, export */
6000	if (exp)
6001		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
6002		    B_FALSE, B_FALSE);
6003
6004	return (error);
6005
6006out:
6007	spa_unload(newspa);
6008	spa_deactivate(newspa);
6009	spa_remove(newspa);
6010
6011	txg = spa_vdev_config_enter(spa);
6012
6013	/* re-online all offlined disks */
6014	for (c = 0; c < children; c++) {
6015		if (vml[c] != NULL)
6016			vml[c]->vdev_offline = B_FALSE;
6017	}
6018	vdev_reopen(spa->spa_root_vdev);
6019
6020	nvlist_free(spa->spa_config_splitting);
6021	spa->spa_config_splitting = NULL;
6022	(void) spa_vdev_exit(spa, NULL, txg, error);
6023
6024	kmem_free(vml, children * sizeof (vdev_t *));
6025	return (error);
6026}
6027
6028/*
6029 * Find any device that's done replacing, or a vdev marked 'unspare' that's
6030 * currently spared, so we can detach it.
6031 */
6032static vdev_t *
6033spa_vdev_resilver_done_hunt(vdev_t *vd)
6034{
6035	vdev_t *newvd, *oldvd;
6036
6037	for (int c = 0; c < vd->vdev_children; c++) {
6038		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
6039		if (oldvd != NULL)
6040			return (oldvd);
6041	}
6042
6043	/*
6044	 * Check for a completed replacement.  We always consider the first
6045	 * vdev in the list to be the oldest vdev, and the last one to be
6046	 * the newest (see spa_vdev_attach() for how that works).  In
6047	 * the case where the newest vdev is faulted, we will not automatically
6048	 * remove it after a resilver completes.  This is OK as it will require
6049	 * user intervention to determine which disk the admin wishes to keep.
6050	 */
6051	if (vd->vdev_ops == &vdev_replacing_ops) {
6052		ASSERT(vd->vdev_children > 1);
6053
6054		newvd = vd->vdev_child[vd->vdev_children - 1];
6055		oldvd = vd->vdev_child[0];
6056
6057		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
6058		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
6059		    !vdev_dtl_required(oldvd))
6060			return (oldvd);
6061	}
6062
6063	/*
6064	 * Check for a completed resilver with the 'unspare' flag set.
6065	 */
6066	if (vd->vdev_ops == &vdev_spare_ops) {
6067		vdev_t *first = vd->vdev_child[0];
6068		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
6069
6070		if (last->vdev_unspare) {
6071			oldvd = first;
6072			newvd = last;
6073		} else if (first->vdev_unspare) {
6074			oldvd = last;
6075			newvd = first;
6076		} else {
6077			oldvd = NULL;
6078		}
6079
6080		if (oldvd != NULL &&
6081		    vdev_dtl_empty(newvd, DTL_MISSING) &&
6082		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
6083		    !vdev_dtl_required(oldvd))
6084			return (oldvd);
6085
6086		/*
6087		 * If there are more than two spares attached to a disk,
6088		 * and those spares are not required, then we want to
6089		 * attempt to free them up now so that they can be used
6090		 * by other pools.  Once we're back down to a single
6091		 * disk+spare, we stop removing them.
6092		 */
6093		if (vd->vdev_children > 2) {
6094			newvd = vd->vdev_child[1];
6095
6096			if (newvd->vdev_isspare && last->vdev_isspare &&
6097			    vdev_dtl_empty(last, DTL_MISSING) &&
6098			    vdev_dtl_empty(last, DTL_OUTAGE) &&
6099			    !vdev_dtl_required(newvd))
6100				return (newvd);
6101		}
6102	}
6103
6104	return (NULL);
6105}
6106
6107static void
6108spa_vdev_resilver_done(spa_t *spa)
6109{
6110	vdev_t *vd, *pvd, *ppvd;
6111	uint64_t guid, sguid, pguid, ppguid;
6112
6113	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6114
6115	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
6116		pvd = vd->vdev_parent;
6117		ppvd = pvd->vdev_parent;
6118		guid = vd->vdev_guid;
6119		pguid = pvd->vdev_guid;
6120		ppguid = ppvd->vdev_guid;
6121		sguid = 0;
6122		/*
6123		 * If we have just finished replacing a hot spared device, then
6124		 * we need to detach the parent's first child (the original hot
6125		 * spare) as well.
6126		 */
6127		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
6128		    ppvd->vdev_children == 2) {
6129			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
6130			sguid = ppvd->vdev_child[1]->vdev_guid;
6131		}
6132		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
6133
6134		spa_config_exit(spa, SCL_ALL, FTAG);
6135		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
6136			return;
6137		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
6138			return;
6139		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6140	}
6141
6142	spa_config_exit(spa, SCL_ALL, FTAG);
6143}
6144