xref: /illumos-gate/usr/src/uts/common/fs/zfs/spa.c (revision 3f9d6ad73e45c6823b409f93b0c8d4f62861d2d5)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
2199653d4eSeschrock 
22fa9e4066Sahrens /*
2398d1cbfeSGeorge Wilson  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens /*
27fa9e4066Sahrens  * This file contains all the routines used when modifying on-disk SPA state.
28fa9e4066Sahrens  * This includes opening, importing, destroying, exporting a pool, and syncing a
29fa9e4066Sahrens  * pool.
30fa9e4066Sahrens  */
31fa9e4066Sahrens 
32fa9e4066Sahrens #include <sys/zfs_context.h>
33ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h>
34fa9e4066Sahrens #include <sys/spa_impl.h>
35fa9e4066Sahrens #include <sys/zio.h>
36fa9e4066Sahrens #include <sys/zio_checksum.h>
37fa9e4066Sahrens #include <sys/dmu.h>
38fa9e4066Sahrens #include <sys/dmu_tx.h>
39fa9e4066Sahrens #include <sys/zap.h>
40fa9e4066Sahrens #include <sys/zil.h>
41b24ab676SJeff Bonwick #include <sys/ddt.h>
42fa9e4066Sahrens #include <sys/vdev_impl.h>
43fa9e4066Sahrens #include <sys/metaslab.h>
4488ecc943SGeorge Wilson #include <sys/metaslab_impl.h>
45fa9e4066Sahrens #include <sys/uberblock_impl.h>
46fa9e4066Sahrens #include <sys/txg.h>
47fa9e4066Sahrens #include <sys/avl.h>
48fa9e4066Sahrens #include <sys/dmu_traverse.h>
49b1b8ab34Slling #include <sys/dmu_objset.h>
50fa9e4066Sahrens #include <sys/unique.h>
51fa9e4066Sahrens #include <sys/dsl_pool.h>
52b1b8ab34Slling #include <sys/dsl_dataset.h>
53fa9e4066Sahrens #include <sys/dsl_dir.h>
54fa9e4066Sahrens #include <sys/dsl_prop.h>
55b1b8ab34Slling #include <sys/dsl_synctask.h>
56fa9e4066Sahrens #include <sys/fs/zfs.h>
57fa94a07fSbrendan #include <sys/arc.h>
58fa9e4066Sahrens #include <sys/callb.h>
5995173954Sek #include <sys/systeminfo.h>
60e7cbe64fSgw #include <sys/spa_boot.h>
61573ca77eSGeorge Wilson #include <sys/zfs_ioctl.h>
62*3f9d6ad7SLin Ling #include <sys/dsl_scan.h>
63fa9e4066Sahrens 
645679c89fSjv #ifdef	_KERNEL
65dedec472SJack Meng #include <sys/bootprops.h>
6635a5a358SJonathan Adams #include <sys/callb.h>
6735a5a358SJonathan Adams #include <sys/cpupart.h>
6835a5a358SJonathan Adams #include <sys/pool.h>
6935a5a358SJonathan Adams #include <sys/sysdc.h>
7035a5a358SJonathan Adams #include <sys/zone.h>
715679c89fSjv #endif	/* _KERNEL */
725679c89fSjv 
73990b4856Slling #include "zfs_prop.h"
74b7b97454Sperrin #include "zfs_comutil.h"
75990b4856Slling 
/*
 * Sizing modes for the per-I/O-type zio taskqs (see the zio_taskqs
 * table below); zti_value's meaning depends on the mode.
 */
typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

/* Convenience initializers for zio_taskq_info_t table entries. */
#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

/* One taskq sizing directive: the mode plus its mode-specific value. */
typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;
952e0c549eSJonathan Adams 
/* Name suffixes for the four taskqs created per zio type. */
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* NULL */
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },	/* READ */
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },	/* WRITE */
	{ ZTI_FIX(10),	ZTI_NULL,	ZTI_FIX(10),	ZTI_NULL },	/* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* IOCTL */
};
1132e0c549eSJonathan Adams 
/* Sync-task callback that writes validated pool properties out (defined below). */
static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);

/* Tunables controlling how zio taskq threads are created and scheduled. */
uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"
13235a5a358SJonathan Adams 
133990b4856Slling /*
134990b4856Slling  * ==========================================================================
135990b4856Slling  * SPA properties routines
136990b4856Slling  * ==========================================================================
137990b4856Slling  */
138990b4856Slling 
139990b4856Slling /*
140990b4856Slling  * Add a (source=src, propname=propval) list to an nvlist.
141990b4856Slling  */
1429d82f4f6Slling static void
143990b4856Slling spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
144990b4856Slling     uint64_t intval, zprop_source_t src)
145990b4856Slling {
146990b4856Slling 	const char *propname = zpool_prop_to_name(prop);
147990b4856Slling 	nvlist_t *propval;
148990b4856Slling 
1499d82f4f6Slling 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1509d82f4f6Slling 	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
151990b4856Slling 
1529d82f4f6Slling 	if (strval != NULL)
1539d82f4f6Slling 		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
1549d82f4f6Slling 	else
1559d82f4f6Slling 		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
156990b4856Slling 
1579d82f4f6Slling 	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
158990b4856Slling 	nvlist_free(propval);
159990b4856Slling }
160990b4856Slling 
161990b4856Slling /*
162990b4856Slling  * Get property values from the spa configuration.
163990b4856Slling  */
1649d82f4f6Slling static void
165990b4856Slling spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
166990b4856Slling {
167379c004dSEric Schrock 	uint64_t size;
168485bbbf5SGeorge Wilson 	uint64_t alloc;
169990b4856Slling 	uint64_t cap, version;
170990b4856Slling 	zprop_source_t src = ZPROP_SRC_NONE;
171c5904d13Seschrock 	spa_config_dirent_t *dp;
172990b4856Slling 
173e14bb325SJeff Bonwick 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
174e14bb325SJeff Bonwick 
175379c004dSEric Schrock 	if (spa->spa_root_vdev != NULL) {
176485bbbf5SGeorge Wilson 		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
177b24ab676SJeff Bonwick 		size = metaslab_class_get_space(spa_normal_class(spa));
178379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
179379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
180485bbbf5SGeorge Wilson 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
181485bbbf5SGeorge Wilson 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
182485bbbf5SGeorge Wilson 		    size - alloc, src);
183379c004dSEric Schrock 
184485bbbf5SGeorge Wilson 		cap = (size == 0) ? 0 : (alloc * 100 / size);
185379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
186379c004dSEric Schrock 
187b24ab676SJeff Bonwick 		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
188b24ab676SJeff Bonwick 		    ddt_get_pool_dedup_ratio(spa), src);
189b24ab676SJeff Bonwick 
190379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
191379c004dSEric Schrock 		    spa->spa_root_vdev->vdev_state, src);
192379c004dSEric Schrock 
193379c004dSEric Schrock 		version = spa_version(spa);
194379c004dSEric Schrock 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
195379c004dSEric Schrock 			src = ZPROP_SRC_DEFAULT;
196379c004dSEric Schrock 		else
197379c004dSEric Schrock 			src = ZPROP_SRC_LOCAL;
198379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
199379c004dSEric Schrock 	}
200990b4856Slling 
2019d82f4f6Slling 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
202990b4856Slling 
2039d82f4f6Slling 	if (spa->spa_root != NULL)
2049d82f4f6Slling 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
2059d82f4f6Slling 		    0, ZPROP_SRC_LOCAL);
206990b4856Slling 
207c5904d13Seschrock 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
208c5904d13Seschrock 		if (dp->scd_path == NULL) {
2099d82f4f6Slling 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
210c5904d13Seschrock 			    "none", 0, ZPROP_SRC_LOCAL);
211c5904d13Seschrock 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
2129d82f4f6Slling 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
213c5904d13Seschrock 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
2142f8aaab3Seschrock 		}
2152f8aaab3Seschrock 	}
216990b4856Slling }
217990b4856Slling 
218990b4856Slling /*
219990b4856Slling  * Get zpool property values.
220990b4856Slling  */
221990b4856Slling int
222990b4856Slling spa_prop_get(spa_t *spa, nvlist_t **nvp)
223990b4856Slling {
224b24ab676SJeff Bonwick 	objset_t *mos = spa->spa_meta_objset;
225990b4856Slling 	zap_cursor_t zc;
226990b4856Slling 	zap_attribute_t za;
227990b4856Slling 	int err;
228990b4856Slling 
2299d82f4f6Slling 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
230990b4856Slling 
231e14bb325SJeff Bonwick 	mutex_enter(&spa->spa_props_lock);
232e14bb325SJeff Bonwick 
233990b4856Slling 	/*
234990b4856Slling 	 * Get properties from the spa config.
235990b4856Slling 	 */
2369d82f4f6Slling 	spa_prop_get_config(spa, nvp);
237990b4856Slling 
238990b4856Slling 	/* If no pool property object, no more prop to get. */
239afee20e4SGeorge Wilson 	if (mos == NULL || spa->spa_pool_props_object == 0) {
240990b4856Slling 		mutex_exit(&spa->spa_props_lock);
241990b4856Slling 		return (0);
242990b4856Slling 	}
243990b4856Slling 
244990b4856Slling 	/*
245990b4856Slling 	 * Get properties from the MOS pool property object.
246990b4856Slling 	 */
247990b4856Slling 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
248990b4856Slling 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
249990b4856Slling 	    zap_cursor_advance(&zc)) {
250990b4856Slling 		uint64_t intval = 0;
251990b4856Slling 		char *strval = NULL;
252990b4856Slling 		zprop_source_t src = ZPROP_SRC_DEFAULT;
253990b4856Slling 		zpool_prop_t prop;
254990b4856Slling 
255990b4856Slling 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
256990b4856Slling 			continue;
257990b4856Slling 
258990b4856Slling 		switch (za.za_integer_length) {
259990b4856Slling 		case 8:
260990b4856Slling 			/* integer property */
261990b4856Slling 			if (za.za_first_integer !=
262990b4856Slling 			    zpool_prop_default_numeric(prop))
263990b4856Slling 				src = ZPROP_SRC_LOCAL;
264990b4856Slling 
265990b4856Slling 			if (prop == ZPOOL_PROP_BOOTFS) {
266990b4856Slling 				dsl_pool_t *dp;
267990b4856Slling 				dsl_dataset_t *ds = NULL;
268990b4856Slling 
269990b4856Slling 				dp = spa_get_dsl(spa);
270990b4856Slling 				rw_enter(&dp->dp_config_rwlock, RW_READER);
271745cd3c5Smaybee 				if (err = dsl_dataset_hold_obj(dp,
272745cd3c5Smaybee 				    za.za_first_integer, FTAG, &ds)) {
273990b4856Slling 					rw_exit(&dp->dp_config_rwlock);
274990b4856Slling 					break;
275990b4856Slling 				}
276990b4856Slling 
277990b4856Slling 				strval = kmem_alloc(
278990b4856Slling 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
279990b4856Slling 				    KM_SLEEP);
280990b4856Slling 				dsl_dataset_name(ds, strval);
281745cd3c5Smaybee 				dsl_dataset_rele(ds, FTAG);
282990b4856Slling 				rw_exit(&dp->dp_config_rwlock);
283990b4856Slling 			} else {
284990b4856Slling 				strval = NULL;
285990b4856Slling 				intval = za.za_first_integer;
286990b4856Slling 			}
287990b4856Slling 
2889d82f4f6Slling 			spa_prop_add_list(*nvp, prop, strval, intval, src);
289990b4856Slling 
290990b4856Slling 			if (strval != NULL)
291990b4856Slling 				kmem_free(strval,
292990b4856Slling 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
293990b4856Slling 
294990b4856Slling 			break;
295990b4856Slling 
296990b4856Slling 		case 1:
297990b4856Slling 			/* string property */
298990b4856Slling 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
299990b4856Slling 			err = zap_lookup(mos, spa->spa_pool_props_object,
300990b4856Slling 			    za.za_name, 1, za.za_num_integers, strval);
301990b4856Slling 			if (err) {
302990b4856Slling 				kmem_free(strval, za.za_num_integers);
303990b4856Slling 				break;
304990b4856Slling 			}
3059d82f4f6Slling 			spa_prop_add_list(*nvp, prop, strval, 0, src);
306990b4856Slling 			kmem_free(strval, za.za_num_integers);
307990b4856Slling 			break;
308990b4856Slling 
309990b4856Slling 		default:
310990b4856Slling 			break;
311990b4856Slling 		}
312990b4856Slling 	}
313990b4856Slling 	zap_cursor_fini(&zc);
314990b4856Slling 	mutex_exit(&spa->spa_props_lock);
315990b4856Slling out:
316990b4856Slling 	if (err && err != ENOENT) {
317990b4856Slling 		nvlist_free(*nvp);
3189d82f4f6Slling 		*nvp = NULL;
319990b4856Slling 		return (err);
320990b4856Slling 	}
321990b4856Slling 
322990b4856Slling 	return (0);
323990b4856Slling }
324990b4856Slling 
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 *
 * Returns 0 if every property in the list is settable; otherwise the
 * first validation error encountered (EINVAL, ENOTSUP, or -- for the
 * suspended-pool failmode special case -- EIO).  On success, a bootfs
 * name entry is rewritten in place as its dataset object number.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;	/* bootfs dataset objnum, if reset_bootfs */

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			/* Version may only move forward, up to SPA_VERSION. */
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			/* Boolean properties: only 0 or 1 are valid. */
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				/* Empty string means "clear bootfs". */
				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = ENOTSUP;
				} else if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			/* "" (use default) and "none" are always valid. */
			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			/* Anything else must be an absolute path... */
			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			/* ...whose last component is a usable file name. */
			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = ENOTSUP;
			else
				error = nvpair_value_uint64(elem, &intval);
			/* Zero disables dedupditto; otherwise enforce min. */
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	/*
	 * Replace the bootfs name with its dataset object number so the
	 * sync task can store it as an integer.
	 */
	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}
489990b4856Slling 
490379c004dSEric Schrock void
491379c004dSEric Schrock spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
492379c004dSEric Schrock {
493379c004dSEric Schrock 	char *cachefile;
494379c004dSEric Schrock 	spa_config_dirent_t *dp;
495379c004dSEric Schrock 
496379c004dSEric Schrock 	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
497379c004dSEric Schrock 	    &cachefile) != 0)
498379c004dSEric Schrock 		return;
499379c004dSEric Schrock 
500379c004dSEric Schrock 	dp = kmem_alloc(sizeof (spa_config_dirent_t),
501379c004dSEric Schrock 	    KM_SLEEP);
502379c004dSEric Schrock 
503379c004dSEric Schrock 	if (cachefile[0] == '\0')
504379c004dSEric Schrock 		dp->scd_path = spa_strdup(spa_config_path);
505379c004dSEric Schrock 	else if (strcmp(cachefile, "none") == 0)
506379c004dSEric Schrock 		dp->scd_path = NULL;
507379c004dSEric Schrock 	else
508379c004dSEric Schrock 		dp->scd_path = spa_strdup(cachefile);
509379c004dSEric Schrock 
510379c004dSEric Schrock 	list_insert_head(&spa->spa_config_list, dp);
511379c004dSEric Schrock 	if (need_sync)
512379c004dSEric Schrock 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
513379c004dSEric Schrock }
514379c004dSEric Schrock 
515990b4856Slling int
516990b4856Slling spa_prop_set(spa_t *spa, nvlist_t *nvp)
517990b4856Slling {
518990b4856Slling 	int error;
519379c004dSEric Schrock 	nvpair_t *elem;
520379c004dSEric Schrock 	boolean_t need_sync = B_FALSE;
521379c004dSEric Schrock 	zpool_prop_t prop;
522990b4856Slling 
523990b4856Slling 	if ((error = spa_prop_validate(spa, nvp)) != 0)
524990b4856Slling 		return (error);
525990b4856Slling 
526379c004dSEric Schrock 	elem = NULL;
527379c004dSEric Schrock 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
528379c004dSEric Schrock 		if ((prop = zpool_name_to_prop(
529379c004dSEric Schrock 		    nvpair_name(elem))) == ZPROP_INVAL)
530379c004dSEric Schrock 			return (EINVAL);
531379c004dSEric Schrock 
532379c004dSEric Schrock 		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
533379c004dSEric Schrock 			continue;
534379c004dSEric Schrock 
535379c004dSEric Schrock 		need_sync = B_TRUE;
536379c004dSEric Schrock 		break;
537379c004dSEric Schrock 	}
538379c004dSEric Schrock 
539379c004dSEric Schrock 	if (need_sync)
540379c004dSEric Schrock 		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
541379c004dSEric Schrock 		    spa, nvp, 3));
542379c004dSEric Schrock 	else
543379c004dSEric Schrock 		return (0);
544990b4856Slling }
545990b4856Slling 
546990b4856Slling /*
547990b4856Slling  * If the bootfs property value is dsobj, clear it.
548990b4856Slling  */
549990b4856Slling void
550990b4856Slling spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
551990b4856Slling {
552990b4856Slling 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
553990b4856Slling 		VERIFY(zap_remove(spa->spa_meta_objset,
554990b4856Slling 		    spa->spa_pool_props_object,
555990b4856Slling 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
556990b4856Slling 		spa->spa_bootfs = 0;
557990b4856Slling 	}
558990b4856Slling }
559990b4856Slling 
560fa9e4066Sahrens /*
561fa9e4066Sahrens  * ==========================================================================
562fa9e4066Sahrens  * SPA state manipulation (open/create/destroy/import/export)
563fa9e4066Sahrens  * ==========================================================================
564fa9e4066Sahrens  */
565fa9e4066Sahrens 
566ea8dc4b6Seschrock static int
567ea8dc4b6Seschrock spa_error_entry_compare(const void *a, const void *b)
568ea8dc4b6Seschrock {
569ea8dc4b6Seschrock 	spa_error_entry_t *sa = (spa_error_entry_t *)a;
570ea8dc4b6Seschrock 	spa_error_entry_t *sb = (spa_error_entry_t *)b;
571ea8dc4b6Seschrock 	int ret;
572ea8dc4b6Seschrock 
573ea8dc4b6Seschrock 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
574ea8dc4b6Seschrock 	    sizeof (zbookmark_t));
575ea8dc4b6Seschrock 
576ea8dc4b6Seschrock 	if (ret < 0)
577ea8dc4b6Seschrock 		return (-1);
578ea8dc4b6Seschrock 	else if (ret > 0)
579ea8dc4b6Seschrock 		return (1);
580ea8dc4b6Seschrock 	else
581ea8dc4b6Seschrock 		return (0);
582ea8dc4b6Seschrock }
583ea8dc4b6Seschrock 
584ea8dc4b6Seschrock /*
585ea8dc4b6Seschrock  * Utility function which retrieves copies of the current logs and
586ea8dc4b6Seschrock  * re-initializes them in the process.
587ea8dc4b6Seschrock  */
588ea8dc4b6Seschrock void
589ea8dc4b6Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
590ea8dc4b6Seschrock {
591ea8dc4b6Seschrock 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
592ea8dc4b6Seschrock 
593ea8dc4b6Seschrock 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
594ea8dc4b6Seschrock 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
595ea8dc4b6Seschrock 
596ea8dc4b6Seschrock 	avl_create(&spa->spa_errlist_scrub,
597ea8dc4b6Seschrock 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
598ea8dc4b6Seschrock 	    offsetof(spa_error_entry_t, se_avl));
599ea8dc4b6Seschrock 	avl_create(&spa->spa_errlist_last,
600ea8dc4b6Seschrock 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
601ea8dc4b6Seschrock 	    offsetof(spa_error_entry_t, se_avl));
602ea8dc4b6Seschrock }
603ea8dc4b6Seschrock 
/*
 * Create one zio taskq sized according to the given mode, or return NULL
 * for zti_mode_null.  When SDC scheduling is enabled and the pool has its
 * own process, the taskq is created with taskq_create_sysdc(); otherwise
 * it is bound to the pool's process via taskq_create_proc().
 */
static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		/* clamp to at least one thread on non-DEBUG builds */
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		/* value is already the percentage of online CPUs */
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}
64735a5a358SJonathan Adams 
64835a5a358SJonathan Adams static void
64935a5a358SJonathan Adams spa_create_zio_taskqs(spa_t *spa)
65035a5a358SJonathan Adams {
651e14bb325SJeff Bonwick 	for (int t = 0; t < ZIO_TYPES; t++) {
652e14bb325SJeff Bonwick 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
65380eb36f2SGeorge Wilson 			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
65480eb36f2SGeorge Wilson 			enum zti_modes mode = ztip->zti_mode;
65580eb36f2SGeorge Wilson 			uint_t value = ztip->zti_value;
6562e0c549eSJonathan Adams 			char name[32];
6572e0c549eSJonathan Adams 
6582e0c549eSJonathan Adams 			(void) snprintf(name, sizeof (name),
65980eb36f2SGeorge Wilson 			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
6602e0c549eSJonathan Adams 
66135a5a358SJonathan Adams 			spa->spa_zio_taskq[t][q] =
66235a5a358SJonathan Adams 			    spa_taskq_create(spa, name, mode, value);
66335a5a358SJonathan Adams 		}
66435a5a358SJonathan Adams 	}
66535a5a358SJonathan Adams }
66635a5a358SJonathan Adams 
#ifdef _KERNEL
/*
 * Entry point for the pool's covering process, created by spa_activate()
 * via newproc().  This thread creates the pool's zio taskqs, then parks
 * until spa_deactivate() asks it to go away via SPA_PROC_DEACTIVATE.
 * All state transitions are coordinated with spa_activate()/spa_deactivate()
 * through spa_proc_lock/spa_proc_cv.
 */
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	/* Register with the CPR (suspend/resume) framework. */
	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	/* Name the process "zpool-<pool>" so it is identifiable in ps(1). */
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			/* Binding failure is non-fatal; warn and continue. */
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	/* Optionally run under the system duty-cycle scheduling class. */
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	/* Advertise ourselves so spa_deactivate() can find and join us. */
	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	/* Wake spa_activate(), which is waiting for SPA_PROC_ACTIVE. */
	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	/* Park here until spa_deactivate() requests SPA_PROC_DEACTIVATE. */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif
73635a5a358SJonathan Adams 
/*
 * Activate an uninitialized pool: set up its metaslab classes, try to
 * create a covering process (whose thread builds the zio taskqs), and
 * create the dirty lists, txg list, and error-log AVL trees.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			/* Wait for spa_thread() to advertise itself. */
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			/* Non-fatal: we fall back to creating taskqs below. */
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	/* Per-pool error logs: one for the current scrub, one for the last. */
	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
799fa9e4066Sahrens 
/*
 * Opposite of spa_activate(): tear down taskqs, metaslab classes, error
 * lists, and finally shut down and join the covering process created by
 * spa_activate(), so the module can't be unloaded out from under it.
 */
static void
spa_deactivate(spa_t *spa)
{
	/* The pool must already be quiesced and unloaded. */
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	/* Destroy the zio taskqs created by spa_create_zio_taskqs(). */
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	 * Ask spa_thread() to exit and wait for it to transition through
	 * SPA_PROC_DEACTIVATE to SPA_PROC_GONE under spa_proc_lock.
	 */
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}
867fa9e4066Sahrens 
868fa9e4066Sahrens /*
869fa9e4066Sahrens  * Verify a pool configuration, and construct the vdev tree appropriately.  This
870fa9e4066Sahrens  * will create all the necessary vdevs in the appropriate layout, with each vdev
871fa9e4066Sahrens  * in the CLOSED state.  This will prep the pool before open/creation/import.
872fa9e4066Sahrens  * All vdev validation is done by the vdev_alloc() routine.
873fa9e4066Sahrens  */
87499653d4eSeschrock static int
87599653d4eSeschrock spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
87699653d4eSeschrock     uint_t id, int atype)
877fa9e4066Sahrens {
878fa9e4066Sahrens 	nvlist_t **child;
879573ca77eSGeorge Wilson 	uint_t children;
88099653d4eSeschrock 	int error;
881fa9e4066Sahrens 
88299653d4eSeschrock 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
88399653d4eSeschrock 		return (error);
884fa9e4066Sahrens 
88599653d4eSeschrock 	if ((*vdp)->vdev_ops->vdev_op_leaf)
88699653d4eSeschrock 		return (0);
887fa9e4066Sahrens 
888e14bb325SJeff Bonwick 	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
889e14bb325SJeff Bonwick 	    &child, &children);
890e14bb325SJeff Bonwick 
891e14bb325SJeff Bonwick 	if (error == ENOENT)
892e14bb325SJeff Bonwick 		return (0);
893e14bb325SJeff Bonwick 
894e14bb325SJeff Bonwick 	if (error) {
89599653d4eSeschrock 		vdev_free(*vdp);
89699653d4eSeschrock 		*vdp = NULL;
89799653d4eSeschrock 		return (EINVAL);
898fa9e4066Sahrens 	}
899fa9e4066Sahrens 
900573ca77eSGeorge Wilson 	for (int c = 0; c < children; c++) {
90199653d4eSeschrock 		vdev_t *vd;
90299653d4eSeschrock 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
90399653d4eSeschrock 		    atype)) != 0) {
90499653d4eSeschrock 			vdev_free(*vdp);
90599653d4eSeschrock 			*vdp = NULL;
90699653d4eSeschrock 			return (error);
907fa9e4066Sahrens 		}
908fa9e4066Sahrens 	}
909fa9e4066Sahrens 
91099653d4eSeschrock 	ASSERT(*vdp != NULL);
91199653d4eSeschrock 
91299653d4eSeschrock 	return (0);
913fa9e4066Sahrens }
914fa9e4066Sahrens 
/*
 * Opposite of spa_load().  Tear down in strict order: stop async tasks,
 * stop the txg sync thread, drain async zios, close the DSL pool, unload
 * the DDT, drop the l2cache, then free the vdev trees (root, spares,
 * l2cache) under an SCL_ALL writer hold.
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/* Free the spare vdevs and their stashed config, if any. */
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	/* Likewise for the l2cache vdevs. */
	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}
1001fa9e4066Sahrens 
/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_spare_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.   For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		/* Is this spare also present in the active configuration? */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		/* A spare that fails to open is kept but not registered. */
		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
1116fa94a07fSbrendan 
/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	/* Detach the old array; retained entries migrate to newvdevs. */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			/* Only register healthy devices with the ARC. */
			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	/*
	 * No config to re-stash; sav_count is 0 here, so the cleanup
	 * loops at 'out' are no-ops.
	 */
	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
124499653d4eSeschrock 
124599653d4eSeschrock static int
124699653d4eSeschrock load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
124799653d4eSeschrock {
124899653d4eSeschrock 	dmu_buf_t *db;
124999653d4eSeschrock 	char *packed = NULL;
125099653d4eSeschrock 	size_t nvsize = 0;
125199653d4eSeschrock 	int error;
125299653d4eSeschrock 	*value = NULL;
125399653d4eSeschrock 
125499653d4eSeschrock 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
125599653d4eSeschrock 	nvsize = *(uint64_t *)db->db_data;
125699653d4eSeschrock 	dmu_buf_rele(db, FTAG);
125799653d4eSeschrock 
125899653d4eSeschrock 	packed = kmem_alloc(nvsize, KM_SLEEP);
12597bfdf011SNeil Perrin 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
12607bfdf011SNeil Perrin 	    DMU_READ_PREFETCH);
126199653d4eSeschrock 	if (error == 0)
126299653d4eSeschrock 		error = nvlist_unpack(packed, nvsize, value, 0);
126399653d4eSeschrock 	kmem_free(packed, nvsize);
126499653d4eSeschrock 
126599653d4eSeschrock 	return (error);
126699653d4eSeschrock }
126799653d4eSeschrock 
12683d7072f8Seschrock /*
12693d7072f8Seschrock  * Checks to see if the given vdev could not be opened, in which case we post a
12703d7072f8Seschrock  * sysevent to notify the autoreplace code that the device has been removed.
12713d7072f8Seschrock  */
12723d7072f8Seschrock static void
12733d7072f8Seschrock spa_check_removed(vdev_t *vd)
12743d7072f8Seschrock {
1275573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++)
12763d7072f8Seschrock 		spa_check_removed(vd->vdev_child[c]);
12773d7072f8Seschrock 
12783d7072f8Seschrock 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
12793d7072f8Seschrock 		zfs_post_autoreplace(vd->vdev_spa, vd);
12803d7072f8Seschrock 		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
12813d7072f8Seschrock 	}
12823d7072f8Seschrock }
12833d7072f8Seschrock 
1284e6ca193dSGeorge Wilson /*
1285e6ca193dSGeorge Wilson  * Load the slog device state from the config object since it's possible
1286e6ca193dSGeorge Wilson  * that the label does not contain the most up-to-date information.
1287e6ca193dSGeorge Wilson  */
1288e6ca193dSGeorge Wilson void
128988ecc943SGeorge Wilson spa_load_log_state(spa_t *spa, nvlist_t *nv)
1290e6ca193dSGeorge Wilson {
129188ecc943SGeorge Wilson 	vdev_t *ovd, *rvd = spa->spa_root_vdev;
1292e6ca193dSGeorge Wilson 
129388ecc943SGeorge Wilson 	/*
129488ecc943SGeorge Wilson 	 * Load the original root vdev tree from the passed config.
129588ecc943SGeorge Wilson 	 */
129688ecc943SGeorge Wilson 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
129788ecc943SGeorge Wilson 	VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1298e6ca193dSGeorge Wilson 
129988ecc943SGeorge Wilson 	for (int c = 0; c < rvd->vdev_children; c++) {
130088ecc943SGeorge Wilson 		vdev_t *cvd = rvd->vdev_child[c];
130188ecc943SGeorge Wilson 		if (cvd->vdev_islog)
130288ecc943SGeorge Wilson 			vdev_load_log_state(cvd, ovd->vdev_child[c]);
1303e6ca193dSGeorge Wilson 	}
130488ecc943SGeorge Wilson 	vdev_free(ovd);
130588ecc943SGeorge Wilson 	spa_config_exit(spa, SCL_ALL, FTAG);
1306e6ca193dSGeorge Wilson }
1307e6ca193dSGeorge Wilson 
1308b87f3af3Sperrin /*
1309b87f3af3Sperrin  * Check for missing log devices
1310b87f3af3Sperrin  */
1311b87f3af3Sperrin int
1312b87f3af3Sperrin spa_check_logs(spa_t *spa)
1313b87f3af3Sperrin {
1314b87f3af3Sperrin 	switch (spa->spa_log_state) {
1315b87f3af3Sperrin 	case SPA_LOG_MISSING:
1316b87f3af3Sperrin 		/* need to recheck in case slog has been restored */
1317b87f3af3Sperrin 	case SPA_LOG_UNKNOWN:
1318b87f3af3Sperrin 		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
1319b87f3af3Sperrin 		    DS_FIND_CHILDREN)) {
13201195e687SMark J Musante 			spa_set_log_state(spa, SPA_LOG_MISSING);
1321b87f3af3Sperrin 			return (1);
1322b87f3af3Sperrin 		}
1323b87f3af3Sperrin 		break;
1324b87f3af3Sperrin 	}
1325b87f3af3Sperrin 	return (0);
1326b87f3af3Sperrin }
1327b87f3af3Sperrin 
13281195e687SMark J Musante static boolean_t
13291195e687SMark J Musante spa_passivate_log(spa_t *spa)
13301195e687SMark J Musante {
13311195e687SMark J Musante 	vdev_t *rvd = spa->spa_root_vdev;
13321195e687SMark J Musante 	boolean_t slog_found = B_FALSE;
13331195e687SMark J Musante 
13341195e687SMark J Musante 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
13351195e687SMark J Musante 
13361195e687SMark J Musante 	if (!spa_has_slogs(spa))
13371195e687SMark J Musante 		return (B_FALSE);
13381195e687SMark J Musante 
13391195e687SMark J Musante 	for (int c = 0; c < rvd->vdev_children; c++) {
13401195e687SMark J Musante 		vdev_t *tvd = rvd->vdev_child[c];
13411195e687SMark J Musante 		metaslab_group_t *mg = tvd->vdev_mg;
13421195e687SMark J Musante 
13431195e687SMark J Musante 		if (tvd->vdev_islog) {
13441195e687SMark J Musante 			metaslab_group_passivate(mg);
13451195e687SMark J Musante 			slog_found = B_TRUE;
13461195e687SMark J Musante 		}
13471195e687SMark J Musante 	}
13481195e687SMark J Musante 
13491195e687SMark J Musante 	return (slog_found);
13501195e687SMark J Musante }
13511195e687SMark J Musante 
13521195e687SMark J Musante static void
13531195e687SMark J Musante spa_activate_log(spa_t *spa)
13541195e687SMark J Musante {
13551195e687SMark J Musante 	vdev_t *rvd = spa->spa_root_vdev;
13561195e687SMark J Musante 
13571195e687SMark J Musante 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
13581195e687SMark J Musante 
13591195e687SMark J Musante 	for (int c = 0; c < rvd->vdev_children; c++) {
13601195e687SMark J Musante 		vdev_t *tvd = rvd->vdev_child[c];
13611195e687SMark J Musante 		metaslab_group_t *mg = tvd->vdev_mg;
13621195e687SMark J Musante 
13631195e687SMark J Musante 		if (tvd->vdev_islog)
13641195e687SMark J Musante 			metaslab_group_activate(mg);
13651195e687SMark J Musante 	}
13661195e687SMark J Musante }
13671195e687SMark J Musante 
13681195e687SMark J Musante int
13691195e687SMark J Musante spa_offline_log(spa_t *spa)
13701195e687SMark J Musante {
13711195e687SMark J Musante 	int error = 0;
13721195e687SMark J Musante 
13731195e687SMark J Musante 	if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
13741195e687SMark J Musante 	    NULL, DS_FIND_CHILDREN)) == 0) {
13751195e687SMark J Musante 
13761195e687SMark J Musante 		/*
13771195e687SMark J Musante 		 * We successfully offlined the log device, sync out the
13781195e687SMark J Musante 		 * current txg so that the "stubby" block can be removed
13791195e687SMark J Musante 		 * by zil_sync().
13801195e687SMark J Musante 		 */
13811195e687SMark J Musante 		txg_wait_synced(spa->spa_dsl_pool, 0);
13821195e687SMark J Musante 	}
13831195e687SMark J Musante 	return (error);
13841195e687SMark J Musante }
13851195e687SMark J Musante 
1386b693757aSEric Schrock static void
1387b693757aSEric Schrock spa_aux_check_removed(spa_aux_vdev_t *sav)
1388b693757aSEric Schrock {
1389b24ab676SJeff Bonwick 	for (int i = 0; i < sav->sav_count; i++)
1390b693757aSEric Schrock 		spa_check_removed(sav->sav_vdevs[i]);
1391b693757aSEric Schrock }
1392b693757aSEric Schrock 
1393b24ab676SJeff Bonwick void
1394b24ab676SJeff Bonwick spa_claim_notify(zio_t *zio)
1395b24ab676SJeff Bonwick {
1396b24ab676SJeff Bonwick 	spa_t *spa = zio->io_spa;
1397b24ab676SJeff Bonwick 
1398b24ab676SJeff Bonwick 	if (zio->io_error)
1399b24ab676SJeff Bonwick 		return;
1400b24ab676SJeff Bonwick 
1401b24ab676SJeff Bonwick 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
1402b24ab676SJeff Bonwick 	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1403b24ab676SJeff Bonwick 		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1404b24ab676SJeff Bonwick 	mutex_exit(&spa->spa_props_lock);
1405b24ab676SJeff Bonwick }
1406b24ab676SJeff Bonwick 
/*
 * Error tallies accumulated during the spa_load_verify() pool traversal.
 * The counters are bumped with atomic_add_64() from spa_load_verify_done()
 * zio completion callbacks, which may run concurrently.
 */
typedef struct spa_load_error {
	uint64_t	sle_meta_count;	/* I/O errors on metadata blocks */
	uint64_t	sle_data_count;	/* I/O errors on plain data blocks */
} spa_load_error_t;
1411468c413aSTim Haley 
1412468c413aSTim Haley static void
1413468c413aSTim Haley spa_load_verify_done(zio_t *zio)
1414468c413aSTim Haley {
1415468c413aSTim Haley 	blkptr_t *bp = zio->io_bp;
1416468c413aSTim Haley 	spa_load_error_t *sle = zio->io_private;
1417468c413aSTim Haley 	dmu_object_type_t type = BP_GET_TYPE(bp);
1418468c413aSTim Haley 	int error = zio->io_error;
1419468c413aSTim Haley 
1420468c413aSTim Haley 	if (error) {
1421468c413aSTim Haley 		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
1422468c413aSTim Haley 		    type != DMU_OT_INTENT_LOG)
1423c8ee1847SVictor Latushkin 			atomic_add_64(&sle->sle_meta_count, 1);
1424468c413aSTim Haley 		else
1425468c413aSTim Haley 			atomic_add_64(&sle->sle_data_count, 1);
1426468c413aSTim Haley 	}
1427468c413aSTim Haley 	zio_data_buf_free(zio->io_data, zio->io_size);
1428468c413aSTim Haley }
1429468c413aSTim Haley 
1430468c413aSTim Haley /*ARGSUSED*/
1431468c413aSTim Haley static int
1432b24ab676SJeff Bonwick spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1433*3f9d6ad7SLin Ling     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1434468c413aSTim Haley {
1435468c413aSTim Haley 	if (bp != NULL) {
1436468c413aSTim Haley 		zio_t *rio = arg;
1437468c413aSTim Haley 		size_t size = BP_GET_PSIZE(bp);
1438468c413aSTim Haley 		void *data = zio_data_buf_alloc(size);
1439468c413aSTim Haley 
1440468c413aSTim Haley 		zio_nowait(zio_read(rio, spa, bp, data, size,
1441468c413aSTim Haley 		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1442468c413aSTim Haley 		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1443468c413aSTim Haley 		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1444468c413aSTim Haley 	}
1445468c413aSTim Haley 	return (0);
1446468c413aSTim Haley }
1447468c413aSTim Haley 
/*
 * Verify the pool by traversing it from spa_verify_min_txg and issuing
 * scrub reads for every block (see spa_load_verify_cb()).  The rewind
 * policy in spa_config decides whether verification runs at all and how
 * many metadata/data errors are tolerable.  Returns 0 if the pool passes,
 * EIO/ENXIO otherwise; also records the error counts and the load/max txg
 * bookkeeping used by the rewind machinery.
 */
static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	/* If rewind was explicitly disallowed, skip verification entirely. */
	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	/* Root zio collecting all the speculative verification reads. */
	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	/* Wait for all child reads; errors are tallied in sle, not here. */
	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		/* Within policy limits: remember the verified txg/time. */
		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
	} else {
		/* Failed: this txg becomes the ceiling for rewind attempts. */
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		/* Squash unexpected traversal errors to EIO. */
		if (error != ENXIO && error != EIO)
			error = EIO;
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}
1490468c413aSTim Haley 
14911195e687SMark J Musante /*
14921195e687SMark J Musante  * Find a value in the pool props object.
14931195e687SMark J Musante  */
14941195e687SMark J Musante static void
14951195e687SMark J Musante spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
14961195e687SMark J Musante {
14971195e687SMark J Musante 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
14981195e687SMark J Musante 	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
14991195e687SMark J Musante }
15001195e687SMark J Musante 
15011195e687SMark J Musante /*
15021195e687SMark J Musante  * Find a value in the pool directory object.
15031195e687SMark J Musante  */
15041195e687SMark J Musante static int
15051195e687SMark J Musante spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
15061195e687SMark J Musante {
15071195e687SMark J Musante 	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
15081195e687SMark J Musante 	    name, sizeof (uint64_t), 1, val));
15091195e687SMark J Musante }
15101195e687SMark J Musante 
/*
 * Helper for spa_load_impl() error paths: mark the given vdev (typically
 * the root vdev) unopenable with the supplied aux reason, then hand the
 * caller's errno straight back so it can be used in a return statement.
 */
static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (err);
}
15171195e687SMark J Musante 
/*
 * Fix up config after a partly-completed split.  This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
 *
 * This function determines what to do with that list: either rejoin
 * all the disks to the pool, or complete the splitting process.  To attempt
 * the rejoin, each disk that is offlined is marked online again, and
 * we do a reopen() call.  If the vdev label for every disk that was
 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 * then we call vdev_split() on each disk, and complete the split.
 *
 * Otherwise we leave the config alone, with all the vdevs in place in
 * the original pool.
 */
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
	uint_t extracted;
	uint64_t *glist;
	uint_t i, gcount;
	nvlist_t *nvl;
	vdev_t **vd;
	boolean_t attempt_reopen;

	/* No ZPOOL_CONFIG_SPLIT entry: this pool isn't mid-split. */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
		return;

	/* check that the config is complete */
	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) != 0)
		return;

	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

	/* attempt to online all the vdevs & validate */
	attempt_reopen = B_TRUE;
	for (i = 0; i < gcount; i++) {
		if (glist[i] == 0)	/* vdev is hole */
			continue;

		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
		if (vd[i] == NULL) {
			/*
			 * Don't bother attempting to reopen the disks;
			 * just do the split.
			 */
			attempt_reopen = B_FALSE;
		} else {
			/* attempt to re-online it */
			vd[i]->vdev_offline = B_FALSE;
		}
	}

	if (attempt_reopen) {
		vdev_reopen(spa->spa_root_vdev);

		/* check each device to see what state it's in */
		for (extracted = 0, i = 0; i < gcount; i++) {
			/*
			 * Stop counting at the first device whose label
			 * does not say VDEV_AUX_SPLIT_POOL; holes and
			 * missing devices (NULL) count as extracted.
			 */
			if (vd[i] != NULL &&
			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
				break;
			++extracted;
		}
	}

	/*
	 * If every disk has been moved to the new pool, or if we never
	 * even attempted to look at them, then we split them off for
	 * good.
	 */
	if (!attempt_reopen || gcount == extracted) {
		for (i = 0; i < gcount; i++)
			if (vd[i] != NULL)
				vdev_split(vd[i]);
		vdev_reopen(spa->spa_root_vdev);
	}

	kmem_free(vd, gcount * sizeof (vdev_t *));
}
15991195e687SMark J Musante 
/*
 * Outer entry point for loading a pool from its spa_config.  Validates the
 * config (pool guid must be present; on import, the guid must not collide
 * with an active pool), records the label version and config txg, stashes
 * any in-progress split nvlist, then calls spa_load_impl() to do the real
 * work.  On failure (other than EBADF) an FMA ereport is posted with the
 * class chosen by spa_load_impl() via *ereport.
 */
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
    boolean_t mosconfig)
{
	nvlist_t *config = spa->spa_config;
	char *ereport = FM_EREPORT_ZFS_POOL;
	int error;
	uint64_t pool_guid;
	nvlist_t *nvl;

	/* A config without a pool guid is unusable. */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		/* Importing a guid that already exists is an error. */
		error = EEXIST;
	} else {
		spa->spa_load_guid = pool_guid;

		/*
		 * Preserve any partial-split state so spa_load_impl() /
		 * spa_try_repair() can finish or undo the split.
		 */
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
		    &nvl) == 0) {
			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
			    KM_SLEEP) == 0);
		}

		error = spa_load_impl(spa, pool_guid, config, state, type,
		    mosconfig, &ereport);
	}

	/* Refs held at this point are the pool's baseline reference count. */
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}
16481195e687SMark J Musante 
1649fa9e4066Sahrens /*
1650fa9e4066Sahrens  * Load an existing storage pool, using the pool's builtin spa_config as a
1651ea8dc4b6Seschrock  * source of configuration information.
1652fa9e4066Sahrens  */
1653fa9e4066Sahrens static int
16541195e687SMark J Musante spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
16551195e687SMark J Musante     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
16561195e687SMark J Musante     char **ereport)
1657fa9e4066Sahrens {
1658fa9e4066Sahrens 	int error = 0;
1659871a9500SMark J Musante 	nvlist_t *nvroot = NULL;
1660fa9e4066Sahrens 	vdev_t *rvd;
1661fa9e4066Sahrens 	uberblock_t *ub = &spa->spa_uberblock;
16620373e76bSbonwick 	uint64_t config_cache_txg = spa->spa_config_txg;
16638ad4d6ddSJeff Bonwick 	int orig_mode = spa->spa_mode;
16641195e687SMark J Musante 	int parse;
1665fa9e4066Sahrens 
16668ad4d6ddSJeff Bonwick 	/*
16678ad4d6ddSJeff Bonwick 	 * If this is an untrusted config, access the pool in read-only mode.
16688ad4d6ddSJeff Bonwick 	 * This prevents things like resilvering recently removed devices.
16698ad4d6ddSJeff Bonwick 	 */
16708ad4d6ddSJeff Bonwick 	if (!mosconfig)
16718ad4d6ddSJeff Bonwick 		spa->spa_mode = FREAD;
16728ad4d6ddSJeff Bonwick 
1673e14bb325SJeff Bonwick 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1674e14bb325SJeff Bonwick 
1675ea8dc4b6Seschrock 	spa->spa_load_state = state;
16760373e76bSbonwick 
16771195e687SMark J Musante 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
16781195e687SMark J Musante 		return (EINVAL);
1679fa9e4066Sahrens 
16801195e687SMark J Musante 	parse = (type == SPA_IMPORT_EXISTING ?
16811195e687SMark J Musante 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1682b5989ec7Seschrock 
168354d692b7SGeorge Wilson 	/*
168454d692b7SGeorge Wilson 	 * Create "The Godfather" zio to hold all async IOs
168554d692b7SGeorge Wilson 	 */
168625f89ee2SJeff Bonwick 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
168725f89ee2SJeff Bonwick 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
168854d692b7SGeorge Wilson 
1689fa9e4066Sahrens 	/*
169099653d4eSeschrock 	 * Parse the configuration into a vdev tree.  We explicitly set the
169199653d4eSeschrock 	 * value that will be returned by spa_version() since parsing the
169299653d4eSeschrock 	 * configuration requires knowing the version number.
1693fa9e4066Sahrens 	 */
1694e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
16951195e687SMark J Musante 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
1696e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALL, FTAG);
1697fa9e4066Sahrens 
169899653d4eSeschrock 	if (error != 0)
16991195e687SMark J Musante 		return (error);
1700fa9e4066Sahrens 
17010e34b6a7Sbonwick 	ASSERT(spa->spa_root_vdev == rvd);
17021195e687SMark J Musante 
17031195e687SMark J Musante 	if (type != SPA_IMPORT_ASSEMBLE) {
17041195e687SMark J Musante 		ASSERT(spa_guid(spa) == pool_guid);
17051195e687SMark J Musante 	}
1706fa9e4066Sahrens 
1707fa9e4066Sahrens 	/*
1708fa9e4066Sahrens 	 * Try to open all vdevs, loading each label in the process.
1709fa9e4066Sahrens 	 */
1710e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
17110bf246f5Smc 	error = vdev_open(rvd);
1712e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALL, FTAG);
17130bf246f5Smc 	if (error != 0)
17141195e687SMark J Musante 		return (error);
1715fa9e4066Sahrens 
1716560e6e96Seschrock 	/*
171777e3a39cSMark J Musante 	 * We need to validate the vdev labels against the configuration that
171877e3a39cSMark J Musante 	 * we have in hand, which is dependent on the setting of mosconfig. If
171977e3a39cSMark J Musante 	 * mosconfig is true then we're validating the vdev labels based on
17201195e687SMark J Musante 	 * that config.  Otherwise, we're validating against the cached config
172177e3a39cSMark J Musante 	 * (zpool.cache) that was read when we loaded the zfs module, and then
172277e3a39cSMark J Musante 	 * later we will recursively call spa_load() and validate against
172377e3a39cSMark J Musante 	 * the vdev config.
17241195e687SMark J Musante 	 *
17251195e687SMark J Musante 	 * If we're assembling a new pool that's been split off from an
17261195e687SMark J Musante 	 * existing pool, the labels haven't yet been updated so we skip
17271195e687SMark J Musante 	 * validation for now.
1728560e6e96Seschrock 	 */
17291195e687SMark J Musante 	if (type != SPA_IMPORT_ASSEMBLE) {
17301195e687SMark J Musante 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
17311195e687SMark J Musante 		error = vdev_validate(rvd);
17321195e687SMark J Musante 		spa_config_exit(spa, SCL_ALL, FTAG);
1733560e6e96Seschrock 
17341195e687SMark J Musante 		if (error != 0)
17351195e687SMark J Musante 			return (error);
17361195e687SMark J Musante 
17371195e687SMark J Musante 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
17381195e687SMark J Musante 			return (ENXIO);
1739560e6e96Seschrock 	}
1740560e6e96Seschrock 
1741fa9e4066Sahrens 	/*
1742fa9e4066Sahrens 	 * Find the best uberblock.
1743fa9e4066Sahrens 	 */
1744e14bb325SJeff Bonwick 	vdev_uberblock_load(NULL, rvd, ub);
1745fa9e4066Sahrens 
1746fa9e4066Sahrens 	/*
1747fa9e4066Sahrens 	 * If we weren't able to find a single valid uberblock, return failure.
1748fa9e4066Sahrens 	 */
17491195e687SMark J Musante 	if (ub->ub_txg == 0)
17501195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
1751ea8dc4b6Seschrock 
1752ea8dc4b6Seschrock 	/*
1753ea8dc4b6Seschrock 	 * If the pool is newer than the code, we can't open it.
1754ea8dc4b6Seschrock 	 */
17551195e687SMark J Musante 	if (ub->ub_version > SPA_VERSION)
17561195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
1757fa9e4066Sahrens 
1758fa9e4066Sahrens 	/*
1759fa9e4066Sahrens 	 * If the vdev guid sum doesn't match the uberblock, we have an
1760fa9e4066Sahrens 	 * incomplete configuration.
1761fa9e4066Sahrens 	 */
17621195e687SMark J Musante 	if (mosconfig && type != SPA_IMPORT_ASSEMBLE &&
17631195e687SMark J Musante 	    rvd->vdev_guid_sum != ub->ub_guid_sum)
17641195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
17651195e687SMark J Musante 
17661195e687SMark J Musante 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
17671195e687SMark J Musante 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
17681195e687SMark J Musante 		spa_try_repair(spa, config);
17691195e687SMark J Musante 		spa_config_exit(spa, SCL_ALL, FTAG);
17701195e687SMark J Musante 		nvlist_free(spa->spa_config_splitting);
17711195e687SMark J Musante 		spa->spa_config_splitting = NULL;
1772fa9e4066Sahrens 	}
1773fa9e4066Sahrens 
1774fa9e4066Sahrens 	/*
1775fa9e4066Sahrens 	 * Initialize internal SPA structures.
1776fa9e4066Sahrens 	 */
1777fa9e4066Sahrens 	spa->spa_state = POOL_STATE_ACTIVE;
1778fa9e4066Sahrens 	spa->spa_ubsync = spa->spa_uberblock;
1779468c413aSTim Haley 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
1780c8ee1847SVictor Latushkin 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
1781468c413aSTim Haley 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
1782468c413aSTim Haley 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
1783b24ab676SJeff Bonwick 	spa->spa_claim_max_txg = spa->spa_first_txg;
1784*3f9d6ad7SLin Ling 	spa->spa_prev_software_version = ub->ub_software_version;
1785b24ab676SJeff Bonwick 
1786ea8dc4b6Seschrock 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
17871195e687SMark J Musante 	if (error)
17881195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1789fa9e4066Sahrens 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1790fa9e4066Sahrens 
17911195e687SMark J Musante 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
17921195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1793fa9e4066Sahrens 
1794fa9e4066Sahrens 	if (!mosconfig) {
179595173954Sek 		uint64_t hostid;
1796871a9500SMark J Musante 		nvlist_t *policy = NULL, *nvconfig;
1797871a9500SMark J Musante 
1798871a9500SMark J Musante 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
1799871a9500SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1800fa9e4066Sahrens 
180188ecc943SGeorge Wilson 		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
180277650510SLin Ling 		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
180395173954Sek 			char *hostname;
180495173954Sek 			unsigned long myhostid = 0;
180595173954Sek 
180688ecc943SGeorge Wilson 			VERIFY(nvlist_lookup_string(nvconfig,
180795173954Sek 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
180895173954Sek 
18095679c89fSjv #ifdef	_KERNEL
18105679c89fSjv 			myhostid = zone_get_hostid(NULL);
18115679c89fSjv #else	/* _KERNEL */
18125679c89fSjv 			/*
18135679c89fSjv 			 * We're emulating the system's hostid in userland, so
18145679c89fSjv 			 * we can't use zone_get_hostid().
18155679c89fSjv 			 */
181695173954Sek 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
18175679c89fSjv #endif	/* _KERNEL */
181817194a52Slling 			if (hostid != 0 && myhostid != 0 &&
18195679c89fSjv 			    hostid != myhostid) {
1820871a9500SMark J Musante 				nvlist_free(nvconfig);
182195173954Sek 				cmn_err(CE_WARN, "pool '%s' could not be "
182295173954Sek 				    "loaded as it was last accessed by "
182377650510SLin Ling 				    "another system (host: %s hostid: 0x%lx). "
182495173954Sek 				    "See: http://www.sun.com/msg/ZFS-8000-EY",
1825e14bb325SJeff Bonwick 				    spa_name(spa), hostname,
182695173954Sek 				    (unsigned long)hostid);
18271195e687SMark J Musante 				return (EBADF);
182895173954Sek 			}
182995173954Sek 		}
1830c8ee1847SVictor Latushkin 		if (nvlist_lookup_nvlist(spa->spa_config,
1831c8ee1847SVictor Latushkin 		    ZPOOL_REWIND_POLICY, &policy) == 0)
1832c8ee1847SVictor Latushkin 			VERIFY(nvlist_add_nvlist(nvconfig,
1833c8ee1847SVictor Latushkin 			    ZPOOL_REWIND_POLICY, policy) == 0);
183495173954Sek 
183588ecc943SGeorge Wilson 		spa_config_set(spa, nvconfig);
1836fa9e4066Sahrens 		spa_unload(spa);
1837fa9e4066Sahrens 		spa_deactivate(spa);
18388ad4d6ddSJeff Bonwick 		spa_activate(spa, orig_mode);
1839fa9e4066Sahrens 
18401195e687SMark J Musante 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
1841fa9e4066Sahrens 	}
1842fa9e4066Sahrens 
18431195e687SMark J Musante 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST,
18441195e687SMark J Musante 	    &spa->spa_deferred_bplist_obj) != 0)
18451195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1846fa9e4066Sahrens 
184799653d4eSeschrock 	/*
184899653d4eSeschrock 	 * Load the bit that tells us to use the new accounting function
184999653d4eSeschrock 	 * (raid-z deflation).  If we have an older pool, this will not
185099653d4eSeschrock 	 * be present.
185199653d4eSeschrock 	 */
18521195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
18531195e687SMark J Musante 	if (error != 0 && error != ENOENT)
18541195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
185599653d4eSeschrock 
1856*3f9d6ad7SLin Ling 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
1857*3f9d6ad7SLin Ling 	    &spa->spa_creation_version);
1858*3f9d6ad7SLin Ling 	if (error != 0 && error != ENOENT)
1859*3f9d6ad7SLin Ling 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1860*3f9d6ad7SLin Ling 
1861fa9e4066Sahrens 	/*
1862ea8dc4b6Seschrock 	 * Load the persistent error log.  If we have an older pool, this will
1863ea8dc4b6Seschrock 	 * not be present.
1864fa9e4066Sahrens 	 */
18651195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
18661195e687SMark J Musante 	if (error != 0 && error != ENOENT)
18671195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1868ea8dc4b6Seschrock 
18691195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
18701195e687SMark J Musante 	    &spa->spa_errlog_scrub);
18711195e687SMark J Musante 	if (error != 0 && error != ENOENT)
18721195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1873ea8dc4b6Seschrock 
187406eeb2adSek 	/*
187506eeb2adSek 	 * Load the history object.  If we have an older pool, this
187606eeb2adSek 	 * will not be present.
187706eeb2adSek 	 */
18781195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
18791195e687SMark J Musante 	if (error != 0 && error != ENOENT)
18801195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
18811195e687SMark J Musante 
18821195e687SMark J Musante 	/*
18831195e687SMark J Musante 	 * If we're assembling the pool from the split-off vdevs of
18841195e687SMark J Musante 	 * an existing pool, we don't want to attach the spares & cache
18851195e687SMark J Musante 	 * devices.
18861195e687SMark J Musante 	 */
188706eeb2adSek 
188899653d4eSeschrock 	/*
188999653d4eSeschrock 	 * Load any hot spares for this pool.
189099653d4eSeschrock 	 */
18911195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
18921195e687SMark J Musante 	if (error != 0 && error != ENOENT)
18931195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
18941195e687SMark J Musante 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
1895e7437265Sahrens 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
1896fa94a07fSbrendan 		if (load_nvlist(spa, spa->spa_spares.sav_object,
18971195e687SMark J Musante 		    &spa->spa_spares.sav_config) != 0)
18981195e687SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
189999653d4eSeschrock 
1900e14bb325SJeff Bonwick 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
190199653d4eSeschrock 		spa_load_spares(spa);
1902e14bb325SJeff Bonwick 		spa_config_exit(spa, SCL_ALL, FTAG);
19031195e687SMark J Musante 	} else if (error == 0) {
19041195e687SMark J Musante 		spa->spa_spares.sav_sync = B_TRUE;
190599653d4eSeschrock 	}
190699653d4eSeschrock 
1907fa94a07fSbrendan 	/*
1908fa94a07fSbrendan 	 * Load any level 2 ARC devices for this pool.
1909fa94a07fSbrendan 	 */
19101195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
1911fa94a07fSbrendan 	    &spa->spa_l2cache.sav_object);
19121195e687SMark J Musante 	if (error != 0 && error != ENOENT)
19131195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
19141195e687SMark J Musante 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
1915fa94a07fSbrendan 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
1916fa94a07fSbrendan 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
19171195e687SMark J Musante 		    &spa->spa_l2cache.sav_config) != 0)
19181195e687SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1919fa94a07fSbrendan 
1920e14bb325SJeff Bonwick 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1921fa94a07fSbrendan 		spa_load_l2cache(spa);
1922e14bb325SJeff Bonwick 		spa_config_exit(spa, SCL_ALL, FTAG);
19231195e687SMark J Musante 	} else if (error == 0) {
19241195e687SMark J Musante 		spa->spa_l2cache.sav_sync = B_TRUE;
1925fa94a07fSbrendan 	}
1926fa94a07fSbrendan 
1927990b4856Slling 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
1928ecd6cf80Smarks 
19291195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
19301195e687SMark J Musante 	if (error && error != ENOENT)
19311195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1932b1b8ab34Slling 
1933b1b8ab34Slling 	if (error == 0) {
19341195e687SMark J Musante 		uint64_t autoreplace;
19351195e687SMark J Musante 
19361195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
19371195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
19381195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
19391195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
19401195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
19411195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
19421195e687SMark J Musante 		    &spa->spa_dedup_ditto);
19431195e687SMark J Musante 
1944b693757aSEric Schrock 		spa->spa_autoreplace = (autoreplace != 0);
1945b1b8ab34Slling 	}
1946b1b8ab34Slling 
19473d7072f8Seschrock 	/*
19483d7072f8Seschrock 	 * If the 'autoreplace' property is set, then post a resource notifying
19493d7072f8Seschrock 	 * the ZFS DE that it should not issue any faults for unopenable
19503d7072f8Seschrock 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
19513d7072f8Seschrock 	 * unopenable vdevs so that the normal autoreplace handler can take
19523d7072f8Seschrock 	 * over.
19533d7072f8Seschrock 	 */
1954b693757aSEric Schrock 	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
19553d7072f8Seschrock 		spa_check_removed(spa->spa_root_vdev);
1956b693757aSEric Schrock 		/*
1957b693757aSEric Schrock 		 * For the import case, this is done in spa_import(), because
1958b693757aSEric Schrock 		 * at this point we're using the spare definitions from
1959b693757aSEric Schrock 		 * the MOS config, not necessarily from the userland config.
1960b693757aSEric Schrock 		 */
1961b693757aSEric Schrock 		if (state != SPA_LOAD_IMPORT) {
1962b693757aSEric Schrock 			spa_aux_check_removed(&spa->spa_spares);
1963b693757aSEric Schrock 			spa_aux_check_removed(&spa->spa_l2cache);
1964b693757aSEric Schrock 		}
1965b693757aSEric Schrock 	}
19663d7072f8Seschrock 
1967ea8dc4b6Seschrock 	/*
1968560e6e96Seschrock 	 * Load the vdev state for all toplevel vdevs.
1969ea8dc4b6Seschrock 	 */
1970560e6e96Seschrock 	vdev_load(rvd);
19710373e76bSbonwick 
1972fa9e4066Sahrens 	/*
1973fa9e4066Sahrens 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
1974fa9e4066Sahrens 	 */
1975e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1976fa9e4066Sahrens 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
1977e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALL, FTAG);
1978fa9e4066Sahrens 
1979fa9e4066Sahrens 	/*
1980fa9e4066Sahrens 	 * Check the state of the root vdev.  If it can't be opened, it
1981fa9e4066Sahrens 	 * indicates one or more toplevel vdevs are faulted.
1982fa9e4066Sahrens 	 */
19831195e687SMark J Musante 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
19841195e687SMark J Musante 		return (ENXIO);
1985fa9e4066Sahrens 
1986b24ab676SJeff Bonwick 	/*
1987b24ab676SJeff Bonwick 	 * Load the DDTs (dedup tables).
1988b24ab676SJeff Bonwick 	 */
1989b24ab676SJeff Bonwick 	error = ddt_load(spa);
19901195e687SMark J Musante 	if (error != 0)
19911195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1992b24ab676SJeff Bonwick 
1993485bbbf5SGeorge Wilson 	spa_update_dspace(spa);
1994485bbbf5SGeorge Wilson 
1995468c413aSTim Haley 	if (state != SPA_LOAD_TRYIMPORT) {
1996468c413aSTim Haley 		error = spa_load_verify(spa);
19971195e687SMark J Musante 		if (error)
19981195e687SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
19991195e687SMark J Musante 			    error));
2000468c413aSTim Haley 	}
2001468c413aSTim Haley 
2002b24ab676SJeff Bonwick 	/*
20031195e687SMark J Musante 	 * Load the intent log state and check log integrity.  If we're
20041195e687SMark J Musante 	 * assembling a pool from a split, the log is not transferred over.
2005b24ab676SJeff Bonwick 	 */
20061195e687SMark J Musante 	if (type != SPA_IMPORT_ASSEMBLE) {
2007871a9500SMark J Musante 		nvlist_t *nvconfig;
2008871a9500SMark J Musante 
2009871a9500SMark J Musante 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2010871a9500SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2011871a9500SMark J Musante 
20121195e687SMark J Musante 		VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
20131195e687SMark J Musante 		    &nvroot) == 0);
20141195e687SMark J Musante 		spa_load_log_state(spa, nvroot);
20151195e687SMark J Musante 		nvlist_free(nvconfig);
20161195e687SMark J Musante 
20171195e687SMark J Musante 		if (spa_check_logs(spa)) {
20181195e687SMark J Musante 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
20191195e687SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
20201195e687SMark J Musante 		}
2021b24ab676SJeff Bonwick 	}
2022b24ab676SJeff Bonwick 
2023468c413aSTim Haley 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2024468c413aSTim Haley 	    spa->spa_load_max_txg == UINT64_MAX)) {
20255dabedeeSbonwick 		dmu_tx_t *tx;
20260373e76bSbonwick 		int need_update = B_FALSE;
20278ad4d6ddSJeff Bonwick 
20288ad4d6ddSJeff Bonwick 		ASSERT(state != SPA_LOAD_TRYIMPORT);
20295dabedeeSbonwick 
20300373e76bSbonwick 		/*
20310373e76bSbonwick 		 * Claim log blocks that haven't been committed yet.
20320373e76bSbonwick 		 * This must all happen in a single txg.
2033b24ab676SJeff Bonwick 		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2034b24ab676SJeff Bonwick 		 * invoked from zil_claim_log_block()'s i/o done callback.
2035468c413aSTim Haley 		 * Price of rollback is that we abandon the log.
20360373e76bSbonwick 		 */
2037b24ab676SJeff Bonwick 		spa->spa_claiming = B_TRUE;
2038b24ab676SJeff Bonwick 
20395dabedeeSbonwick 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2040fa9e4066Sahrens 		    spa_first_txg(spa));
2041e14bb325SJeff Bonwick 		(void) dmu_objset_find(spa_name(spa),
20420b69c2f0Sahrens 		    zil_claim, tx, DS_FIND_CHILDREN);
2043fa9e4066Sahrens 		dmu_tx_commit(tx);
2044fa9e4066Sahrens 
2045b24ab676SJeff Bonwick 		spa->spa_claiming = B_FALSE;
2046b24ab676SJeff Bonwick 
20471195e687SMark J Musante 		spa_set_log_state(spa, SPA_LOG_GOOD);
2048fa9e4066Sahrens 		spa->spa_sync_on = B_TRUE;
2049fa9e4066Sahrens 		txg_sync_start(spa->spa_dsl_pool);
2050fa9e4066Sahrens 
2051fa9e4066Sahrens 		/*
2052b24ab676SJeff Bonwick 		 * Wait for all claims to sync.  We sync up to the highest
2053b24ab676SJeff Bonwick 		 * claimed log block birth time so that claimed log blocks
2054b24ab676SJeff Bonwick 		 * don't appear to be from the future.  spa_claim_max_txg
2055b24ab676SJeff Bonwick 		 * will have been set for us by either zil_check_log_chain()
2056b24ab676SJeff Bonwick 		 * (invoked from spa_check_logs()) or zil_claim() above.
2057fa9e4066Sahrens 		 */
2058b24ab676SJeff Bonwick 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
20590e34b6a7Sbonwick 
20600e34b6a7Sbonwick 		/*
20610373e76bSbonwick 		 * If the config cache is stale, or we have uninitialized
20620373e76bSbonwick 		 * metaslabs (see spa_vdev_add()), then update the config.
2063bc758434SLin Ling 		 *
2064bc758434SLin Ling 		 * If spa_load_verbatim is true, trust the current
2065bc758434SLin Ling 		 * in-core spa_config and update the disk labels.
20660e34b6a7Sbonwick 		 */
20670373e76bSbonwick 		if (config_cache_txg != spa->spa_config_txg ||
2068468c413aSTim Haley 		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
2069468c413aSTim Haley 		    state == SPA_LOAD_RECOVER)
20700373e76bSbonwick 			need_update = B_TRUE;
20710373e76bSbonwick 
20728ad4d6ddSJeff Bonwick 		for (int c = 0; c < rvd->vdev_children; c++)
20730373e76bSbonwick 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
20740373e76bSbonwick 				need_update = B_TRUE;
20750e34b6a7Sbonwick 
20760e34b6a7Sbonwick 		/*
20770373e76bSbonwick 		 * Update the config cache asychronously in case we're the
20780373e76bSbonwick 		 * root pool, in which case the config cache isn't writable yet.
20790e34b6a7Sbonwick 		 */
20800373e76bSbonwick 		if (need_update)
20810373e76bSbonwick 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
20828ad4d6ddSJeff Bonwick 
20838ad4d6ddSJeff Bonwick 		/*
20848ad4d6ddSJeff Bonwick 		 * Check all DTLs to see if anything needs resilvering.
20858ad4d6ddSJeff Bonwick 		 */
2086*3f9d6ad7SLin Ling 		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2087*3f9d6ad7SLin Ling 		    vdev_resilver_needed(rvd, NULL, NULL))
20888ad4d6ddSJeff Bonwick 			spa_async_request(spa, SPA_ASYNC_RESILVER);
2089503ad85cSMatthew Ahrens 
2090503ad85cSMatthew Ahrens 		/*
2091503ad85cSMatthew Ahrens 		 * Delete any inconsistent datasets.
2092503ad85cSMatthew Ahrens 		 */
2093503ad85cSMatthew Ahrens 		(void) dmu_objset_find(spa_name(spa),
2094503ad85cSMatthew Ahrens 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2095ca45db41SChris Kirby 
2096ca45db41SChris Kirby 		/*
2097ca45db41SChris Kirby 		 * Clean up any stale temporary dataset userrefs.
2098ca45db41SChris Kirby 		 */
2099ca45db41SChris Kirby 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2100fa9e4066Sahrens 	}
2101fa9e4066Sahrens 
21021195e687SMark J Musante 	return (0);
2103fa9e4066Sahrens }
2104fa9e4066Sahrens 
/*
 * Tear the pool down and reload it at one txg earlier than the attempt
 * that just failed.  This is the single step of the rewind walk driven
 * by spa_load_best().  Returns the error from the retried spa_load().
 */
static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	/* Discard all in-core state from the failed load attempt. */
	spa_unload(spa);
	spa_deactivate(spa);

	/*
	 * Lower the ceiling so the uberblock selection during the next
	 * load cannot pick the txg that just failed.
	 */
	spa->spa_load_max_txg--;

	/* Re-activate, keeping async tasks quiesced while we retry. */
	spa_activate(spa, spa_mode_global);
	spa_async_suspend(spa);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
}
2118468c413aSTim Haley 
/*
 * Attempt to load the pool at the newest acceptable txg; on failure,
 * optionally rewind to successively older txgs (per rewind_flags) until
 * a loadable one is found or the rewind window is exhausted.
 *
 * Returns 0 on success.  On failure, returns the rewind error if we were
 * asked to recover (SPA_LOAD_RECOVER), otherwise the original load error.
 */
static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, int rewind_flags)
{
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	/*
	 * When recovering to a specific txg requested by the user, start
	 * there and discard the intent log; otherwise honor max_request.
	 */
	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
	}

	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
	    mosconfig);
	if (load_error == 0)
		return (0);

	/* Capture the failed config so it can be reported to the caller. */
	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/* Remember the last uberblock we managed to sync with. */
	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		return (load_error);
	}

	/* Price of rolling back is discarding txgs, including log */
	if (state == SPA_LOAD_RECOVER)
		spa_set_log_state(spa, SPA_LOG_CLEAR);

	/*
	 * Rewind window: by default stop at TXG_DEFER_SIZE txgs before the
	 * last synced uberblock; with ZPOOL_EXTREME_REWIND go all the way
	 * back to TXG_INITIAL.
	 */
	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
	    TXG_INITIAL : safe_rewind_txg;

	/*
	 * Continue as long as we're finding errors, we're still within
	 * the acceptable rewind range, and we're still finding uberblocks
	 */
	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
		if (spa->spa_load_max_txg < safe_rewind_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state, mosconfig);
	}

	/* Annotate the config with what the rewind discarded. */
	if (config)
		spa_rewind_data_to_nvlist(spa, config);

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	/* Publish the generated config unless a recovery fully succeeded. */
	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);

	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}
2182468c413aSTim Haley 
/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time open the pool, without having to keep around the spa_t in some
 * ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
    nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		spa_load_state_t state = SPA_LOAD_OPEN;
		zpool_rewind_policy_t policy;

		/*
		 * The rewind policy may come from the caller (nvpolicy) or,
		 * failing that, from the cached pool config.
		 */
		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
		    &policy);
		if (policy.zrp_request & ZPOOL_DO_REWIND)
			state = SPA_LOAD_RECOVER;

		spa_activate(spa, spa_mode_global);

		/*
		 * If the previous open failed and no rewind was requested,
		 * fail fast with the cached error and config rather than
		 * attempting another full load.
		 */
		if (spa->spa_last_open_failed && (policy.zrp_request &
		    (ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND))) {
			if (config != NULL && spa->spa_config)
				VERIFY(nvlist_dup(spa->spa_config,
				    config, KM_SLEEP) == 0);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (spa->spa_last_open_failed);
		}

		/* A plain open starts with a clean rewind history. */
		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
		    policy.zrp_request);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_config)
				VERIFY(nvlist_dup(spa->spa_config, config,
				    KM_SLEEP) == 0);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = error;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}

	}

	spa_open_ref(spa, tag);


	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/*
	 * Successful open: clear the rewind bookkeeping.  Only done when we
	 * took the namespace lock ourselves (i.e. the outermost open).
	 */
	if (locked) {
		spa->spa_last_open_failed = 0;
		spa->spa_last_ubsync_txg = 0;
		spa->spa_load_txg = 0;
		mutex_exit(&spa_namespace_lock);
	}

	*spapp = spa;

	return (0);
}
2304fa9e4066Sahrens 
2305468c413aSTim Haley int
2306468c413aSTim Haley spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2307468c413aSTim Haley     nvlist_t **config)
2308468c413aSTim Haley {
2309468c413aSTim Haley 	return (spa_open_common(name, spapp, tag, policy, config));
2310468c413aSTim Haley }
2311468c413aSTim Haley 
2312fa9e4066Sahrens int
2313fa9e4066Sahrens spa_open(const char *name, spa_t **spapp, void *tag)
2314fa9e4066Sahrens {
2315468c413aSTim Haley 	return (spa_open_common(name, spapp, tag, NULL, NULL));
2316fa9e4066Sahrens }
2317fa9e4066Sahrens 
2318ea8dc4b6Seschrock /*
2319ea8dc4b6Seschrock  * Lookup the given spa_t, incrementing the inject count in the process,
2320ea8dc4b6Seschrock  * preventing it from being exported or destroyed.
2321ea8dc4b6Seschrock  */
2322ea8dc4b6Seschrock spa_t *
2323ea8dc4b6Seschrock spa_inject_addref(char *name)
2324ea8dc4b6Seschrock {
2325ea8dc4b6Seschrock 	spa_t *spa;
2326ea8dc4b6Seschrock 
2327ea8dc4b6Seschrock 	mutex_enter(&spa_namespace_lock);
2328ea8dc4b6Seschrock 	if ((spa = spa_lookup(name)) == NULL) {
2329ea8dc4b6Seschrock 		mutex_exit(&spa_namespace_lock);
2330ea8dc4b6Seschrock 		return (NULL);
2331ea8dc4b6Seschrock 	}
2332ea8dc4b6Seschrock 	spa->spa_inject_ref++;
2333ea8dc4b6Seschrock 	mutex_exit(&spa_namespace_lock);
2334ea8dc4b6Seschrock 
2335ea8dc4b6Seschrock 	return (spa);
2336ea8dc4b6Seschrock }
2337ea8dc4b6Seschrock 
/*
 * Drop an inject reference taken by spa_inject_addref(), allowing the
 * pool to be exported or destroyed again once the count reaches zero.
 */
void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}
2345ea8dc4b6Seschrock 
/*
 * Add spares device information to the nvlist.
 *
 * Copies the pool's cached spare list into the config's vdev tree and
 * then marks any spare that is currently in use by another pool as
 * CANT_OPEN/SPARED so userland reports it correctly.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	/* Nothing to add if the pool has no spares configured. */
	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		/*
		 * Attach the spare array to the root, then re-lookup so we
		 * operate on the copies now owned by the config nvlist.
		 */
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}
239499653d4eSeschrock 
/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	/* Nothing to add if the pool has no cache devices configured. */
	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		/*
		 * Attach the cache-device array to the root, then re-lookup
		 * so we operate on the copies owned by the config nvlist.
		 */
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */

		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			/*
			 * Find the in-core vdev matching this guid; it must
			 * exist because the array was built from sav_vdevs.
			 */
			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
			    == 0);
			vdev_get_stats(vd, vs);
		}
	}
}
2449fa94a07fSbrendan 
/*
 * Open the named pool and generate its config nvlist, augmented with the
 * error-log size, suspension state, and spare/l2cache information.  The
 * alternate root (if requested via altroot/buflen) is returned even for
 * pools that fail to open.  Returns the error from spa_open_common().
 */
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	/* spa may be set (and the config generated) even when this fails. */
	error = spa_open_common(name, &spa, FTAG, NULL, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			/* Looked-up only; no ref was taken, so don't close. */
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	/* Drop the config lock and the open reference taken above. */
	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}
2508fa9e4066Sahrens 
/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 *
 * On success, each device's nvlist is stamped with its vdev guid.
 * Returns 0 if the array is absent or valid, EINVAL/ENOTSUP/ENOTBLK or a
 * parse/open/label error otherwise.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	/* An array that is present but empty is malformed. */
	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		/* Auxiliary devices must be leaves, never interior vdevs. */
		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		/* Record the guid in the caller's nvlist on success. */
		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		/* The temporary vdev was only needed for validation. */
		vdev_free(vd);

		/*
		 * For spare/l2cache allocation (i.e. import), tolerate
		 * open/label failures; otherwise they are fatal.
		 */
		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	/* Always clear the pending list, even on the error paths. */
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}
259499653d4eSeschrock 
2595fa94a07fSbrendan static int
2596fa94a07fSbrendan spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
2597fa94a07fSbrendan {
2598fa94a07fSbrendan 	int error;
2599fa94a07fSbrendan 
2600e14bb325SJeff Bonwick 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2601e14bb325SJeff Bonwick 
2602fa94a07fSbrendan 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2603fa94a07fSbrendan 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
2604fa94a07fSbrendan 	    VDEV_LABEL_SPARE)) != 0) {
2605fa94a07fSbrendan 		return (error);
2606fa94a07fSbrendan 	}
2607fa94a07fSbrendan 
2608fa94a07fSbrendan 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2609fa94a07fSbrendan 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
2610fa94a07fSbrendan 	    VDEV_LABEL_L2CACHE));
2611fa94a07fSbrendan }
2612fa94a07fSbrendan 
2613fa94a07fSbrendan static void
2614fa94a07fSbrendan spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
2615fa94a07fSbrendan     const char *config)
2616fa94a07fSbrendan {
2617fa94a07fSbrendan 	int i;
2618fa94a07fSbrendan 
2619fa94a07fSbrendan 	if (sav->sav_config != NULL) {
2620fa94a07fSbrendan 		nvlist_t **olddevs;
2621fa94a07fSbrendan 		uint_t oldndevs;
2622fa94a07fSbrendan 		nvlist_t **newdevs;
2623fa94a07fSbrendan 
2624fa94a07fSbrendan 		/*
2625fa94a07fSbrendan 		 * Generate new dev list by concatentating with the
2626fa94a07fSbrendan 		 * current dev list.
2627fa94a07fSbrendan 		 */
2628fa94a07fSbrendan 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
2629fa94a07fSbrendan 		    &olddevs, &oldndevs) == 0);
2630fa94a07fSbrendan 
2631fa94a07fSbrendan 		newdevs = kmem_alloc(sizeof (void *) *
2632fa94a07fSbrendan 		    (ndevs + oldndevs), KM_SLEEP);
2633fa94a07fSbrendan 		for (i = 0; i < oldndevs; i++)
2634fa94a07fSbrendan 			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
2635fa94a07fSbrendan 			    KM_SLEEP) == 0);
2636fa94a07fSbrendan 		for (i = 0; i < ndevs; i++)
2637fa94a07fSbrendan 			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
2638fa94a07fSbrendan 			    KM_SLEEP) == 0);
2639fa94a07fSbrendan 
2640fa94a07fSbrendan 		VERIFY(nvlist_remove(sav->sav_config, config,
2641fa94a07fSbrendan 		    DATA_TYPE_NVLIST_ARRAY) == 0);
2642fa94a07fSbrendan 
2643fa94a07fSbrendan 		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2644fa94a07fSbrendan 		    config, newdevs, ndevs + oldndevs) == 0);
2645fa94a07fSbrendan 		for (i = 0; i < oldndevs + ndevs; i++)
2646fa94a07fSbrendan 			nvlist_free(newdevs[i]);
2647fa94a07fSbrendan 		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
2648fa94a07fSbrendan 	} else {
2649fa94a07fSbrendan 		/*
2650fa94a07fSbrendan 		 * Generate a new dev list.
2651fa94a07fSbrendan 		 */
2652fa94a07fSbrendan 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
2653fa94a07fSbrendan 		    KM_SLEEP) == 0);
2654fa94a07fSbrendan 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
2655fa94a07fSbrendan 		    devs, ndevs) == 0);
2656fa94a07fSbrendan 	}
2657fa94a07fSbrendan }
2658fa94a07fSbrendan 
/*
 * Stop and drop level 2 ARC devices
 *
 * For every cache device registered on this pool, detach it from the
 * L2ARC (if the L2ARC still knows about it), remove it from the global
 * L2 cache list, reset its stats, and close it.
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		/*
		 * Only remove from the L2ARC if the device is both known
		 * to it and has a non-zero owning pool guid.
		 * NOTE(review): presumably pool == 0 marks a cache device
		 * not actively in use — confirm against spa_l2cache_exists().
		 */
		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
		if (vd->vdev_isl2cache)
			spa_l2cache_remove(vd);
		/* Discard accumulated stats before closing the device. */
		vdev_clear_stats(vd);
		(void) vdev_close(vd);
	}
}
2684fa94a07fSbrendan 
/*
 * Pool Creation
 *
 * Create a brand-new pool named 'pool' from the vdev tree in 'nvroot'.
 * 'props' (may be NULL) supplies initial pool properties, 'history_str'
 * (may be NULL) is logged as the creating command when the pool version
 * supports history, and 'zplprops' is handed to dsl_pool_create().
 *
 * Returns 0 on success, EEXIST if a pool of that name already exists,
 * EINVAL if 'nvroot' describes no allocatable devices, or an error from
 * property validation / config parsing / vdev creation.  On any failure
 * the partially-constructed spa is torn down and removed from the
 * namespace.  spa_namespace_lock is held for the entire call.
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	/* Validate requested properties before building anything on disk. */
	if (props && (error = spa_prop_validate(spa, props))) {
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/* Default to the newest on-disk version unless one was requested. */
	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);

	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	/* A pool with no allocatable space is not a pool. */
	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	/*
	 * Initialize the new vdevs, validate the aux devices, and size
	 * each top-level vdev's metaslabs.
	 */
	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_metaslab_set_size(rvd->vdev_child[c]);
			vdev_expand(rvd->vdev_child[c], txg);
		}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	/* Tear down everything built so far on any failure above. */
	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/* Create the DSL pool (and with it the MOS) for this spa. */
	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	/*
	 * Create DDTs (dedup tables).
	 */
	ddt_create(spa);

	spa_update_dspace(spa);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	/*
	 * Failure to record MOS directory entries at creation time is
	 * unrecoverable, hence the panics below.
	 */
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	    sizeof (uint64_t), 1, &version, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset,
	    spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(spa, props, tx);
	}

	dmu_tx_commit(tx);

	/* Start the sync thread for the new pool. */
	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
	spa_history_log_version(spa, LOG_POOL_CREATE);

	/*
	 * Remember the baseline reference count so later code can tell
	 * when only internal references remain.
	 * NOTE(review): semantics inferred from the name spa_minref —
	 * confirm against its consumers.
	 */
	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
}
2905fa9e4066Sahrens 
2906e7cbe64fSgw #ifdef _KERNEL
2907e7cbe64fSgw /*
290821ecdf64SLin Ling  * Get the root pool information from the root disk, then import the root pool
290921ecdf64SLin Ling  * during the system boot up time.
2910e7cbe64fSgw  */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

/*
 * Read the vdev label from the boot device identified by 'devpath'/'devid'
 * and turn it into a full pool configuration: the label's vdev_tree (a
 * single top-level vdev) is wrapped in a newly built root vdev.  On
 * success the boot device's guid is stored through 'guid' and the caller
 * owns the returned nvlist; returns NULL if the label cannot be read.
 */
static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}
2951e7cbe64fSgw 
2952e7cbe64fSgw /*
295321ecdf64SLin Ling  * Walk the vdev tree and see if we can find a device with "better"
295421ecdf64SLin Ling  * configuration. A configuration is "better" if the label on that
295521ecdf64SLin Ling  * device has a more recent txg.
2956051aabe6Staylor  */
295721ecdf64SLin Ling static void
295821ecdf64SLin Ling spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
2959051aabe6Staylor {
2960573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++)
296121ecdf64SLin Ling 		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
2962051aabe6Staylor 
296321ecdf64SLin Ling 	if (vd->vdev_ops->vdev_op_leaf) {
296421ecdf64SLin Ling 		nvlist_t *label;
296521ecdf64SLin Ling 		uint64_t label_txg;
2966051aabe6Staylor 
296721ecdf64SLin Ling 		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
296821ecdf64SLin Ling 		    &label) != 0)
296921ecdf64SLin Ling 			return;
2970051aabe6Staylor 
297121ecdf64SLin Ling 		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
297221ecdf64SLin Ling 		    &label_txg) == 0);
2973051aabe6Staylor 
297421ecdf64SLin Ling 		/*
297521ecdf64SLin Ling 		 * Do we have a better boot device?
297621ecdf64SLin Ling 		 */
297721ecdf64SLin Ling 		if (label_txg > *txg) {
297821ecdf64SLin Ling 			*txg = label_txg;
297921ecdf64SLin Ling 			*avd = vd;
2980051aabe6Staylor 		}
298121ecdf64SLin Ling 		nvlist_free(label);
2982051aabe6Staylor 	}
2983051aabe6Staylor }
2984051aabe6Staylor 
/*
 * Import a root pool.
 *
 * For x86. devpath_list will consist of devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For Sparc, devpath_list consists the physpath name of the booting device
 * no matter the rootpool is a single device pool or a mirrored pool.
 * e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 *
 * Returns 0 on success; EIO if the boot device's label cannot be read;
 * ENOENT if the boot vdev is not in the label's vdev tree; EINVAL if a
 * device with a newer label exists or the boot device is a spared-out
 * spare; or an error from config parsing.  The temporary vdev tree built
 * here is freed on every path through the 'out' label.
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
	if (config == NULL) {
		if (strstr(devpath, "/iscsi/ssd") != NULL) {
			/* iscsi boot: fix up the physpath and retry */
			get_iscsi_bootpath_phy(devpath);
			config = spa_generate_rootconf(devpath, devid, &guid);
		}
	}
#endif
	if (config == NULL) {
		cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
		    devpath);
		return (EIO);
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	/* Insert the root pool with the label-derived config, verbatim. */
	spa = spa_add(pname, config, NULL);
	spa->spa_is_root = B_TRUE;
	spa->spa_load_verbatim = B_TRUE;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = ENOENT;
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = EINVAL;
		goto out;
	}

	/*
	 * If the boot device is part of a spare vdev then ensure that
	 * we're booting off the active spare.
	 */
	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    !bvd->vdev_isspare) {
		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
		    "try booting from '%s'",
		    bvd->vdev_parent->vdev_child[1]->vdev_path);
		error = EINVAL;
		goto out;
	}

	error = 0;
	spa_history_log_version(spa, LOG_POOL_IMPORT);
out:
	/* Free the temporary vdev tree on success and failure alike. */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_free(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	mutex_exit(&spa_namespace_lock);

	nvlist_free(config);
	return (error);
}
310621ecdf64SLin Ling 
3107e7cbe64fSgw #endif
3108e7cbe64fSgw 
3109e7cbe64fSgw /*
31106809eb4eSEric Schrock  * Take a pool and insert it into the namespace as if it had been loaded at
31116809eb4eSEric Schrock  * boot.
3112e7cbe64fSgw  */
3113e7cbe64fSgw int
31146809eb4eSEric Schrock spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
3115e7cbe64fSgw {
31166809eb4eSEric Schrock 	spa_t *spa;
31176809eb4eSEric Schrock 	char *altroot = NULL;
31186809eb4eSEric Schrock 
31196809eb4eSEric Schrock 	mutex_enter(&spa_namespace_lock);
31206809eb4eSEric Schrock 	if (spa_lookup(pool) != NULL) {
31216809eb4eSEric Schrock 		mutex_exit(&spa_namespace_lock);
31226809eb4eSEric Schrock 		return (EEXIST);
31236809eb4eSEric Schrock 	}
31246809eb4eSEric Schrock 
31256809eb4eSEric Schrock 	(void) nvlist_lookup_string(props,
31266809eb4eSEric Schrock 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3127468c413aSTim Haley 	spa = spa_add(pool, config, altroot);
31286809eb4eSEric Schrock 
3129468c413aSTim Haley 	spa->spa_load_verbatim = B_TRUE;
31306809eb4eSEric Schrock 
31316809eb4eSEric Schrock 	if (props != NULL)
31326809eb4eSEric Schrock 		spa_configfile_set(spa, props, B_FALSE);
31336809eb4eSEric Schrock 
31346809eb4eSEric Schrock 	spa_config_sync(spa, B_FALSE, B_TRUE);
31356809eb4eSEric Schrock 
31366809eb4eSEric Schrock 	mutex_exit(&spa_namespace_lock);
3137c8e1f6d2SMark J Musante 	spa_history_log_version(spa, LOG_POOL_IMPORT);
31386809eb4eSEric Schrock 
31396809eb4eSEric Schrock 	return (0);
3140e7cbe64fSgw }
3141e7cbe64fSgw 
/*
 * Import a non-root pool into the system.
 *
 * 'config' is the pool configuration to import (trusted over the on-disk
 * copy), and 'props' (may be NULL) supplies properties to apply.  If the
 * config carries a rewind policy requesting rewind, the pool is loaded in
 * recovery mode.  Returns 0 on success, EEXIST if the name is taken, or
 * an error from load/validation/property setting — in which case the spa
 * is fully unloaded and removed.  spa_namespace_lock is held throughout.
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
{
	spa_t *spa;
	char *altroot = NULL;
	spa_load_state_t state = SPA_LOAD_IMPORT;
	zpool_rewind_policy_t policy;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/* Honor a caller-requested rewind by switching to recovery mode. */
	zpool_get_rewind_policy(config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, config, altroot);
	spa_activate(spa, spa_mode_global);

	/*
	 * Don't start async tasks until we know everything is healthy.
	 */
	spa_async_suspend(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
	 * because the user-supplied config is actually the one to trust when
	 * doing an import.
	 */
	if (state != SPA_LOAD_RECOVER)
		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
	    policy.zrp_request);

	/*
	 * Propagate anything learned about failing or best txgs
	 * back to caller
	 */
	spa_rewind_data_to_nvlist(spa, config);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
		spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
		spa_load_l2cache(spa);
	}

	/* Validate the aux devices described in the supplied config. */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	/* On load/validation/property failure, tear the pool back down. */
	if (error != 0 || (props && spa_writeable(spa) &&
	    (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares and level 2 cache devices as specified by
	 * the user, as these may have correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_spares.sav_config)
			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		if (spa->spa_l2cache.sav_config)
			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * Check for any removed devices.
	 */
	if (spa->spa_autoreplace) {
		spa_aux_check_removed(&spa->spa_spares);
		spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
		/*
		 * Update the config cache to include the newly-imported pool.
		 */
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/* Everything is healthy; async tasks may run now. */
	spa_async_resume(spa);

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	spa_history_log_version(spa, LOG_POOL_IMPORT);

	return (0);
}
3300c5904d13Seschrock 
/*
 * Attempt a trial import of the pool described by 'tryconfig' without
 * committing anything to disk.  The pool is created under the reserved
 * name TRYIMPORT_NAME, activated read-only (FREAD), loaded, inspected,
 * and then completely torn down again before we return.
 *
 * Returns a freshly generated config nvlist describing the pool as found
 * on disk — augmented with the caller's pool name and state, the uberblock
 * timestamp, an optional bootfs name, and the spare/l2cache lists — or
 * NULL if 'tryconfig' lacks a pool name or pool state, or if the config
 * could not be parsed far enough to build a root vdev.
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	/* Without a pool name or state there is nothing we can try. */
	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.  Note the read-only
	 * (FREAD) activation: a try-import must never write to the devices.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				/*
				 * Splice the real pool name in front of the
				 * dataset component so the returned bootfs
				 * refers to 'poolname', not TRYIMPORT_NAME.
				 */
				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/* Tear the trial pool back down; only 'config' survives this call. */
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}
3390fa9e4066Sahrens 
/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
 * we don't sync the labels or remove the configuration cache.
 *
 * 'new_state' selects the operation: POOL_STATE_DESTROYED (destroy),
 * POOL_STATE_EXPORTED (export), or POOL_STATE_UNINITIALIZED (reset --
 * unload without removing the spa_t from the namespace).  If 'oldconfig'
 * is non-NULL, a duplicate of the pool's final config is returned through
 * it.  Returns 0 on success, or EROFS/ENOENT/EBUSY/EXDEV on failure.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	/* Export/destroy requires the module to be in writable mode. */
	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 * The temporary hold keeps the spa_t alive while the lock is dropped.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool. At user's own will, such pool can
		 * be forcedly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EXDEV);
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	/* Hand the caller a private copy of the final config, if requested. */
	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	/*
	 * For export/destroy (but not reset), drop the pool from the
	 * namespace; hardforce additionally skips the config cache sync.
	 */
	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}
3499fa9e4066Sahrens 
3500fa9e4066Sahrens /*
3501fa9e4066Sahrens  * Destroy a storage pool.
3502fa9e4066Sahrens  */
3503fa9e4066Sahrens int
3504fa9e4066Sahrens spa_destroy(char *pool)
3505fa9e4066Sahrens {
3506394ab0cbSGeorge Wilson 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
3507394ab0cbSGeorge Wilson 	    B_FALSE, B_FALSE));
3508fa9e4066Sahrens }
3509fa9e4066Sahrens 
3510fa9e4066Sahrens /*
3511fa9e4066Sahrens  * Export a storage pool.
3512fa9e4066Sahrens  */
3513fa9e4066Sahrens int
3514394ab0cbSGeorge Wilson spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
3515394ab0cbSGeorge Wilson     boolean_t hardforce)
3516fa9e4066Sahrens {
3517394ab0cbSGeorge Wilson 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
3518394ab0cbSGeorge Wilson 	    force, hardforce));
3519fa9e4066Sahrens }
3520fa9e4066Sahrens 
3521ea8dc4b6Seschrock /*
3522ea8dc4b6Seschrock  * Similar to spa_export(), this unloads the spa_t without actually removing it
3523ea8dc4b6Seschrock  * from the namespace in any way.
3524ea8dc4b6Seschrock  */
3525ea8dc4b6Seschrock int
3526ea8dc4b6Seschrock spa_reset(char *pool)
3527ea8dc4b6Seschrock {
352889a89ebfSlling 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
3529394ab0cbSGeorge Wilson 	    B_FALSE, B_FALSE));
3530ea8dc4b6Seschrock }
3531ea8dc4b6Seschrock 
3532fa9e4066Sahrens /*
3533fa9e4066Sahrens  * ==========================================================================
3534fa9e4066Sahrens  * Device manipulation
3535fa9e4066Sahrens  * ==========================================================================
3536fa9e4066Sahrens  */
3537fa9e4066Sahrens 
/*
 * Add a device to a storage pool.
 *
 * 'nvroot' describes one or more new top-level vdevs and/or spare and
 * l2cache devices.  On success the new top-level vdevs are grafted onto
 * the root vdev and the config cache is updated; returns 0, or an errno
 * via spa_vdev_exit() on failure.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	/* Missing spare/l2cache arrays simply mean none were requested. */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	/* Reject a request that adds nothing at all. */
	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists.
		 * (If no hole is found, 'id' ends up at rvd->vdev_children,
		 * i.e. the new vdev is appended.)
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	/* Merge any new aux devices and flag their lists for syncing. */
	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
3637fa9e4066Sahrens 
/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 *
 * 'guid' identifies the existing leaf vdev to attach to or replace.
 * Returns 0 on success; otherwise an errno via spa_vdev_exit().
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, dtl_max_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	txg = spa_vdev_enter(spa);

	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	/* Can only attach to/replace a leaf device. */
	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* The caller must supply exactly one new leaf device. */
	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.  The user is not allowed to
		 * attach to a spared vdev child unless the 'isspare' state is
		 * the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Make sure the new device is big enough.
	 */
	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		/* +5 covers the "/old" suffix plus the NUL terminator. */
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	newvd->vdev_crtxg = oldvd->vdev_crtxg;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
	 * for any dmu_sync-ed blocks.  It will propagate upward when
	 * spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	dtl_max_txg = txg + TXG_CONCURRENT_STATES;

	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	    dtl_max_txg - TXG_INITIAL);

	if (newvd->vdev_isspare) {
		spa_spare_activate(newvd);
		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
	}

	/* Save copies of the paths for the history log entry below. */
	oldvdpath = spa_strdup(oldvd->vdev_path);
	newvdpath = spa_strdup(newvd->vdev_path);
	newvd_isspare = newvd->vdev_isspare;

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	/*
	 * Restart the resilver
	 */
	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

	/*
	 * Commit the config
	 */
	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

	spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
	    "%s vdev=%s %s vdev=%s",
	    replacing && newvd_isspare ? "spare in" :
	    replacing ? "replace" : "attach", newvdpath,
	    replacing ? "for" : "to", oldvdpath);

	spa_strfree(oldvdpath);
	spa_strfree(newvdpath);

	return (0);
}
3834fa9e4066Sahrens 
3835fa9e4066Sahrens /*
3836fa9e4066Sahrens  * Detach a device from a mirror or replacing vdev.
3837fa9e4066Sahrens  * If 'replace_done' is specified, only detach if the parent
3838fa9e4066Sahrens  * is a replacing vdev.
3839fa9e4066Sahrens  */
3840fa9e4066Sahrens int
38418ad4d6ddSJeff Bonwick spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
3842fa9e4066Sahrens {
3843fa9e4066Sahrens 	uint64_t txg;
38448ad4d6ddSJeff Bonwick 	int error;
3845fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
3846fa9e4066Sahrens 	vdev_t *vd, *pvd, *cvd, *tvd;
384799653d4eSeschrock 	boolean_t unspare = B_FALSE;
384899653d4eSeschrock 	uint64_t unspare_guid;
3849bf82a41bSeschrock 	size_t len;
38501195e687SMark J Musante 	char *vdpath;
3851fa9e4066Sahrens 
3852fa9e4066Sahrens 	txg = spa_vdev_enter(spa);
3853fa9e4066Sahrens 
3854c5904d13Seschrock 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3855fa9e4066Sahrens 
3856fa9e4066Sahrens 	if (vd == NULL)
3857fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3858fa9e4066Sahrens 
38590e34b6a7Sbonwick 	if (!vd->vdev_ops->vdev_op_leaf)
38600e34b6a7Sbonwick 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
38610e34b6a7Sbonwick 
3862fa9e4066Sahrens 	pvd = vd->vdev_parent;
3863fa9e4066Sahrens 
38648ad4d6ddSJeff Bonwick 	/*
38658ad4d6ddSJeff Bonwick 	 * If the parent/child relationship is not as expected, don't do it.
38668ad4d6ddSJeff Bonwick 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
38678ad4d6ddSJeff Bonwick 	 * vdev that's replacing B with C.  The user's intent in replacing
38688ad4d6ddSJeff Bonwick 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
38698ad4d6ddSJeff Bonwick 	 * the replace by detaching C, the expected behavior is to end up
38708ad4d6ddSJeff Bonwick 	 * M(A,B).  But suppose that right after deciding to detach C,
38718ad4d6ddSJeff Bonwick 	 * the replacement of B completes.  We would have M(A,C), and then
38728ad4d6ddSJeff Bonwick 	 * ask to detach C, which would leave us with just A -- not what
38738ad4d6ddSJeff Bonwick 	 * the user wanted.  To prevent this, we make sure that the
38748ad4d6ddSJeff Bonwick 	 * parent/child relationship hasn't changed -- in this example,
38758ad4d6ddSJeff Bonwick 	 * that C's parent is still the replacing vdev R.
38768ad4d6ddSJeff Bonwick 	 */
38778ad4d6ddSJeff Bonwick 	if (pvd->vdev_guid != pguid && pguid != 0)
38788ad4d6ddSJeff Bonwick 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
38798ad4d6ddSJeff Bonwick 
3880fa9e4066Sahrens 	/*
3881fa9e4066Sahrens 	 * If replace_done is specified, only remove this device if it's
388299653d4eSeschrock 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
388399653d4eSeschrock 	 * disk can be removed.
388499653d4eSeschrock 	 */
388599653d4eSeschrock 	if (replace_done) {
388699653d4eSeschrock 		if (pvd->vdev_ops == &vdev_replacing_ops) {
388799653d4eSeschrock 			if (vd->vdev_id != 0)
388899653d4eSeschrock 				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
388999653d4eSeschrock 		} else if (pvd->vdev_ops != &vdev_spare_ops) {
389099653d4eSeschrock 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
389199653d4eSeschrock 		}
389299653d4eSeschrock 	}
389399653d4eSeschrock 
389499653d4eSeschrock 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
3895e7437265Sahrens 	    spa_version(spa) >= SPA_VERSION_SPARES);
3896fa9e4066Sahrens 
3897fa9e4066Sahrens 	/*
389899653d4eSeschrock 	 * Only mirror, replacing, and spare vdevs support detach.
3899fa9e4066Sahrens 	 */
3900fa9e4066Sahrens 	if (pvd->vdev_ops != &vdev_replacing_ops &&
390199653d4eSeschrock 	    pvd->vdev_ops != &vdev_mirror_ops &&
390299653d4eSeschrock 	    pvd->vdev_ops != &vdev_spare_ops)
3903fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3904fa9e4066Sahrens 
3905fa9e4066Sahrens 	/*
39068ad4d6ddSJeff Bonwick 	 * If this device has the only valid copy of some data,
39078ad4d6ddSJeff Bonwick 	 * we cannot safely detach it.
3908fa9e4066Sahrens 	 */
39098ad4d6ddSJeff Bonwick 	if (vdev_dtl_required(vd))
3910fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
3911fa9e4066Sahrens 
39128ad4d6ddSJeff Bonwick 	ASSERT(pvd->vdev_children >= 2);
3913fa9e4066Sahrens 
3914bf82a41bSeschrock 	/*
3915bf82a41bSeschrock 	 * If we are detaching the second disk from a replacing vdev, then
3916bf82a41bSeschrock 	 * check to see if we changed the original vdev's path to have "/old"
3917bf82a41bSeschrock 	 * at the end in spa_vdev_attach().  If so, undo that change now.
3918bf82a41bSeschrock 	 */
3919bf82a41bSeschrock 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
3920bf82a41bSeschrock 	    pvd->vdev_child[0]->vdev_path != NULL &&
3921bf82a41bSeschrock 	    pvd->vdev_child[1]->vdev_path != NULL) {
3922bf82a41bSeschrock 		ASSERT(pvd->vdev_child[1] == vd);
3923bf82a41bSeschrock 		cvd = pvd->vdev_child[0];
3924bf82a41bSeschrock 		len = strlen(vd->vdev_path);
3925bf82a41bSeschrock 		if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
3926bf82a41bSeschrock 		    strcmp(cvd->vdev_path + len, "/old") == 0) {
3927bf82a41bSeschrock 			spa_strfree(cvd->vdev_path);
3928bf82a41bSeschrock 			cvd->vdev_path = spa_strdup(vd->vdev_path);
3929bf82a41bSeschrock 		}
3930bf82a41bSeschrock 	}
3931bf82a41bSeschrock 
393299653d4eSeschrock 	/*
393399653d4eSeschrock 	 * If we are detaching the original disk from a spare, then it implies
393499653d4eSeschrock 	 * that the spare should become a real disk, and be removed from the
393599653d4eSeschrock 	 * active spare list for the pool.
393699653d4eSeschrock 	 */
393799653d4eSeschrock 	if (pvd->vdev_ops == &vdev_spare_ops &&
39388ad4d6ddSJeff Bonwick 	    vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
393999653d4eSeschrock 		unspare = B_TRUE;
394099653d4eSeschrock 
3941fa9e4066Sahrens 	/*
3942fa9e4066Sahrens 	 * Erase the disk labels so the disk can be used for other things.
3943fa9e4066Sahrens 	 * This must be done after all other error cases are handled,
3944fa9e4066Sahrens 	 * but before we disembowel vd (so we can still do I/O to it).
3945fa9e4066Sahrens 	 * But if we can't do it, don't treat the error as fatal --
3946fa9e4066Sahrens 	 * it may be that the unwritability of the disk is the reason
3947fa9e4066Sahrens 	 * it's being detached!
3948fa9e4066Sahrens 	 */
394939c23413Seschrock 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
3950fa9e4066Sahrens 
3951fa9e4066Sahrens 	/*
3952fa9e4066Sahrens 	 * Remove vd from its parent and compact the parent's children.
3953fa9e4066Sahrens 	 */
3954fa9e4066Sahrens 	vdev_remove_child(pvd, vd);
3955fa9e4066Sahrens 	vdev_compact_children(pvd);
3956fa9e4066Sahrens 
3957fa9e4066Sahrens 	/*
3958fa9e4066Sahrens 	 * Remember one of the remaining children so we can get tvd below.
3959fa9e4066Sahrens 	 */
3960fa9e4066Sahrens 	cvd = pvd->vdev_child[0];
3961fa9e4066Sahrens 
396299653d4eSeschrock 	/*
396399653d4eSeschrock 	 * If we need to remove the remaining child from the list of hot spares,
39648ad4d6ddSJeff Bonwick 	 * do it now, marking the vdev as no longer a spare in the process.
39658ad4d6ddSJeff Bonwick 	 * We must do this before vdev_remove_parent(), because that can
39668ad4d6ddSJeff Bonwick 	 * change the GUID if it creates a new toplevel GUID.  For a similar
39678ad4d6ddSJeff Bonwick 	 * reason, we must remove the spare now, in the same txg as the detach;
39688ad4d6ddSJeff Bonwick 	 * otherwise someone could attach a new sibling, change the GUID, and
39698ad4d6ddSJeff Bonwick 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
397099653d4eSeschrock 	 */
397199653d4eSeschrock 	if (unspare) {
397299653d4eSeschrock 		ASSERT(cvd->vdev_isspare);
397339c23413Seschrock 		spa_spare_remove(cvd);
397499653d4eSeschrock 		unspare_guid = cvd->vdev_guid;
39758ad4d6ddSJeff Bonwick 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
397699653d4eSeschrock 	}
397799653d4eSeschrock 
3978fa9e4066Sahrens 	/*
3979fa9e4066Sahrens 	 * If the parent mirror/replacing vdev only has one child,
3980fa9e4066Sahrens 	 * the parent is no longer needed.  Remove it from the tree.
3981fa9e4066Sahrens 	 */
3982fa9e4066Sahrens 	if (pvd->vdev_children == 1)
3983fa9e4066Sahrens 		vdev_remove_parent(cvd);
3984fa9e4066Sahrens 
3985fa9e4066Sahrens 	/*
3986fa9e4066Sahrens 	 * We don't set tvd until now because the parent we just removed
3987fa9e4066Sahrens 	 * may have been the previous top-level vdev.
3988fa9e4066Sahrens 	 */
3989fa9e4066Sahrens 	tvd = cvd->vdev_top;
3990fa9e4066Sahrens 	ASSERT(tvd->vdev_parent == rvd);
3991fa9e4066Sahrens 
3992fa9e4066Sahrens 	/*
399339c23413Seschrock 	 * Reevaluate the parent vdev state.
3994fa9e4066Sahrens 	 */
39953d7072f8Seschrock 	vdev_propagate_state(cvd);
3996fa9e4066Sahrens 
3997fa9e4066Sahrens 	/*
3998573ca77eSGeorge Wilson 	 * If the 'autoexpand' property is set on the pool then automatically
3999573ca77eSGeorge Wilson 	 * try to expand the size of the pool. For example if the device we
4000573ca77eSGeorge Wilson 	 * just detached was smaller than the others, it may be possible to
4001573ca77eSGeorge Wilson 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4002573ca77eSGeorge Wilson 	 * first so that we can obtain the updated sizes of the leaf vdevs.
4003fa9e4066Sahrens 	 */
4004573ca77eSGeorge Wilson 	if (spa->spa_autoexpand) {
4005573ca77eSGeorge Wilson 		vdev_reopen(tvd);
4006573ca77eSGeorge Wilson 		vdev_expand(tvd, txg);
4007573ca77eSGeorge Wilson 	}
4008fa9e4066Sahrens 
4009fa9e4066Sahrens 	vdev_config_dirty(tvd);
4010fa9e4066Sahrens 
4011fa9e4066Sahrens 	/*
401239c23413Seschrock 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
401339c23413Seschrock 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
401439c23413Seschrock 	 * But first make sure we're not on any *other* txg's DTL list, to
401539c23413Seschrock 	 * prevent vd from being accessed after it's freed.
4016fa9e4066Sahrens 	 */
40171195e687SMark J Musante 	vdpath = spa_strdup(vd->vdev_path);
40188ad4d6ddSJeff Bonwick 	for (int t = 0; t < TXG_SIZE; t++)
4019fa9e4066Sahrens 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4020ecc2d604Sbonwick 	vd->vdev_detached = B_TRUE;
4021ecc2d604Sbonwick 	vdev_dirty(tvd, VDD_DTL, vd, txg);
4022fa9e4066Sahrens 
40233d7072f8Seschrock 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
40243d7072f8Seschrock 
402599653d4eSeschrock 	error = spa_vdev_exit(spa, vd, txg, 0);
402699653d4eSeschrock 
4027*3f9d6ad7SLin Ling 	spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
40281195e687SMark J Musante 	    "vdev=%s", vdpath);
40291195e687SMark J Musante 	spa_strfree(vdpath);
40301195e687SMark J Musante 
403199653d4eSeschrock 	/*
403239c23413Seschrock 	 * If this was the removal of the original device in a hot spare vdev,
403339c23413Seschrock 	 * then we want to go through and remove the device from the hot spare
403439c23413Seschrock 	 * list of every other pool.
403599653d4eSeschrock 	 */
403699653d4eSeschrock 	if (unspare) {
40378ad4d6ddSJeff Bonwick 		spa_t *myspa = spa;
403899653d4eSeschrock 		spa = NULL;
403999653d4eSeschrock 		mutex_enter(&spa_namespace_lock);
404099653d4eSeschrock 		while ((spa = spa_next(spa)) != NULL) {
404199653d4eSeschrock 			if (spa->spa_state != POOL_STATE_ACTIVE)
404299653d4eSeschrock 				continue;
40438ad4d6ddSJeff Bonwick 			if (spa == myspa)
40448ad4d6ddSJeff Bonwick 				continue;
40459af0a4dfSJeff Bonwick 			spa_open_ref(spa, FTAG);
40469af0a4dfSJeff Bonwick 			mutex_exit(&spa_namespace_lock);
4047*3f9d6ad7SLin Ling 			(void) spa_vdev_remove(spa, unspare_guid,
4048*3f9d6ad7SLin Ling 			    B_TRUE);
40499af0a4dfSJeff Bonwick 			mutex_enter(&spa_namespace_lock);
40509af0a4dfSJeff Bonwick 			spa_close(spa, FTAG);
405199653d4eSeschrock 		}
405299653d4eSeschrock 		mutex_exit(&spa_namespace_lock);
405399653d4eSeschrock 	}
405499653d4eSeschrock 
405599653d4eSeschrock 	return (error);
405699653d4eSeschrock }
405799653d4eSeschrock 
40581195e687SMark J Musante /*
40591195e687SMark J Musante  * Split a set of devices from their mirrors, and create a new pool from them.
40601195e687SMark J Musante  */
40611195e687SMark J Musante int
40621195e687SMark J Musante spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
40631195e687SMark J Musante     nvlist_t *props, boolean_t exp)
40641195e687SMark J Musante {
40651195e687SMark J Musante 	int error = 0;
40661195e687SMark J Musante 	uint64_t txg, *glist;
40671195e687SMark J Musante 	spa_t *newspa;
40681195e687SMark J Musante 	uint_t c, children, lastlog;
40691195e687SMark J Musante 	nvlist_t **child, *nvl, *tmp;
40701195e687SMark J Musante 	dmu_tx_t *tx;
40711195e687SMark J Musante 	char *altroot = NULL;
40721195e687SMark J Musante 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
40731195e687SMark J Musante 	boolean_t activate_slog;
40741195e687SMark J Musante 
40751195e687SMark J Musante 	if (!spa_writeable(spa))
40761195e687SMark J Musante 		return (EROFS);
40771195e687SMark J Musante 
40781195e687SMark J Musante 	txg = spa_vdev_enter(spa);
40791195e687SMark J Musante 
40801195e687SMark J Musante 	/* clear the log and flush everything up to now */
40811195e687SMark J Musante 	activate_slog = spa_passivate_log(spa);
40821195e687SMark J Musante 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
40831195e687SMark J Musante 	error = spa_offline_log(spa);
40841195e687SMark J Musante 	txg = spa_vdev_config_enter(spa);
40851195e687SMark J Musante 
40861195e687SMark J Musante 	if (activate_slog)
40871195e687SMark J Musante 		spa_activate_log(spa);
40881195e687SMark J Musante 
40891195e687SMark J Musante 	if (error != 0)
40901195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, error));
40911195e687SMark J Musante 
40921195e687SMark J Musante 	/* check new spa name before going any further */
40931195e687SMark J Musante 	if (spa_lookup(newname) != NULL)
40941195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
40951195e687SMark J Musante 
40961195e687SMark J Musante 	/*
40971195e687SMark J Musante 	 * scan through all the children to ensure they're all mirrors
40981195e687SMark J Musante 	 */
40991195e687SMark J Musante 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
41001195e687SMark J Musante 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
41011195e687SMark J Musante 	    &children) != 0)
41021195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
41031195e687SMark J Musante 
41041195e687SMark J Musante 	/* first, check to ensure we've got the right child count */
41051195e687SMark J Musante 	rvd = spa->spa_root_vdev;
41061195e687SMark J Musante 	lastlog = 0;
41071195e687SMark J Musante 	for (c = 0; c < rvd->vdev_children; c++) {
41081195e687SMark J Musante 		vdev_t *vd = rvd->vdev_child[c];
41091195e687SMark J Musante 
41101195e687SMark J Musante 		/* don't count the holes & logs as children */
41111195e687SMark J Musante 		if (vd->vdev_islog || vd->vdev_ishole) {
41121195e687SMark J Musante 			if (lastlog == 0)
41131195e687SMark J Musante 				lastlog = c;
41141195e687SMark J Musante 			continue;
41151195e687SMark J Musante 		}
41161195e687SMark J Musante 
41171195e687SMark J Musante 		lastlog = 0;
41181195e687SMark J Musante 	}
41191195e687SMark J Musante 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
41201195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
41211195e687SMark J Musante 
41221195e687SMark J Musante 	/* next, ensure no spare or cache devices are part of the split */
41231195e687SMark J Musante 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
41241195e687SMark J Musante 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
41251195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
41261195e687SMark J Musante 
41271195e687SMark J Musante 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
41281195e687SMark J Musante 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
41291195e687SMark J Musante 
41301195e687SMark J Musante 	/* then, loop over each vdev and validate it */
41311195e687SMark J Musante 	for (c = 0; c < children; c++) {
41321195e687SMark J Musante 		uint64_t is_hole = 0;
41331195e687SMark J Musante 
41341195e687SMark J Musante 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
41351195e687SMark J Musante 		    &is_hole);
41361195e687SMark J Musante 
41371195e687SMark J Musante 		if (is_hole != 0) {
41381195e687SMark J Musante 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
41391195e687SMark J Musante 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
41401195e687SMark J Musante 				continue;
41411195e687SMark J Musante 			} else {
41421195e687SMark J Musante 				error = EINVAL;
41431195e687SMark J Musante 				break;
41441195e687SMark J Musante 			}
41451195e687SMark J Musante 		}
41461195e687SMark J Musante 
41471195e687SMark J Musante 		/* which disk is going to be split? */
41481195e687SMark J Musante 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
41491195e687SMark J Musante 		    &glist[c]) != 0) {
41501195e687SMark J Musante 			error = EINVAL;
41511195e687SMark J Musante 			break;
41521195e687SMark J Musante 		}
41531195e687SMark J Musante 
41541195e687SMark J Musante 		/* look it up in the spa */
41551195e687SMark J Musante 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
41561195e687SMark J Musante 		if (vml[c] == NULL) {
41571195e687SMark J Musante 			error = ENODEV;
41581195e687SMark J Musante 			break;
41591195e687SMark J Musante 		}
41601195e687SMark J Musante 
41611195e687SMark J Musante 		/* make sure there's nothing stopping the split */
41621195e687SMark J Musante 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
41631195e687SMark J Musante 		    vml[c]->vdev_islog ||
41641195e687SMark J Musante 		    vml[c]->vdev_ishole ||
41651195e687SMark J Musante 		    vml[c]->vdev_isspare ||
41661195e687SMark J Musante 		    vml[c]->vdev_isl2cache ||
41671195e687SMark J Musante 		    !vdev_writeable(vml[c]) ||
4168d41c4376SMark J Musante 		    vml[c]->vdev_children != 0 ||
41691195e687SMark J Musante 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
41701195e687SMark J Musante 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
41711195e687SMark J Musante 			error = EINVAL;
41721195e687SMark J Musante 			break;
41731195e687SMark J Musante 		}
41741195e687SMark J Musante 
41751195e687SMark J Musante 		if (vdev_dtl_required(vml[c])) {
41761195e687SMark J Musante 			error = EBUSY;
41771195e687SMark J Musante 			break;
41781195e687SMark J Musante 		}
41791195e687SMark J Musante 
41801195e687SMark J Musante 		/* we need certain info from the top level */
41811195e687SMark J Musante 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
41821195e687SMark J Musante 		    vml[c]->vdev_top->vdev_ms_array) == 0);
41831195e687SMark J Musante 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
41841195e687SMark J Musante 		    vml[c]->vdev_top->vdev_ms_shift) == 0);
41851195e687SMark J Musante 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
41861195e687SMark J Musante 		    vml[c]->vdev_top->vdev_asize) == 0);
41871195e687SMark J Musante 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
41881195e687SMark J Musante 		    vml[c]->vdev_top->vdev_ashift) == 0);
41891195e687SMark J Musante 	}
41901195e687SMark J Musante 
41911195e687SMark J Musante 	if (error != 0) {
41921195e687SMark J Musante 		kmem_free(vml, children * sizeof (vdev_t *));
41931195e687SMark J Musante 		kmem_free(glist, children * sizeof (uint64_t));
41941195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, error));
41951195e687SMark J Musante 	}
41961195e687SMark J Musante 
41971195e687SMark J Musante 	/* stop writers from using the disks */
41981195e687SMark J Musante 	for (c = 0; c < children; c++) {
41991195e687SMark J Musante 		if (vml[c] != NULL)
42001195e687SMark J Musante 			vml[c]->vdev_offline = B_TRUE;
42011195e687SMark J Musante 	}
42021195e687SMark J Musante 	vdev_reopen(spa->spa_root_vdev);
42031195e687SMark J Musante 
42041195e687SMark J Musante 	/*
42051195e687SMark J Musante 	 * Temporarily record the splitting vdevs in the spa config.  This
42061195e687SMark J Musante 	 * will disappear once the config is regenerated.
42071195e687SMark J Musante 	 */
42081195e687SMark J Musante 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
42091195e687SMark J Musante 	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
42101195e687SMark J Musante 	    glist, children) == 0);
42111195e687SMark J Musante 	kmem_free(glist, children * sizeof (uint64_t));
42121195e687SMark J Musante 
421398295d61SMark J Musante 	mutex_enter(&spa->spa_props_lock);
42141195e687SMark J Musante 	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
42151195e687SMark J Musante 	    nvl) == 0);
421698295d61SMark J Musante 	mutex_exit(&spa->spa_props_lock);
42171195e687SMark J Musante 	spa->spa_config_splitting = nvl;
42181195e687SMark J Musante 	vdev_config_dirty(spa->spa_root_vdev);
42191195e687SMark J Musante 
42201195e687SMark J Musante 	/* configure and create the new pool */
42211195e687SMark J Musante 	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
42221195e687SMark J Musante 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
42231195e687SMark J Musante 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
42241195e687SMark J Musante 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
42251195e687SMark J Musante 	    spa_version(spa)) == 0);
42261195e687SMark J Musante 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
42271195e687SMark J Musante 	    spa->spa_config_txg) == 0);
42281195e687SMark J Musante 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
42291195e687SMark J Musante 	    spa_generate_guid(NULL)) == 0);
42301195e687SMark J Musante 	(void) nvlist_lookup_string(props,
42311195e687SMark J Musante 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
42321195e687SMark J Musante 
4233d41c4376SMark J Musante 	/* add the new pool to the namespace */
42341195e687SMark J Musante 	newspa = spa_add(newname, config, altroot);
42351195e687SMark J Musante 	newspa->spa_config_txg = spa->spa_config_txg;
42361195e687SMark J Musante 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
42371195e687SMark J Musante 
42381195e687SMark J Musante 	/* release the spa config lock, retaining the namespace lock */
42391195e687SMark J Musante 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
42401195e687SMark J Musante 
42411195e687SMark J Musante 	if (zio_injection_enabled)
42421195e687SMark J Musante 		zio_handle_panic_injection(spa, FTAG, 1);
42431195e687SMark J Musante 
42441195e687SMark J Musante 	spa_activate(newspa, spa_mode_global);
42451195e687SMark J Musante 	spa_async_suspend(newspa);
42461195e687SMark J Musante 
42471195e687SMark J Musante 	/* create the new pool from the disks of the original pool */
42481195e687SMark J Musante 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
42491195e687SMark J Musante 	if (error)
42501195e687SMark J Musante 		goto out;
42511195e687SMark J Musante 
42521195e687SMark J Musante 	/* if that worked, generate a real config for the new pool */
42531195e687SMark J Musante 	if (newspa->spa_root_vdev != NULL) {
42541195e687SMark J Musante 		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
42551195e687SMark J Musante 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
42561195e687SMark J Musante 		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
42571195e687SMark J Musante 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
42581195e687SMark J Musante 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
42591195e687SMark J Musante 		    B_TRUE));
42601195e687SMark J Musante 	}
42611195e687SMark J Musante 
42621195e687SMark J Musante 	/* set the props */
42631195e687SMark J Musante 	if (props != NULL) {
42641195e687SMark J Musante 		spa_configfile_set(newspa, props, B_FALSE);
42651195e687SMark J Musante 		error = spa_prop_set(newspa, props);
42661195e687SMark J Musante 		if (error)
42671195e687SMark J Musante 			goto out;
42681195e687SMark J Musante 	}
42691195e687SMark J Musante 
42701195e687SMark J Musante 	/* flush everything */
42711195e687SMark J Musante 	txg = spa_vdev_config_enter(newspa);
42721195e687SMark J Musante 	vdev_config_dirty(newspa->spa_root_vdev);
42731195e687SMark J Musante 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
42741195e687SMark J Musante 
42751195e687SMark J Musante 	if (zio_injection_enabled)
42761195e687SMark J Musante 		zio_handle_panic_injection(spa, FTAG, 2);
42771195e687SMark J Musante 
42781195e687SMark J Musante 	spa_async_resume(newspa);
42791195e687SMark J Musante 
42801195e687SMark J Musante 	/* finally, update the original pool's config */
42811195e687SMark J Musante 	txg = spa_vdev_config_enter(spa);
42821195e687SMark J Musante 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
42831195e687SMark J Musante 	error = dmu_tx_assign(tx, TXG_WAIT);
42841195e687SMark J Musante 	if (error != 0)
42851195e687SMark J Musante 		dmu_tx_abort(tx);
42861195e687SMark J Musante 	for (c = 0; c < children; c++) {
42871195e687SMark J Musante 		if (vml[c] != NULL) {
42881195e687SMark J Musante 			vdev_split(vml[c]);
42891195e687SMark J Musante 			if (error == 0)
4290*3f9d6ad7SLin Ling 				spa_history_log_internal(LOG_POOL_VDEV_DETACH,
4291*3f9d6ad7SLin Ling 				    spa, tx, "vdev=%s",
42921195e687SMark J Musante 				    vml[c]->vdev_path);
42931195e687SMark J Musante 			vdev_free(vml[c]);
42941195e687SMark J Musante 		}
42951195e687SMark J Musante 	}
42961195e687SMark J Musante 	vdev_config_dirty(spa->spa_root_vdev);
42971195e687SMark J Musante 	spa->spa_config_splitting = NULL;
42981195e687SMark J Musante 	nvlist_free(nvl);
42991195e687SMark J Musante 	if (error == 0)
43001195e687SMark J Musante 		dmu_tx_commit(tx);
43011195e687SMark J Musante 	(void) spa_vdev_exit(spa, NULL, txg, 0);
43021195e687SMark J Musante 
43031195e687SMark J Musante 	if (zio_injection_enabled)
43041195e687SMark J Musante 		zio_handle_panic_injection(spa, FTAG, 3);
43051195e687SMark J Musante 
43061195e687SMark J Musante 	/* split is complete; log a history record */
4307*3f9d6ad7SLin Ling 	spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
43081195e687SMark J Musante 	    "split new pool %s from pool %s", newname, spa_name(spa));
43091195e687SMark J Musante 
43101195e687SMark J Musante 	kmem_free(vml, children * sizeof (vdev_t *));
43111195e687SMark J Musante 
43121195e687SMark J Musante 	/* if we're not going to mount the filesystems in userland, export */
43131195e687SMark J Musante 	if (exp)
43141195e687SMark J Musante 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
43151195e687SMark J Musante 		    B_FALSE, B_FALSE);
43161195e687SMark J Musante 
43171195e687SMark J Musante 	return (error);
43181195e687SMark J Musante 
43191195e687SMark J Musante out:
43201195e687SMark J Musante 	spa_unload(newspa);
43211195e687SMark J Musante 	spa_deactivate(newspa);
43221195e687SMark J Musante 	spa_remove(newspa);
43231195e687SMark J Musante 
43241195e687SMark J Musante 	txg = spa_vdev_config_enter(spa);
432598295d61SMark J Musante 
432698295d61SMark J Musante 	/* re-online all offlined disks */
432798295d61SMark J Musante 	for (c = 0; c < children; c++) {
432898295d61SMark J Musante 		if (vml[c] != NULL)
432998295d61SMark J Musante 			vml[c]->vdev_offline = B_FALSE;
433098295d61SMark J Musante 	}
433198295d61SMark J Musante 	vdev_reopen(spa->spa_root_vdev);
433298295d61SMark J Musante 
43331195e687SMark J Musante 	nvlist_free(spa->spa_config_splitting);
43341195e687SMark J Musante 	spa->spa_config_splitting = NULL;
4335d41c4376SMark J Musante 	(void) spa_vdev_exit(spa, NULL, txg, error);
43361195e687SMark J Musante 
43371195e687SMark J Musante 	kmem_free(vml, children * sizeof (vdev_t *));
43381195e687SMark J Musante 	return (error);
43391195e687SMark J Musante }
43401195e687SMark J Musante 
4341e14bb325SJeff Bonwick static nvlist_t *
4342e14bb325SJeff Bonwick spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
434399653d4eSeschrock {
4344e14bb325SJeff Bonwick 	for (int i = 0; i < count; i++) {
4345e14bb325SJeff Bonwick 		uint64_t guid;
434699653d4eSeschrock 
4347e14bb325SJeff Bonwick 		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
4348e14bb325SJeff Bonwick 		    &guid) == 0);
434999653d4eSeschrock 
4350e14bb325SJeff Bonwick 		if (guid == target_guid)
4351e14bb325SJeff Bonwick 			return (nvpp[i]);
435299653d4eSeschrock 	}
435399653d4eSeschrock 
4354e14bb325SJeff Bonwick 	return (NULL);
4355fa94a07fSbrendan }
4356fa94a07fSbrendan 
4357e14bb325SJeff Bonwick static void
4358e14bb325SJeff Bonwick spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
4359e14bb325SJeff Bonwick 	nvlist_t *dev_to_remove)
4360fa94a07fSbrendan {
4361e14bb325SJeff Bonwick 	nvlist_t **newdev = NULL;
4362fa94a07fSbrendan 
4363e14bb325SJeff Bonwick 	if (count > 1)
4364e14bb325SJeff Bonwick 		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
4365fa94a07fSbrendan 
4366e14bb325SJeff Bonwick 	for (int i = 0, j = 0; i < count; i++) {
4367e14bb325SJeff Bonwick 		if (dev[i] == dev_to_remove)
4368e14bb325SJeff Bonwick 			continue;
4369e14bb325SJeff Bonwick 		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
4370fa94a07fSbrendan 	}
4371fa94a07fSbrendan 
4372e14bb325SJeff Bonwick 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
4373e14bb325SJeff Bonwick 	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
4374fa94a07fSbrendan 
4375e14bb325SJeff Bonwick 	for (int i = 0; i < count - 1; i++)
4376e14bb325SJeff Bonwick 		nvlist_free(newdev[i]);
4377fa94a07fSbrendan 
4378e14bb325SJeff Bonwick 	if (count > 1)
4379e14bb325SJeff Bonwick 		kmem_free(newdev, (count - 1) * sizeof (void *));
4380fa94a07fSbrendan }
4381fa94a07fSbrendan 
438288ecc943SGeorge Wilson /*
438388ecc943SGeorge Wilson  * Evacuate the device.
438488ecc943SGeorge Wilson  */
4385*3f9d6ad7SLin Ling static int
438688ecc943SGeorge Wilson spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
438788ecc943SGeorge Wilson {
438888ecc943SGeorge Wilson 	uint64_t txg;
4389*3f9d6ad7SLin Ling 	int error = 0;
439088ecc943SGeorge Wilson 
439188ecc943SGeorge Wilson 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
439288ecc943SGeorge Wilson 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
4393b24ab676SJeff Bonwick 	ASSERT(vd == vd->vdev_top);
439488ecc943SGeorge Wilson 
439588ecc943SGeorge Wilson 	/*
439688ecc943SGeorge Wilson 	 * Evacuate the device.  We don't hold the config lock as writer
439788ecc943SGeorge Wilson 	 * since we need to do I/O but we do keep the
439888ecc943SGeorge Wilson 	 * spa_namespace_lock held.  Once this completes the device
439988ecc943SGeorge Wilson 	 * should no longer have any blocks allocated on it.
440088ecc943SGeorge Wilson 	 */
440188ecc943SGeorge Wilson 	if (vd->vdev_islog) {
4402*3f9d6ad7SLin Ling 		if (vd->vdev_stat.vs_alloc != 0)
4403*3f9d6ad7SLin Ling 			error = spa_offline_log(spa);
4404a1521560SJeff Bonwick 	} else {
4405*3f9d6ad7SLin Ling 		error = ENOTSUP;
440688ecc943SGeorge Wilson 	}
440788ecc943SGeorge Wilson 
4408a1521560SJeff Bonwick 	if (error)
4409a1521560SJeff Bonwick 		return (error);
4410a1521560SJeff Bonwick 
441188ecc943SGeorge Wilson 	/*
4412a1521560SJeff Bonwick 	 * The evacuation succeeded.  Remove any remaining MOS metadata
4413a1521560SJeff Bonwick 	 * associated with this vdev, and wait for these changes to sync.
441488ecc943SGeorge Wilson 	 */
4415*3f9d6ad7SLin Ling 	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
441688ecc943SGeorge Wilson 	txg = spa_vdev_config_enter(spa);
441788ecc943SGeorge Wilson 	vd->vdev_removing = B_TRUE;
441888ecc943SGeorge Wilson 	vdev_dirty(vd, 0, NULL, txg);
441988ecc943SGeorge Wilson 	vdev_config_dirty(vd);
442088ecc943SGeorge Wilson 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
442188ecc943SGeorge Wilson 
442288ecc943SGeorge Wilson 	return (0);
442388ecc943SGeorge Wilson }
442488ecc943SGeorge Wilson 
442588ecc943SGeorge Wilson /*
442688ecc943SGeorge Wilson  * Complete the removal by cleaning up the namespace.
442788ecc943SGeorge Wilson  */
4428*3f9d6ad7SLin Ling static void
4429a1521560SJeff Bonwick spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
443088ecc943SGeorge Wilson {
443188ecc943SGeorge Wilson 	vdev_t *rvd = spa->spa_root_vdev;
443288ecc943SGeorge Wilson 	uint64_t id = vd->vdev_id;
443388ecc943SGeorge Wilson 	boolean_t last_vdev = (id == (rvd->vdev_children - 1));
443488ecc943SGeorge Wilson 
443588ecc943SGeorge Wilson 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
443688ecc943SGeorge Wilson 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4437b24ab676SJeff Bonwick 	ASSERT(vd == vd->vdev_top);
443888ecc943SGeorge Wilson 
4439*3f9d6ad7SLin Ling 	/*
4440*3f9d6ad7SLin Ling 	 * Only remove any devices which are empty.
4441*3f9d6ad7SLin Ling 	 */
4442*3f9d6ad7SLin Ling 	if (vd->vdev_stat.vs_alloc != 0)
4443*3f9d6ad7SLin Ling 		return;
4444*3f9d6ad7SLin Ling 
444588ecc943SGeorge Wilson 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4446b24ab676SJeff Bonwick 
4447b24ab676SJeff Bonwick 	if (list_link_active(&vd->vdev_state_dirty_node))
4448b24ab676SJeff Bonwick 		vdev_state_clean(vd);
4449b24ab676SJeff Bonwick 	if (list_link_active(&vd->vdev_config_dirty_node))
4450b24ab676SJeff Bonwick 		vdev_config_clean(vd);
4451b24ab676SJeff Bonwick 
445288ecc943SGeorge Wilson 	vdev_free(vd);
445388ecc943SGeorge Wilson 
445488ecc943SGeorge Wilson 	if (last_vdev) {
445588ecc943SGeorge Wilson 		vdev_compact_children(rvd);
445688ecc943SGeorge Wilson 	} else {
445788ecc943SGeorge Wilson 		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
445888ecc943SGeorge Wilson 		vdev_add_child(rvd, vd);
445988ecc943SGeorge Wilson 	}
446088ecc943SGeorge Wilson }
446188ecc943SGeorge Wilson 
4462*3f9d6ad7SLin Ling /*
4463*3f9d6ad7SLin Ling  * Remove a device from the pool -
4464*3f9d6ad7SLin Ling  *
4465*3f9d6ad7SLin Ling  * Removing a device from the vdev namespace requires several steps
4466*3f9d6ad7SLin Ling  * and can take a significant amount of time.  As a result we use
4467*3f9d6ad7SLin Ling  * the spa_vdev_config_[enter/exit] functions which allow us to
4468*3f9d6ad7SLin Ling  * grab and release the spa_config_lock while still holding the namespace
4469*3f9d6ad7SLin Ling  * lock.  During each step the configuration is synced out.
4470*3f9d6ad7SLin Ling  */
4471*3f9d6ad7SLin Ling 
/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares, slogs, and level 2 ARC devices.
 *
 * Returns 0 on success; EBUSY if a hot spare is still in use and 'unspare'
 * was not set; ENOTSUP for a normal (non-removable) vdev; ENOENT if no
 * vdev of any kind has the given guid.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	metaslab_group_t *mg;
	nvlist_t **spares, **l2cache, *nv;
	uint64_t txg = 0;
	uint_t nspares, nl2cache;
	int error = 0;
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);

	/*
	 * If the caller already holds the namespace lock, skip
	 * spa_vdev_enter(); txg stays 0 and the matching spa_vdev_exit()
	 * at the bottom is skipped as well.
	 */
	if (!locked)
		txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.
		 */
		if (vd == NULL || unspare) {
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			error = EBUSY;
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		/*
		 * Cache devices can always be removed.
		 */
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL && vd->vdev_islog) {
		ASSERT(!locked);
		ASSERT(vd == vd->vdev_top);

		/*
		 * XXX - Once we have bp-rewrite this should
		 * become the common case.
		 */

		mg = vd->vdev_mg;

		/*
		 * Stop allocating from this vdev.
		 */
		metaslab_group_passivate(mg);

		/*
		 * Wait for the youngest allocations and frees to sync,
		 * and then wait for the deferral of those frees to finish.
		 */
		spa_vdev_config_exit(spa, NULL,
		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

		/*
		 * Attempt to evacuate the vdev.
		 */
		error = spa_vdev_remove_evacuate(spa, vd);

		/* Re-grab the config lock dropped above for the evacuation. */
		txg = spa_vdev_config_enter(spa);

		/*
		 * If we couldn't evacuate the vdev, unwind.
		 */
		if (error) {
			metaslab_group_activate(mg);
			return (spa_vdev_exit(spa, NULL, txg, error));
		}

		/*
		 * Clean up the vdev namespace.
		 */
		spa_vdev_remove_from_namespace(spa, vd);

	} else if (vd != NULL) {
		/*
		 * Normal vdevs cannot be removed (yet).
		 */
		error = ENOTSUP;
	} else {
		/*
		 * There is no vdev of any kind with the specified guid.
		 */
		error = ENOENT;
	}

	if (!locked)
		return (spa_vdev_exit(spa, NULL, txg, error));

	return (error);
}
4579fa9e4066Sahrens 
4580fa9e4066Sahrens /*
45813d7072f8Seschrock  * Find any device that's done replacing, or a vdev marked 'unspare' that's
45823d7072f8Seschrock  * current spared, so we can detach it.
4583fa9e4066Sahrens  */
4584ea8dc4b6Seschrock static vdev_t *
45853d7072f8Seschrock spa_vdev_resilver_done_hunt(vdev_t *vd)
4586fa9e4066Sahrens {
4587ea8dc4b6Seschrock 	vdev_t *newvd, *oldvd;
4588fa9e4066Sahrens 
4589573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++) {
45903d7072f8Seschrock 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4591ea8dc4b6Seschrock 		if (oldvd != NULL)
4592ea8dc4b6Seschrock 			return (oldvd);
4593ea8dc4b6Seschrock 	}
4594fa9e4066Sahrens 
45953d7072f8Seschrock 	/*
45963d7072f8Seschrock 	 * Check for a completed replacement.
45973d7072f8Seschrock 	 */
4598fa9e4066Sahrens 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
4599ea8dc4b6Seschrock 		oldvd = vd->vdev_child[0];
4600ea8dc4b6Seschrock 		newvd = vd->vdev_child[1];
4601ea8dc4b6Seschrock 
46028ad4d6ddSJeff Bonwick 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4603e69acc92SVictor Latushkin 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
46048ad4d6ddSJeff Bonwick 		    !vdev_dtl_required(oldvd))
4605ea8dc4b6Seschrock 			return (oldvd);
4606fa9e4066Sahrens 	}
4607ea8dc4b6Seschrock 
46083d7072f8Seschrock 	/*
46093d7072f8Seschrock 	 * Check for a completed resilver with the 'unspare' flag set.
46103d7072f8Seschrock 	 */
46113d7072f8Seschrock 	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
46123d7072f8Seschrock 		newvd = vd->vdev_child[0];
46133d7072f8Seschrock 		oldvd = vd->vdev_child[1];
46143d7072f8Seschrock 
46153d7072f8Seschrock 		if (newvd->vdev_unspare &&
46168ad4d6ddSJeff Bonwick 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
4617e69acc92SVictor Latushkin 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
46188ad4d6ddSJeff Bonwick 		    !vdev_dtl_required(oldvd)) {
46193d7072f8Seschrock 			newvd->vdev_unspare = 0;
46203d7072f8Seschrock 			return (oldvd);
46213d7072f8Seschrock 		}
46223d7072f8Seschrock 	}
46233d7072f8Seschrock 
4624ea8dc4b6Seschrock 	return (NULL);
4625fa9e4066Sahrens }
4626fa9e4066Sahrens 
/*
 * Detach every device spa_vdev_resilver_done_hunt() reports as finished.
 * Repeats until no more candidates are found, since each detach can expose
 * another completed replacement higher in the tree.
 */
static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		/*
		 * Capture the guids of the vdev, its parent, and grandparent
		 * now: SCL_ALL is dropped below before calling
		 * spa_vdev_detach(), so the vdev_t pointers may not be
		 * dereferenced after that point.
		 */
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			ASSERT(ppvd->vdev_children == 2);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		/* spa_vdev_detach() takes its own locks; drop ours first. */
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}
4662fa9e4066Sahrens 
4663c67d9675Seschrock /*
4664b3388e4fSEric Taylor  * Update the stored path or FRU for this vdev.
4665c67d9675Seschrock  */
4666c67d9675Seschrock int
46676809eb4eSEric Schrock spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
46686809eb4eSEric Schrock     boolean_t ispath)
4669c67d9675Seschrock {
4670c5904d13Seschrock 	vdev_t *vd;
4671208044b8SGeorge Wilson 	boolean_t sync = B_FALSE;
4672c67d9675Seschrock 
4673b3388e4fSEric Taylor 	spa_vdev_state_enter(spa, SCL_ALL);
4674c67d9675Seschrock 
46756809eb4eSEric Schrock 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4676b3388e4fSEric Taylor 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
4677c67d9675Seschrock 
46780e34b6a7Sbonwick 	if (!vd->vdev_ops->vdev_op_leaf)
4679b3388e4fSEric Taylor 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
46800e34b6a7Sbonwick 
46816809eb4eSEric Schrock 	if (ispath) {
4682208044b8SGeorge Wilson 		if (strcmp(value, vd->vdev_path) != 0) {
4683208044b8SGeorge Wilson 			spa_strfree(vd->vdev_path);
4684208044b8SGeorge Wilson 			vd->vdev_path = spa_strdup(value);
4685208044b8SGeorge Wilson 			sync = B_TRUE;
4686208044b8SGeorge Wilson 		}
46876809eb4eSEric Schrock 	} else {
4688208044b8SGeorge Wilson 		if (vd->vdev_fru == NULL) {
4689208044b8SGeorge Wilson 			vd->vdev_fru = spa_strdup(value);
4690208044b8SGeorge Wilson 			sync = B_TRUE;
4691208044b8SGeorge Wilson 		} else if (strcmp(value, vd->vdev_fru) != 0) {
46926809eb4eSEric Schrock 			spa_strfree(vd->vdev_fru);
4693208044b8SGeorge Wilson 			vd->vdev_fru = spa_strdup(value);
4694208044b8SGeorge Wilson 			sync = B_TRUE;
4695208044b8SGeorge Wilson 		}
46966809eb4eSEric Schrock 	}
4697c67d9675Seschrock 
4698208044b8SGeorge Wilson 	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
4699c67d9675Seschrock }
4700c67d9675Seschrock 
/* Update the stored device path for the vdev with the given guid. */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}
47066809eb4eSEric Schrock 
/* Update the stored FRU (field-replaceable unit) for the given vdev. */
int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}
47126809eb4eSEric Schrock 
4713fa9e4066Sahrens /*
4714fa9e4066Sahrens  * ==========================================================================
4715*3f9d6ad7SLin Ling  * SPA Scanning
4716fa9e4066Sahrens  * ==========================================================================
4717fa9e4066Sahrens  */
4718fa9e4066Sahrens 
/*
 * Stop an in-progress scan.  A resilver in progress cannot be cancelled
 * through this path (returns EBUSY); otherwise the result of
 * dsl_scan_cancel() is returned.
 */
int
spa_scan_stop(spa_t *spa)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	if (dsl_scan_resilvering(spa->spa_dsl_pool))
		return (EBUSY);
	return (dsl_scan_cancel(spa->spa_dsl_pool));
}
4727bb8b5132Sek 
4728*3f9d6ad7SLin Ling int
4729*3f9d6ad7SLin Ling spa_scan(spa_t *spa, pool_scan_func_t func)
4730*3f9d6ad7SLin Ling {
4731*3f9d6ad7SLin Ling 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
4732*3f9d6ad7SLin Ling 
4733*3f9d6ad7SLin Ling 	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
4734fa9e4066Sahrens 		return (ENOTSUP);
4735fa9e4066Sahrens 
4736fa9e4066Sahrens 	/*
4737088f3894Sahrens 	 * If a resilver was requested, but there is no DTL on a
4738088f3894Sahrens 	 * writeable leaf device, we have nothing to do.
4739fa9e4066Sahrens 	 */
4740*3f9d6ad7SLin Ling 	if (func == POOL_SCAN_RESILVER &&
4741088f3894Sahrens 	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
4742088f3894Sahrens 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
4743ea8dc4b6Seschrock 		return (0);
4744ea8dc4b6Seschrock 	}
4745fa9e4066Sahrens 
4746*3f9d6ad7SLin Ling 	return (dsl_scan(spa->spa_dsl_pool, func));
4747fa9e4066Sahrens }
4748fa9e4066Sahrens 
4749ea8dc4b6Seschrock /*
4750ea8dc4b6Seschrock  * ==========================================================================
4751ea8dc4b6Seschrock  * SPA async task processing
4752ea8dc4b6Seschrock  * ==========================================================================
4753ea8dc4b6Seschrock  */
4754ea8dc4b6Seschrock 
/*
 * Recursively mark any vdev with vdev_remove_wanted set as REMOVED,
 * clearing its error counters and dirtying its top-level vdev's state.
 */
static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_remove_wanted) {
		vd->vdev_remove_wanted = B_FALSE;
		vd->vdev_delayed_close = B_FALSE;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

		/*
		 * We want to clear the stats, but we don't want to do a full
		 * vdev_clear() as that will cause us to throw away
		 * degraded/faulted state as well as attempt to reopen the
		 * device, all of which is a waste.
		 */
		vd->vdev_stat.vs_read_errors = 0;
		vd->vdev_stat.vs_write_errors = 0;
		vd->vdev_stat.vs_checksum_errors = 0;

		vdev_state_dirty(vd->vdev_top);
	}

	/* Walk the whole subtree: children may also want removal. */
	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_remove(spa, vd->vdev_child[c]);
}
4779fa9e4066Sahrens 
/*
 * Recursively reopen (and thereby probe) any vdev in the subtree that
 * has vdev_probe_wanted set.
 */
static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_probe_wanted) {
		vd->vdev_probe_wanted = B_FALSE;
		vdev_reopen(vd);	/* vdev_open() does the actual probe */
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_probe(spa, vd->vdev_child[c]);
}
4791e14bb325SJeff Bonwick 
/*
 * For every leaf vdev with a physical path, post a sysevent (ESC_DEV_DLE)
 * announcing a possible device-size change, so userland can react to
 * autoexpand.  No-op unless the pool's autoexpand property is on.
 */
static void
spa_async_autoexpand(spa_t *spa, vdev_t *vd)
{
	sysevent_id_t eid;
	nvlist_t *attr;
	char *physpath;

	if (!spa->spa_autoexpand)
		return;

	/* Recurse first; the event for this vdev (if a leaf) comes after. */
	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		spa_async_autoexpand(spa, cvd);
	}

	/* Only leaves with a known physical path can be announced. */
	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
		return;

	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);

	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
	    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

	nvlist_free(attr);
	kmem_free(physpath, MAXPATHLEN);
}
4822573ca77eSGeorge Wilson 
/*
 * Worker thread that drains the pending SPA_ASYNC_* task bits.  The task
 * word is snapshotted and cleared under spa_async_lock; each task then
 * runs with whatever locking regime it needs.  The thread announces its
 * own exit so spa_async_suspend() can wait for it.
 */
static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	/* Atomically take ownership of all currently-requested tasks. */
	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		mutex_enter(&spa_namespace_lock);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		new_space = metaslab_class_get_space(spa_normal_class(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
			    spa, NULL,
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), new_space, new_space - old_space);
		}
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev);
		/* Aux vdevs (l2cache, spares) are not under the root vdev. */
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/* Skip autoexpand events while the pool's I/O is suspended. */
	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}
4908ea8dc4b6Seschrock 
/*
 * Suspend async task dispatch and wait for any running async thread to
 * exit.  Suspensions nest (counter, not a flag); pair with
 * spa_async_resume().
 */
void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}
4918ea8dc4b6Seschrock 
/*
 * Drop one level of async suspension.  Does not itself dispatch pending
 * tasks; that happens on the next spa_async_dispatch() call.
 */
void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}
4927ea8dc4b6Seschrock 
/*
 * Spawn the async worker thread if there are pending tasks, dispatch is
 * not suspended, no worker is already running, and the root filesystem
 * is writable.
 */
static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}
4939ea8dc4b6Seschrock 
/*
 * Record a SPA_ASYNC_* task bit for the async thread to service.  The
 * task is only queued here; spa_async_dispatch() starts the worker.
 */
void
spa_async_request(spa_t *spa, int task)
{
	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}
4948fa9e4066Sahrens 
4949fa9e4066Sahrens /*
4950fa9e4066Sahrens  * ==========================================================================
4951fa9e4066Sahrens  * SPA syncing routines
4952fa9e4066Sahrens  * ==========================================================================
4953fa9e4066Sahrens  */
/*
 * Free every block pointer on the deferred bplist (all must have been
 * born before 'txg'), then vacate the list and pre-dirty its first byte
 * in the MOS object so the sync converges faster.
 */
static void
spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
{
	blkptr_t blk;
	uint64_t itor = 0;
	uint8_t c = 1;

	while (bplist_iterate(bpl, &itor, &blk) == 0) {
		ASSERT(blk.blk_birth < txg);
		zio_free(spa, txg, &blk);
	}

	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
}
4974b24ab676SJeff Bonwick 
/*
 * bplist callback: issue an async free of 'bp' in this tx's txg, as a
 * child of the zio passed through 'arg'.
 */
static void
spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	zio_t *zio = arg;

	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
	    zio->io_flags));
}
4983fa9e4066Sahrens 
4984fa9e4066Sahrens static void
498599653d4eSeschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
4986fa9e4066Sahrens {
4987fa9e4066Sahrens 	char *packed = NULL;
4988f7991ba4STim Haley 	size_t bufsize;
4989fa9e4066Sahrens 	size_t nvsize = 0;
4990fa9e4066Sahrens 	dmu_buf_t *db;
4991fa9e4066Sahrens 
499299653d4eSeschrock 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
4993fa9e4066Sahrens 
4994f7991ba4STim Haley 	/*
4995f7991ba4STim Haley 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
4996f7991ba4STim Haley 	 * information.  This avoids the dbuf_will_dirty() path and
4997f7991ba4STim Haley 	 * saves us a pre-read to get data we don't actually care about.
4998f7991ba4STim Haley 	 */
4999f7991ba4STim Haley 	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
5000f7991ba4STim Haley 	packed = kmem_alloc(bufsize, KM_SLEEP);
5001fa9e4066Sahrens 
500299653d4eSeschrock 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5003ea8dc4b6Seschrock 	    KM_SLEEP) == 0);
5004f7991ba4STim Haley 	bzero(packed + nvsize, bufsize - nvsize);
5005fa9e4066Sahrens 
5006f7991ba4STim Haley 	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5007fa9e4066Sahrens 
5008f7991ba4STim Haley 	kmem_free(packed, bufsize);
5009fa9e4066Sahrens 
501099653d4eSeschrock 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5011fa9e4066Sahrens 	dmu_buf_will_dirty(db, tx);
5012fa9e4066Sahrens 	*(uint64_t *)db->db_data = nvsize;
5013ea8dc4b6Seschrock 	dmu_buf_rele(db, FTAG);
5014fa9e4066Sahrens }
5015fa9e4066Sahrens 
501699653d4eSeschrock static void
5017fa94a07fSbrendan spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5018fa94a07fSbrendan     const char *config, const char *entry)
501999653d4eSeschrock {
502099653d4eSeschrock 	nvlist_t *nvroot;
5021fa94a07fSbrendan 	nvlist_t **list;
502299653d4eSeschrock 	int i;
502399653d4eSeschrock 
5024fa94a07fSbrendan 	if (!sav->sav_sync)
502599653d4eSeschrock 		return;
502699653d4eSeschrock 
502799653d4eSeschrock 	/*
5028fa94a07fSbrendan 	 * Update the MOS nvlist describing the list of available devices.
5029fa94a07fSbrendan 	 * spa_validate_aux() will have already made sure this nvlist is
50303d7072f8Seschrock 	 * valid and the vdevs are labeled appropriately.
503199653d4eSeschrock 	 */
5032fa94a07fSbrendan 	if (sav->sav_object == 0) {
5033fa94a07fSbrendan 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5034fa94a07fSbrendan 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5035fa94a07fSbrendan 		    sizeof (uint64_t), tx);
503699653d4eSeschrock 		VERIFY(zap_update(spa->spa_meta_objset,
5037fa94a07fSbrendan 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5038fa94a07fSbrendan 		    &sav->sav_object, tx) == 0);
503999653d4eSeschrock 	}
504099653d4eSeschrock 
504199653d4eSeschrock 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5042fa94a07fSbrendan 	if (sav->sav_count == 0) {
5043fa94a07fSbrendan 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
504499653d4eSeschrock 	} else {
5045fa94a07fSbrendan 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5046fa94a07fSbrendan 		for (i = 0; i < sav->sav_count; i++)
5047fa94a07fSbrendan 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5048*3f9d6ad7SLin Ling 			    B_FALSE, VDEV_CONFIG_L2CACHE);
5049fa94a07fSbrendan 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5050fa94a07fSbrendan 		    sav->sav_count) == 0);
5051fa94a07fSbrendan 		for (i = 0; i < sav->sav_count; i++)
5052fa94a07fSbrendan 			nvlist_free(list[i]);
5053fa94a07fSbrendan 		kmem_free(list, sav->sav_count * sizeof (void *));
505499653d4eSeschrock 	}
505599653d4eSeschrock 
5056fa94a07fSbrendan 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
505706eeb2adSek 	nvlist_free(nvroot);
505899653d4eSeschrock 
5059fa94a07fSbrendan 	sav->sav_sync = B_FALSE;
506099653d4eSeschrock }
506199653d4eSeschrock 
/*
 * If any vdevs are on the config-dirty list, regenerate the pool config
 * under SCL_STATE, stash it as spa_config_syncing (freeing any previous
 * in-flight copy), and write it to the MOS config object.
 */
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_config_dirty_list))
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	spa_config_exit(spa, SCL_STATE, FTAG);

	/* Replace any config still syncing from an earlier txg. */
	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}
508399653d4eSeschrock 
5084990b4856Slling /*
5085990b4856Slling  * Set zpool properties.
5086990b4856Slling  */
5087b1b8ab34Slling static void
5088*3f9d6ad7SLin Ling spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5089b1b8ab34Slling {
5090b1b8ab34Slling 	spa_t *spa = arg1;
5091b1b8ab34Slling 	objset_t *mos = spa->spa_meta_objset;
5092990b4856Slling 	nvlist_t *nvp = arg2;
5093990b4856Slling 	nvpair_t *elem;
50943d7072f8Seschrock 	uint64_t intval;
5095c5904d13Seschrock 	char *strval;
5096990b4856Slling 	zpool_prop_t prop;
5097990b4856Slling 	const char *propname;
5098990b4856Slling 	zprop_type_t proptype;
5099b1b8ab34Slling 
5100e14bb325SJeff Bonwick 	mutex_enter(&spa->spa_props_lock);
5101e14bb325SJeff Bonwick 
5102990b4856Slling 	elem = NULL;
5103990b4856Slling 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
5104990b4856Slling 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5105990b4856Slling 		case ZPOOL_PROP_VERSION:
5106990b4856Slling 			/*
5107990b4856Slling 			 * Only set version for non-zpool-creation cases
5108990b4856Slling 			 * (set/import). spa_create() needs special care
5109990b4856Slling 			 * for version setting.
5110990b4856Slling 			 */
5111990b4856Slling 			if (tx->tx_txg != TXG_INITIAL) {
5112990b4856Slling 				VERIFY(nvpair_value_uint64(elem,
5113990b4856Slling 				    &intval) == 0);
5114990b4856Slling 				ASSERT(intval <= SPA_VERSION);
5115990b4856Slling 				ASSERT(intval >= spa_version(spa));
5116990b4856Slling 				spa->spa_uberblock.ub_version = intval;
5117990b4856Slling 				vdev_config_dirty(spa->spa_root_vdev);
5118990b4856Slling 			}
5119ecd6cf80Smarks 			break;
5120990b4856Slling 
5121990b4856Slling 		case ZPOOL_PROP_ALTROOT:
5122990b4856Slling 			/*
5123990b4856Slling 			 * 'altroot' is a non-persistent property. It should
5124990b4856Slling 			 * have been set temporarily at creation or import time.
5125990b4856Slling 			 */
5126990b4856Slling 			ASSERT(spa->spa_root != NULL);
5127b1b8ab34Slling 			break;
51283d7072f8Seschrock 
51292f8aaab3Seschrock 		case ZPOOL_PROP_CACHEFILE:
5130990b4856Slling 			/*
5131379c004dSEric Schrock 			 * 'cachefile' is also a non-persisitent property.
5132990b4856Slling 			 */
51333d7072f8Seschrock 			break;
5134990b4856Slling 		default:
5135990b4856Slling 			/*
5136990b4856Slling 			 * Set pool property values in the poolprops mos object.
5137990b4856Slling 			 */
5138990b4856Slling 			if (spa->spa_pool_props_object == 0) {
5139990b4856Slling 				VERIFY((spa->spa_pool_props_object =
5140990b4856Slling 				    zap_create(mos, DMU_OT_POOL_PROPS,
5141990b4856Slling 				    DMU_OT_NONE, 0, tx)) > 0);
5142990b4856Slling 
5143990b4856Slling 				VERIFY(zap_update(mos,
5144990b4856Slling 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5145990b4856Slling 				    8, 1, &spa->spa_pool_props_object, tx)
5146990b4856Slling 				    == 0);
5147990b4856Slling 			}
5148990b4856Slling 
5149990b4856Slling 			/* normalize the property name */
5150990b4856Slling 			propname = zpool_prop_to_name(prop);
5151990b4856Slling 			proptype = zpool_prop_get_type(prop);
5152990b4856Slling 
5153990b4856Slling 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
5154990b4856Slling 				ASSERT(proptype == PROP_TYPE_STRING);
5155990b4856Slling 				VERIFY(nvpair_value_string(elem, &strval) == 0);
5156990b4856Slling 				VERIFY(zap_update(mos,
5157990b4856Slling 				    spa->spa_pool_props_object, propname,
5158990b4856Slling 				    1, strlen(strval) + 1, strval, tx) == 0);
5159990b4856Slling 
5160990b4856Slling 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5161990b4856Slling 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5162990b4856Slling 
5163990b4856Slling 				if (proptype == PROP_TYPE_INDEX) {
5164990b4856Slling 					const char *unused;
5165990b4856Slling 					VERIFY(zpool_prop_index_to_string(
5166990b4856Slling 					    prop, intval, &unused) == 0);
5167990b4856Slling 				}
5168990b4856Slling 				VERIFY(zap_update(mos,
5169990b4856Slling 				    spa->spa_pool_props_object, propname,
5170990b4856Slling 				    8, 1, &intval, tx) == 0);
5171990b4856Slling 			} else {
5172990b4856Slling 				ASSERT(0); /* not allowed */
5173990b4856Slling 			}
5174990b4856Slling 
51750a4e9518Sgw 			switch (prop) {
51760a4e9518Sgw 			case ZPOOL_PROP_DELEGATION:
5177990b4856Slling 				spa->spa_delegation = intval;
51780a4e9518Sgw 				break;
51790a4e9518Sgw 			case ZPOOL_PROP_BOOTFS:
5180990b4856Slling 				spa->spa_bootfs = intval;
51810a4e9518Sgw 				break;
51820a4e9518Sgw 			case ZPOOL_PROP_FAILUREMODE:
51830a4e9518Sgw 				spa->spa_failmode = intval;
51840a4e9518Sgw 				break;
5185573ca77eSGeorge Wilson 			case ZPOOL_PROP_AUTOEXPAND:
5186573ca77eSGeorge Wilson 				spa->spa_autoexpand = intval;
5187573ca77eSGeorge Wilson 				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
5188573ca77eSGeorge Wilson 				break;
5189b24ab676SJeff Bonwick 			case ZPOOL_PROP_DEDUPDITTO:
5190b24ab676SJeff Bonwick 				spa->spa_dedup_ditto = intval;
5191b24ab676SJeff Bonwick 				break;
51920a4e9518Sgw 			default:
51930a4e9518Sgw 				break;
51940a4e9518Sgw 			}
5195990b4856Slling 		}
5196990b4856Slling 
5197990b4856Slling 		/* log internal history if this is not a zpool create */
5198990b4856Slling 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
5199990b4856Slling 		    tx->tx_txg != TXG_INITIAL) {
5200*3f9d6ad7SLin Ling 			spa_history_log_internal(LOG_POOL_PROPSET,
5201*3f9d6ad7SLin Ling 			    spa, tx, "%s %lld %s",
5202e14bb325SJeff Bonwick 			    nvpair_name(elem), intval, spa_name(spa));
5203b1b8ab34Slling 		}
5204b1b8ab34Slling 	}
5205e14bb325SJeff Bonwick 
5206e14bb325SJeff Bonwick 	mutex_exit(&spa->spa_props_lock);
5207b1b8ab34Slling }
5208b1b8ab34Slling 
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	/* Frees deferred from the previous txg, and frees queued this txg. */
	bplist_t *defer_bpl = &spa->spa_deferred_bplist;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	/* Open the deferred-free list stored in the MOS for this pool. */
	VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		/* Only deflate if every top-level vdev has the minimum ratio. */
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/* One-time pool-version upgrade work, keyed off the version bump. */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	/*
	 * If anything has changed in this txg, or if someone is waiting
	 * for this txg to sync (eg, spa_vdev_remove()), push the
	 * deferred frees from the previous txg.  If not, leave them
	 * alone so that we don't generate work on an otherwise idle
	 * system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
	    ((dp->dp_scan->scn_phys.scn_state == DSS_SCANNING ||
	    txg_sync_waiting(dp)) && !spa_shutting_down(spa)))
		spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);

	/*
	 * Iterate to convergence.  Each sync pass may dirty more data
	 * (new blocks for the MOS, config, etc.), so keep looping until
	 * the meta-objset is clean for this txg.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		/*
		 * Early passes free blocks immediately; later passes defer
		 * the frees to the next txg to bound the amount of work
		 * (and hence new dirtying) done per pass.
		 */
		if (pass <= SYNC_PASS_DEFERRED_FREE) {
			zio_t *zio = zio_root(spa, NULL, NULL, 0);
			bplist_sync(free_bpl, spa_sync_free, zio, tx);
			VERIFY(zio_wait(zio) == 0);
		} else {
			bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		/* Sync every vdev that was dirtied in this txg. */
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
			vdev_sync(vd, txg);

	} while (dmu_objset_is_dirty(mos, txg));

	ASSERT(list_is_empty(&free_bpl->bpl_queue));

	bplist_close(defer_bpl);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			/*
			 * Pick up to SPA_DVAS_PER_BP eligible top-level
			 * vdevs, starting from a random child to spread
			 * label writes across the pool over time.
			 */
			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			/*
			 * On failure, retry once with the final boolean
			 * set -- NOTE(review): presumably a "try harder"
			 * mode; confirm against vdev_config_sync().
			 */
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		/*
		 * The label write failed even on retry: suspend the pool's
		 * I/O, wait until it is resumed, then try again.
		 */
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/* The in-core uberblock is now the last-synced uberblock. */
	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(list_is_empty(&defer_bpl->bpl_queue));
	ASSERT(list_is_empty(&free_bpl->bpl_queue));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
5443fa9e4066Sahrens 
5444fa9e4066Sahrens /*
5445fa9e4066Sahrens  * Sync all pools.  We don't want to hold the namespace lock across these
5446fa9e4066Sahrens  * operations, so we take a reference on the spa_t and drop the lock during the
5447fa9e4066Sahrens  * sync.
5448fa9e4066Sahrens  */
5449fa9e4066Sahrens void
5450fa9e4066Sahrens spa_sync_allpools(void)
5451fa9e4066Sahrens {
5452fa9e4066Sahrens 	spa_t *spa = NULL;
5453fa9e4066Sahrens 	mutex_enter(&spa_namespace_lock);
5454fa9e4066Sahrens 	while ((spa = spa_next(spa)) != NULL) {
5455e14bb325SJeff Bonwick 		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
5456fa9e4066Sahrens 			continue;
5457fa9e4066Sahrens 		spa_open_ref(spa, FTAG);
5458fa9e4066Sahrens 		mutex_exit(&spa_namespace_lock);
5459fa9e4066Sahrens 		txg_wait_synced(spa_get_dsl(spa), 0);
5460fa9e4066Sahrens 		mutex_enter(&spa_namespace_lock);
5461fa9e4066Sahrens 		spa_close(spa, FTAG);
5462fa9e4066Sahrens 	}
5463fa9e4066Sahrens 	mutex_exit(&spa_namespace_lock);
5464fa9e4066Sahrens }
5465fa9e4066Sahrens 
5466fa9e4066Sahrens /*
5467fa9e4066Sahrens  * ==========================================================================
5468fa9e4066Sahrens  * Miscellaneous routines
5469fa9e4066Sahrens  * ==========================================================================
5470fa9e4066Sahrens  */
5471fa9e4066Sahrens 
/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 * The reference taken here keeps the spa valid while the
		 * lock is dropped.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		/* Tear down loaded state before removing the spa itself. */
		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}
5505ea8dc4b6Seschrock 
5506ea8dc4b6Seschrock vdev_t *
55076809eb4eSEric Schrock spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
5508ea8dc4b6Seschrock {
5509c5904d13Seschrock 	vdev_t *vd;
5510c5904d13Seschrock 	int i;
5511c5904d13Seschrock 
5512c5904d13Seschrock 	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
5513c5904d13Seschrock 		return (vd);
5514c5904d13Seschrock 
55156809eb4eSEric Schrock 	if (aux) {
5516c5904d13Seschrock 		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
5517c5904d13Seschrock 			vd = spa->spa_l2cache.sav_vdevs[i];
55186809eb4eSEric Schrock 			if (vd->vdev_guid == guid)
55196809eb4eSEric Schrock 				return (vd);
55206809eb4eSEric Schrock 		}
55216809eb4eSEric Schrock 
55226809eb4eSEric Schrock 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
55236809eb4eSEric Schrock 			vd = spa->spa_spares.sav_vdevs[i];
5524c5904d13Seschrock 			if (vd->vdev_guid == guid)
5525c5904d13Seschrock 				return (vd);
5526c5904d13Seschrock 		}
5527c5904d13Seschrock 	}
5528c5904d13Seschrock 
5529c5904d13Seschrock 	return (NULL);
5530ea8dc4b6Seschrock }
5531eaca9bbdSeschrock 
5532eaca9bbdSeschrock void
5533990b4856Slling spa_upgrade(spa_t *spa, uint64_t version)
5534eaca9bbdSeschrock {
5535e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5536eaca9bbdSeschrock 
5537eaca9bbdSeschrock 	/*
5538eaca9bbdSeschrock 	 * This should only be called for a non-faulted pool, and since a
5539eaca9bbdSeschrock 	 * future version would result in an unopenable pool, this shouldn't be
5540eaca9bbdSeschrock 	 * possible.
5541eaca9bbdSeschrock 	 */
5542e7437265Sahrens 	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
5543990b4856Slling 	ASSERT(version >= spa->spa_uberblock.ub_version);
5544eaca9bbdSeschrock 
5545990b4856Slling 	spa->spa_uberblock.ub_version = version;
5546eaca9bbdSeschrock 	vdev_config_dirty(spa->spa_root_vdev);
5547eaca9bbdSeschrock 
5548e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALL, FTAG);
554999653d4eSeschrock 
555099653d4eSeschrock 	txg_wait_synced(spa_get_dsl(spa), 0);
555199653d4eSeschrock }
555299653d4eSeschrock 
555399653d4eSeschrock boolean_t
555499653d4eSeschrock spa_has_spare(spa_t *spa, uint64_t guid)
555599653d4eSeschrock {
555699653d4eSeschrock 	int i;
555739c23413Seschrock 	uint64_t spareguid;
5558fa94a07fSbrendan 	spa_aux_vdev_t *sav = &spa->spa_spares;
555999653d4eSeschrock 
5560fa94a07fSbrendan 	for (i = 0; i < sav->sav_count; i++)
5561fa94a07fSbrendan 		if (sav->sav_vdevs[i]->vdev_guid == guid)
556299653d4eSeschrock 			return (B_TRUE);
556399653d4eSeschrock 
5564fa94a07fSbrendan 	for (i = 0; i < sav->sav_npending; i++) {
5565fa94a07fSbrendan 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
5566fa94a07fSbrendan 		    &spareguid) == 0 && spareguid == guid)
556739c23413Seschrock 			return (B_TRUE);
556839c23413Seschrock 	}
556939c23413Seschrock 
557099653d4eSeschrock 	return (B_FALSE);
5571eaca9bbdSeschrock }
5572b1b8ab34Slling 
557389a89ebfSlling /*
557489a89ebfSlling  * Check if a pool has an active shared spare device.
557589a89ebfSlling  * Note: reference count of an active spare is 2, as a spare and as a replace
557689a89ebfSlling  */
557789a89ebfSlling static boolean_t
557889a89ebfSlling spa_has_active_shared_spare(spa_t *spa)
557989a89ebfSlling {
558089a89ebfSlling 	int i, refcnt;
558189a89ebfSlling 	uint64_t pool;
558289a89ebfSlling 	spa_aux_vdev_t *sav = &spa->spa_spares;
558389a89ebfSlling 
558489a89ebfSlling 	for (i = 0; i < sav->sav_count; i++) {
558589a89ebfSlling 		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
558689a89ebfSlling 		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
558789a89ebfSlling 		    refcnt > 2)
558889a89ebfSlling 			return (B_TRUE);
558989a89ebfSlling 	}
559089a89ebfSlling 
559189a89ebfSlling 	return (B_FALSE);
559289a89ebfSlling }
559389a89ebfSlling 
/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t		*ev;
	sysevent_attr_list_t	*attr = NULL;
	sysevent_value_t	value;
	sysevent_id_t		eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	/* Every event identifies the pool by name ... */
	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	/* ... and by guid. */
	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	/* Vdev attributes are only attached when a vdev was supplied. */
	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		/* The path is optional; some vdevs have none. */
		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	/*
	 * After a successful attach the attribute list belongs to the
	 * event; clear 'attr' so the cleanup path below doesn't free it
	 * a second time.
	 */
	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
5651