xref: /illumos-gate/usr/src/uts/common/fs/zfs/spa.c (revision 8704186e373c9ed74daa395ff3f7fd745396df9e)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
2199653d4eSeschrock 
22fa9e4066Sahrens /*
2398d1cbfeSGeorge Wilson  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
245aeb9474SGarrett D'Amore  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
25e9103aaeSGarrett D'Amore  * Copyright (c) 2011 by Delphix. All rights reserved.
265aeb9474SGarrett D'Amore  */
27fa9e4066Sahrens 
28fa9e4066Sahrens /*
29fa9e4066Sahrens  * This file contains all the routines used when modifying on-disk SPA state.
30fa9e4066Sahrens  * This includes opening, importing, destroying, exporting a pool, and syncing a
31fa9e4066Sahrens  * pool.
32fa9e4066Sahrens  */
33fa9e4066Sahrens 
34fa9e4066Sahrens #include <sys/zfs_context.h>
35ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h>
36fa9e4066Sahrens #include <sys/spa_impl.h>
37fa9e4066Sahrens #include <sys/zio.h>
38fa9e4066Sahrens #include <sys/zio_checksum.h>
39fa9e4066Sahrens #include <sys/dmu.h>
40fa9e4066Sahrens #include <sys/dmu_tx.h>
41fa9e4066Sahrens #include <sys/zap.h>
42fa9e4066Sahrens #include <sys/zil.h>
43b24ab676SJeff Bonwick #include <sys/ddt.h>
44fa9e4066Sahrens #include <sys/vdev_impl.h>
45fa9e4066Sahrens #include <sys/metaslab.h>
4688ecc943SGeorge Wilson #include <sys/metaslab_impl.h>
47fa9e4066Sahrens #include <sys/uberblock_impl.h>
48fa9e4066Sahrens #include <sys/txg.h>
49fa9e4066Sahrens #include <sys/avl.h>
50fa9e4066Sahrens #include <sys/dmu_traverse.h>
51b1b8ab34Slling #include <sys/dmu_objset.h>
52fa9e4066Sahrens #include <sys/unique.h>
53fa9e4066Sahrens #include <sys/dsl_pool.h>
54b1b8ab34Slling #include <sys/dsl_dataset.h>
55fa9e4066Sahrens #include <sys/dsl_dir.h>
56fa9e4066Sahrens #include <sys/dsl_prop.h>
57b1b8ab34Slling #include <sys/dsl_synctask.h>
58fa9e4066Sahrens #include <sys/fs/zfs.h>
59fa94a07fSbrendan #include <sys/arc.h>
60fa9e4066Sahrens #include <sys/callb.h>
6195173954Sek #include <sys/systeminfo.h>
62e7cbe64fSgw #include <sys/spa_boot.h>
63573ca77eSGeorge Wilson #include <sys/zfs_ioctl.h>
643f9d6ad7SLin Ling #include <sys/dsl_scan.h>
65fa9e4066Sahrens 
665679c89fSjv #ifdef	_KERNEL
67dedec472SJack Meng #include <sys/bootprops.h>
6835a5a358SJonathan Adams #include <sys/callb.h>
6935a5a358SJonathan Adams #include <sys/cpupart.h>
7035a5a358SJonathan Adams #include <sys/pool.h>
7135a5a358SJonathan Adams #include <sys/sysdc.h>
7235a5a358SJonathan Adams #include <sys/zone.h>
735679c89fSjv #endif	/* _KERNEL */
745679c89fSjv 
75990b4856Slling #include "zfs_prop.h"
76b7b97454Sperrin #include "zfs_comutil.h"
77990b4856Slling 
/*
 * Per-taskq thread-count policy.  Each zio taskq entry selects one of
 * these modes; zti_value is interpreted per-mode (see comments below).
 */
typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

/* Initializers for zio_taskq_info_t table entries below. */
#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

/* One (mode, value) pair per (zio type, taskq type) combination. */
typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

/* Human-readable taskq-type suffixes, indexed by ZIO_TASKQ_* */
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

/* Forward declarations for routines defined later in this file. */
static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

/* Tunables controlling zio taskq sizing and scheduling. */
uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"
13535a5a358SJonathan Adams 
136990b4856Slling /*
137990b4856Slling  * ==========================================================================
138990b4856Slling  * SPA properties routines
139990b4856Slling  * ==========================================================================
140990b4856Slling  */
141990b4856Slling 
142990b4856Slling /*
143990b4856Slling  * Add a (source=src, propname=propval) list to an nvlist.
144990b4856Slling  */
1459d82f4f6Slling static void
146990b4856Slling spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
147990b4856Slling     uint64_t intval, zprop_source_t src)
148990b4856Slling {
149990b4856Slling 	const char *propname = zpool_prop_to_name(prop);
150990b4856Slling 	nvlist_t *propval;
151990b4856Slling 
1529d82f4f6Slling 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1539d82f4f6Slling 	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
154990b4856Slling 
1559d82f4f6Slling 	if (strval != NULL)
1569d82f4f6Slling 		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
1579d82f4f6Slling 	else
1589d82f4f6Slling 		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
159990b4856Slling 
1609d82f4f6Slling 	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
161990b4856Slling 	nvlist_free(propval);
162990b4856Slling }
163990b4856Slling 
164990b4856Slling /*
165990b4856Slling  * Get property values from the spa configuration.
166990b4856Slling  */
1679d82f4f6Slling static void
168990b4856Slling spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
169990b4856Slling {
170379c004dSEric Schrock 	uint64_t size;
171485bbbf5SGeorge Wilson 	uint64_t alloc;
172990b4856Slling 	uint64_t cap, version;
173990b4856Slling 	zprop_source_t src = ZPROP_SRC_NONE;
174c5904d13Seschrock 	spa_config_dirent_t *dp;
175990b4856Slling 
176e14bb325SJeff Bonwick 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
177e14bb325SJeff Bonwick 
178379c004dSEric Schrock 	if (spa->spa_root_vdev != NULL) {
179485bbbf5SGeorge Wilson 		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
180b24ab676SJeff Bonwick 		size = metaslab_class_get_space(spa_normal_class(spa));
181379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
182379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
183485bbbf5SGeorge Wilson 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
184485bbbf5SGeorge Wilson 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
185485bbbf5SGeorge Wilson 		    size - alloc, src);
186f9af39baSGeorge Wilson 		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
187f9af39baSGeorge Wilson 		    (spa_mode(spa) == FREAD), src);
188379c004dSEric Schrock 
189485bbbf5SGeorge Wilson 		cap = (size == 0) ? 0 : (alloc * 100 / size);
190379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
191379c004dSEric Schrock 
192b24ab676SJeff Bonwick 		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
193b24ab676SJeff Bonwick 		    ddt_get_pool_dedup_ratio(spa), src);
194b24ab676SJeff Bonwick 
195379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
196379c004dSEric Schrock 		    spa->spa_root_vdev->vdev_state, src);
197379c004dSEric Schrock 
198379c004dSEric Schrock 		version = spa_version(spa);
199379c004dSEric Schrock 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
200379c004dSEric Schrock 			src = ZPROP_SRC_DEFAULT;
201379c004dSEric Schrock 		else
202379c004dSEric Schrock 			src = ZPROP_SRC_LOCAL;
203379c004dSEric Schrock 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
204379c004dSEric Schrock 	}
205990b4856Slling 
2069d82f4f6Slling 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
207990b4856Slling 
208*8704186eSDan McDonald 	if (spa->spa_comment != NULL) {
209*8704186eSDan McDonald 		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
210*8704186eSDan McDonald 		    0, ZPROP_SRC_LOCAL);
211*8704186eSDan McDonald 	}
212*8704186eSDan McDonald 
2139d82f4f6Slling 	if (spa->spa_root != NULL)
2149d82f4f6Slling 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
2159d82f4f6Slling 		    0, ZPROP_SRC_LOCAL);
216990b4856Slling 
217c5904d13Seschrock 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
218c5904d13Seschrock 		if (dp->scd_path == NULL) {
2199d82f4f6Slling 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
220c5904d13Seschrock 			    "none", 0, ZPROP_SRC_LOCAL);
221c5904d13Seschrock 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
2229d82f4f6Slling 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
223c5904d13Seschrock 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
2242f8aaab3Seschrock 		}
2252f8aaab3Seschrock 	}
226990b4856Slling }
227990b4856Slling 
/*
 * Get zpool property values.
 *
 * Allocates a new nvlist in *nvp and fills it with properties from the
 * in-core config (spa_prop_get_config()) and, if present, the MOS pool
 * properties ZAP object.  On failure (other than ENOENT from cursor
 * exhaustion) the nvlist is freed, *nvp is set to NULL, and the error
 * is returned; otherwise returns 0 with *nvp owned by the caller.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 *
	 * NOTE(review): errors raised inside the loop body (e.g. from
	 * dsl_dataset_hold_obj() or zap_lookup()) only break out of the
	 * switch; the loop then advances and err is overwritten by the
	 * next zap_cursor_retrieve().  Such per-entry errors are thus
	 * effectively skipped unless they occur on the final entry —
	 * confirm whether this best-effort behavior is intended.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		/* Skip ZAP entries that aren't known pool properties. */
		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				/*
				 * bootfs is stored as a dataset object
				 * number; translate it back to a name.
				 */
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
	/* NOTE(review): this label has no corresponding goto. */
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
334990b4856Slling 
335990b4856Slling /*
336990b4856Slling  * Validate the given pool properties nvlist and modify the list
337990b4856Slling  * for the property values to be set.
338990b4856Slling  */
339990b4856Slling static int
340990b4856Slling spa_prop_validate(spa_t *spa, nvlist_t *props)
341990b4856Slling {
342990b4856Slling 	nvpair_t *elem;
343990b4856Slling 	int error = 0, reset_bootfs = 0;
344990b4856Slling 	uint64_t objnum;
345990b4856Slling 
346990b4856Slling 	elem = NULL;
347990b4856Slling 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
348990b4856Slling 		zpool_prop_t prop;
349990b4856Slling 		char *propname, *strval;
350990b4856Slling 		uint64_t intval;
351990b4856Slling 		objset_t *os;
352*8704186eSDan McDonald 		char *slash, *check;
353990b4856Slling 
354990b4856Slling 		propname = nvpair_name(elem);
355990b4856Slling 
356990b4856Slling 		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
357990b4856Slling 			return (EINVAL);
358990b4856Slling 
359990b4856Slling 		switch (prop) {
360990b4856Slling 		case ZPOOL_PROP_VERSION:
361990b4856Slling 			error = nvpair_value_uint64(elem, &intval);
362990b4856Slling 			if (!error &&
363990b4856Slling 			    (intval < spa_version(spa) || intval > SPA_VERSION))
364990b4856Slling 				error = EINVAL;
365990b4856Slling 			break;
366990b4856Slling 
367990b4856Slling 		case ZPOOL_PROP_DELEGATION:
368990b4856Slling 		case ZPOOL_PROP_AUTOREPLACE:
369d5b5bb25SRich Morris 		case ZPOOL_PROP_LISTSNAPS:
370573ca77eSGeorge Wilson 		case ZPOOL_PROP_AUTOEXPAND:
371990b4856Slling 			error = nvpair_value_uint64(elem, &intval);
372990b4856Slling 			if (!error && intval > 1)
373990b4856Slling 				error = EINVAL;
374990b4856Slling 			break;
375990b4856Slling 
376990b4856Slling 		case ZPOOL_PROP_BOOTFS:
37725f89ee2SJeff Bonwick 			/*
37825f89ee2SJeff Bonwick 			 * If the pool version is less than SPA_VERSION_BOOTFS,
37925f89ee2SJeff Bonwick 			 * or the pool is still being created (version == 0),
38025f89ee2SJeff Bonwick 			 * the bootfs property cannot be set.
38125f89ee2SJeff Bonwick 			 */
382990b4856Slling 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
383990b4856Slling 				error = ENOTSUP;
384990b4856Slling 				break;
385990b4856Slling 			}
386990b4856Slling 
387990b4856Slling 			/*
38815e6edf1Sgw 			 * Make sure the vdev config is bootable
389990b4856Slling 			 */
39015e6edf1Sgw 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
391990b4856Slling 				error = ENOTSUP;
392990b4856Slling 				break;
393990b4856Slling 			}
394990b4856Slling 
395990b4856Slling 			reset_bootfs = 1;
396990b4856Slling 
397990b4856Slling 			error = nvpair_value_string(elem, &strval);
398990b4856Slling 
399990b4856Slling 			if (!error) {
40015e6edf1Sgw 				uint64_t compress;
40115e6edf1Sgw 
402990b4856Slling 				if (strval == NULL || strval[0] == '\0') {
403990b4856Slling 					objnum = zpool_prop_default_numeric(
404990b4856Slling 					    ZPOOL_PROP_BOOTFS);
405990b4856Slling 					break;
406990b4856Slling 				}
407990b4856Slling 
408503ad85cSMatthew Ahrens 				if (error = dmu_objset_hold(strval, FTAG, &os))
409990b4856Slling 					break;
41015e6edf1Sgw 
411503ad85cSMatthew Ahrens 				/* Must be ZPL and not gzip compressed. */
412503ad85cSMatthew Ahrens 
413503ad85cSMatthew Ahrens 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
414503ad85cSMatthew Ahrens 					error = ENOTSUP;
415503ad85cSMatthew Ahrens 				} else if ((error = dsl_prop_get_integer(strval,
41615e6edf1Sgw 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
41715e6edf1Sgw 				    &compress, NULL)) == 0 &&
41815e6edf1Sgw 				    !BOOTFS_COMPRESS_VALID(compress)) {
41915e6edf1Sgw 					error = ENOTSUP;
42015e6edf1Sgw 				} else {
42115e6edf1Sgw 					objnum = dmu_objset_id(os);
42215e6edf1Sgw 				}
423503ad85cSMatthew Ahrens 				dmu_objset_rele(os, FTAG);
424990b4856Slling 			}
425990b4856Slling 			break;
426e14bb325SJeff Bonwick 
4270a4e9518Sgw 		case ZPOOL_PROP_FAILUREMODE:
4280a4e9518Sgw 			error = nvpair_value_uint64(elem, &intval);
4290a4e9518Sgw 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
4300a4e9518Sgw 			    intval > ZIO_FAILURE_MODE_PANIC))
4310a4e9518Sgw 				error = EINVAL;
4320a4e9518Sgw 
4330a4e9518Sgw 			/*
4340a4e9518Sgw 			 * This is a special case which only occurs when
4350a4e9518Sgw 			 * the pool has completely failed. This allows
4360a4e9518Sgw 			 * the user to change the in-core failmode property
4370a4e9518Sgw 			 * without syncing it out to disk (I/Os might
4380a4e9518Sgw 			 * currently be blocked). We do this by returning
4390a4e9518Sgw 			 * EIO to the caller (spa_prop_set) to trick it
4400a4e9518Sgw 			 * into thinking we encountered a property validation
4410a4e9518Sgw 			 * error.
4420a4e9518Sgw 			 */
443e14bb325SJeff Bonwick 			if (!error && spa_suspended(spa)) {
4440a4e9518Sgw 				spa->spa_failmode = intval;
4450a4e9518Sgw 				error = EIO;
4460a4e9518Sgw 			}
4470a4e9518Sgw 			break;
4482f8aaab3Seschrock 
4492f8aaab3Seschrock 		case ZPOOL_PROP_CACHEFILE:
4502f8aaab3Seschrock 			if ((error = nvpair_value_string(elem, &strval)) != 0)
4512f8aaab3Seschrock 				break;
4522f8aaab3Seschrock 
4532f8aaab3Seschrock 			if (strval[0] == '\0')
4542f8aaab3Seschrock 				break;
4552f8aaab3Seschrock 
4562f8aaab3Seschrock 			if (strcmp(strval, "none") == 0)
4572f8aaab3Seschrock 				break;
4582f8aaab3Seschrock 
4592f8aaab3Seschrock 			if (strval[0] != '/') {
4602f8aaab3Seschrock 				error = EINVAL;
4612f8aaab3Seschrock 				break;
4622f8aaab3Seschrock 			}
4632f8aaab3Seschrock 
4642f8aaab3Seschrock 			slash = strrchr(strval, '/');
4652f8aaab3Seschrock 			ASSERT(slash != NULL);
4662f8aaab3Seschrock 
4672f8aaab3Seschrock 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
4682f8aaab3Seschrock 			    strcmp(slash, "/..") == 0)
4692f8aaab3Seschrock 				error = EINVAL;
4702f8aaab3Seschrock 			break;
471b24ab676SJeff Bonwick 
472*8704186eSDan McDonald 		case ZPOOL_PROP_COMMENT:
473*8704186eSDan McDonald 			if ((error = nvpair_value_string(elem, &strval)) != 0)
474*8704186eSDan McDonald 				break;
475*8704186eSDan McDonald 			for (check = strval; *check != '\0'; check++) {
476*8704186eSDan McDonald 				/*
477*8704186eSDan McDonald 				 * The kernel doesn't have an easy isprint()
478*8704186eSDan McDonald 				 * check.  For this kernel check, we merely
479*8704186eSDan McDonald 				 * check ASCII apart from DEL.  Fix this if
480*8704186eSDan McDonald 				 * there is an easy-to-use kernel isprint().
481*8704186eSDan McDonald 				 */
482*8704186eSDan McDonald 				if (*check >= 0x7f) {
483*8704186eSDan McDonald 					error = EINVAL;
484*8704186eSDan McDonald 					break;
485*8704186eSDan McDonald 				}
486*8704186eSDan McDonald 				check++;
487*8704186eSDan McDonald 			}
488*8704186eSDan McDonald 			if (strlen(strval) > ZPROP_MAX_COMMENT)
489*8704186eSDan McDonald 				error = E2BIG;
490*8704186eSDan McDonald 			break;
491*8704186eSDan McDonald 
492b24ab676SJeff Bonwick 		case ZPOOL_PROP_DEDUPDITTO:
493b24ab676SJeff Bonwick 			if (spa_version(spa) < SPA_VERSION_DEDUP)
494b24ab676SJeff Bonwick 				error = ENOTSUP;
495b24ab676SJeff Bonwick 			else
496b24ab676SJeff Bonwick 				error = nvpair_value_uint64(elem, &intval);
497b24ab676SJeff Bonwick 			if (error == 0 &&
498b24ab676SJeff Bonwick 			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
499b24ab676SJeff Bonwick 				error = EINVAL;
500b24ab676SJeff Bonwick 			break;
501990b4856Slling 		}
502990b4856Slling 
503990b4856Slling 		if (error)
504990b4856Slling 			break;
505990b4856Slling 	}
506990b4856Slling 
507990b4856Slling 	if (!error && reset_bootfs) {
508990b4856Slling 		error = nvlist_remove(props,
509990b4856Slling 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
510990b4856Slling 
511990b4856Slling 		if (!error) {
512990b4856Slling 			error = nvlist_add_uint64(props,
513990b4856Slling 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
514990b4856Slling 		}
515990b4856Slling 	}
516990b4856Slling 
517990b4856Slling 	return (error);
518990b4856Slling }
519990b4856Slling 
520379c004dSEric Schrock void
521379c004dSEric Schrock spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
522379c004dSEric Schrock {
523379c004dSEric Schrock 	char *cachefile;
524379c004dSEric Schrock 	spa_config_dirent_t *dp;
525379c004dSEric Schrock 
526379c004dSEric Schrock 	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
527379c004dSEric Schrock 	    &cachefile) != 0)
528379c004dSEric Schrock 		return;
529379c004dSEric Schrock 
530379c004dSEric Schrock 	dp = kmem_alloc(sizeof (spa_config_dirent_t),
531379c004dSEric Schrock 	    KM_SLEEP);
532379c004dSEric Schrock 
533379c004dSEric Schrock 	if (cachefile[0] == '\0')
534379c004dSEric Schrock 		dp->scd_path = spa_strdup(spa_config_path);
535379c004dSEric Schrock 	else if (strcmp(cachefile, "none") == 0)
536379c004dSEric Schrock 		dp->scd_path = NULL;
537379c004dSEric Schrock 	else
538379c004dSEric Schrock 		dp->scd_path = spa_strdup(cachefile);
539379c004dSEric Schrock 
540379c004dSEric Schrock 	list_insert_head(&spa->spa_config_list, dp);
541379c004dSEric Schrock 	if (need_sync)
542379c004dSEric Schrock 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
543379c004dSEric Schrock }
544379c004dSEric Schrock 
545990b4856Slling int
546990b4856Slling spa_prop_set(spa_t *spa, nvlist_t *nvp)
547990b4856Slling {
548990b4856Slling 	int error;
549379c004dSEric Schrock 	nvpair_t *elem;
550379c004dSEric Schrock 	boolean_t need_sync = B_FALSE;
551379c004dSEric Schrock 	zpool_prop_t prop;
552990b4856Slling 
553990b4856Slling 	if ((error = spa_prop_validate(spa, nvp)) != 0)
554990b4856Slling 		return (error);
555990b4856Slling 
556379c004dSEric Schrock 	elem = NULL;
557379c004dSEric Schrock 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
558379c004dSEric Schrock 		if ((prop = zpool_name_to_prop(
559379c004dSEric Schrock 		    nvpair_name(elem))) == ZPROP_INVAL)
560379c004dSEric Schrock 			return (EINVAL);
561379c004dSEric Schrock 
562f9af39baSGeorge Wilson 		if (prop == ZPOOL_PROP_CACHEFILE ||
563f9af39baSGeorge Wilson 		    prop == ZPOOL_PROP_ALTROOT ||
564f9af39baSGeorge Wilson 		    prop == ZPOOL_PROP_READONLY)
565379c004dSEric Schrock 			continue;
566379c004dSEric Schrock 
567379c004dSEric Schrock 		need_sync = B_TRUE;
568379c004dSEric Schrock 		break;
569379c004dSEric Schrock 	}
570379c004dSEric Schrock 
571379c004dSEric Schrock 	if (need_sync)
572379c004dSEric Schrock 		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
573379c004dSEric Schrock 		    spa, nvp, 3));
574379c004dSEric Schrock 	else
575379c004dSEric Schrock 		return (0);
576990b4856Slling }
577990b4856Slling 
578990b4856Slling /*
579990b4856Slling  * If the bootfs property value is dsobj, clear it.
580990b4856Slling  */
581990b4856Slling void
582990b4856Slling spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
583990b4856Slling {
584990b4856Slling 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
585990b4856Slling 		VERIFY(zap_remove(spa->spa_meta_objset,
586990b4856Slling 		    spa->spa_pool_props_object,
587990b4856Slling 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
588990b4856Slling 		spa->spa_bootfs = 0;
589990b4856Slling 	}
590990b4856Slling }
591990b4856Slling 
592e9103aaeSGarrett D'Amore /*
593e9103aaeSGarrett D'Amore  * Change the GUID for the pool.  This is done so that we can later
594e9103aaeSGarrett D'Amore  * re-import a pool built from a clone of our own vdevs.  We will modify
595e9103aaeSGarrett D'Amore  * the root vdev's guid, our own pool guid, and then mark all of our
596e9103aaeSGarrett D'Amore  * vdevs dirty.  Note that we must make sure that all our vdevs are
597e9103aaeSGarrett D'Amore  * online when we do this, or else any vdevs that weren't present
598e9103aaeSGarrett D'Amore  * would be orphaned from our pool.  We are also going to issue a
599e9103aaeSGarrett D'Amore  * sysevent to update any watchers.
600e9103aaeSGarrett D'Amore  */
601e9103aaeSGarrett D'Amore int
602e9103aaeSGarrett D'Amore spa_change_guid(spa_t *spa)
603e9103aaeSGarrett D'Amore {
604e9103aaeSGarrett D'Amore 	uint64_t	oldguid, newguid;
605e9103aaeSGarrett D'Amore 	uint64_t	txg;
606e9103aaeSGarrett D'Amore 
607e9103aaeSGarrett D'Amore 	if (!(spa_mode_global & FWRITE))
608e9103aaeSGarrett D'Amore 		return (EROFS);
609e9103aaeSGarrett D'Amore 
610e9103aaeSGarrett D'Amore 	txg = spa_vdev_enter(spa);
611e9103aaeSGarrett D'Amore 
612e9103aaeSGarrett D'Amore 	if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
613e9103aaeSGarrett D'Amore 		return (spa_vdev_exit(spa, NULL, txg, ENXIO));
614e9103aaeSGarrett D'Amore 
615e9103aaeSGarrett D'Amore 	oldguid = spa_guid(spa);
616e9103aaeSGarrett D'Amore 	newguid = spa_generate_guid(NULL);
617e9103aaeSGarrett D'Amore 	ASSERT3U(oldguid, !=, newguid);
618e9103aaeSGarrett D'Amore 
619e9103aaeSGarrett D'Amore 	spa->spa_root_vdev->vdev_guid = newguid;
620e9103aaeSGarrett D'Amore 	spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
621e9103aaeSGarrett D'Amore 
622e9103aaeSGarrett D'Amore 	vdev_config_dirty(spa->spa_root_vdev);
623e9103aaeSGarrett D'Amore 
624e9103aaeSGarrett D'Amore 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
625e9103aaeSGarrett D'Amore 
626e9103aaeSGarrett D'Amore 	return (spa_vdev_exit(spa, NULL, txg, 0));
627e9103aaeSGarrett D'Amore }
628e9103aaeSGarrett D'Amore 
629fa9e4066Sahrens /*
630fa9e4066Sahrens  * ==========================================================================
631fa9e4066Sahrens  * SPA state manipulation (open/create/destroy/import/export)
632fa9e4066Sahrens  * ==========================================================================
633fa9e4066Sahrens  */
634fa9e4066Sahrens 
635ea8dc4b6Seschrock static int
636ea8dc4b6Seschrock spa_error_entry_compare(const void *a, const void *b)
637ea8dc4b6Seschrock {
638ea8dc4b6Seschrock 	spa_error_entry_t *sa = (spa_error_entry_t *)a;
639ea8dc4b6Seschrock 	spa_error_entry_t *sb = (spa_error_entry_t *)b;
640ea8dc4b6Seschrock 	int ret;
641ea8dc4b6Seschrock 
642ea8dc4b6Seschrock 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
643ea8dc4b6Seschrock 	    sizeof (zbookmark_t));
644ea8dc4b6Seschrock 
645ea8dc4b6Seschrock 	if (ret < 0)
646ea8dc4b6Seschrock 		return (-1);
647ea8dc4b6Seschrock 	else if (ret > 0)
648ea8dc4b6Seschrock 		return (1);
649ea8dc4b6Seschrock 	else
650ea8dc4b6Seschrock 		return (0);
651ea8dc4b6Seschrock }
652ea8dc4b6Seschrock 
653ea8dc4b6Seschrock /*
654ea8dc4b6Seschrock  * Utility function which retrieves copies of the current logs and
655ea8dc4b6Seschrock  * re-initializes them in the process.
656ea8dc4b6Seschrock  */
657ea8dc4b6Seschrock void
658ea8dc4b6Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
659ea8dc4b6Seschrock {
660ea8dc4b6Seschrock 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
661ea8dc4b6Seschrock 
662ea8dc4b6Seschrock 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
663ea8dc4b6Seschrock 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
664ea8dc4b6Seschrock 
665ea8dc4b6Seschrock 	avl_create(&spa->spa_errlist_scrub,
666ea8dc4b6Seschrock 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
667ea8dc4b6Seschrock 	    offsetof(spa_error_entry_t, se_avl));
668ea8dc4b6Seschrock 	avl_create(&spa->spa_errlist_last,
669ea8dc4b6Seschrock 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
670ea8dc4b6Seschrock 	    offsetof(spa_error_entry_t, se_avl));
671ea8dc4b6Seschrock }
672ea8dc4b6Seschrock 
/*
 * Create one zio taskq for the pool, sized according to the given zti
 * mode/value pair (see zti_modes_t).  Returns NULL for zti_mode_null.
 * When SDC scheduling is enabled and the pool has its own process, the
 * taskq is created under SDC with the configured base duty cycle;
 * otherwise a regular per-process taskq is created at maxclsyspri.
 */
static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		/* value is an absolute thread count; clamp to >= 1 */
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		/* cpu-intensive: size as a percentage of CPUs */
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		/* value is already a percentage of online CPUs */
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

	/*
	 * Use the SDC scheduling class only when the pool runs in its
	 * own process (spa_proc != &p0); batch taskqs additionally get
	 * TASKQ_DC_BATCH.
	 */
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}
71635a5a358SJonathan Adams 
/*
 * Instantiate one taskq per (zio type, taskq type) pair, as directed
 * by the global zio_taskqs table.  Slots with mode zti_mode_null end
 * up NULL (spa_taskq_create() returns NULL for them).
 */
71735a5a358SJonathan Adams static void
71835a5a358SJonathan Adams spa_create_zio_taskqs(spa_t *spa)
71935a5a358SJonathan Adams {
720e14bb325SJeff Bonwick 	for (int t = 0; t < ZIO_TYPES; t++) {
721e14bb325SJeff Bonwick 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
72280eb36f2SGeorge Wilson 			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
72380eb36f2SGeorge Wilson 			enum zti_modes mode = ztip->zti_mode;
72480eb36f2SGeorge Wilson 			uint_t value = ztip->zti_value;
7252e0c549eSJonathan Adams 			char name[32];
7262e0c549eSJonathan Adams 
			/* e.g. "write_issue", "read_intr" */
7272e0c549eSJonathan Adams 			(void) snprintf(name, sizeof (name),
72880eb36f2SGeorge Wilson 			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
7292e0c549eSJonathan Adams 
73035a5a358SJonathan Adams 			spa->spa_zio_taskq[t][q] =
73135a5a358SJonathan Adams 			    spa_taskq_create(spa, name, mode, value);
73235a5a358SJonathan Adams 		}
73335a5a358SJonathan Adams 	}
73435a5a358SJonathan Adams }
73535a5a358SJonathan Adams 
73635a5a358SJonathan Adams #ifdef _KERNEL
/*
 * Main function of the dedicated "zpool-<name>" process created by
 * spa_activate().  It names the process, optionally binds it to a
 * processor set and enters sysdc scheduling, creates the pool's zio
 * taskqs, then parks (CPR-safe) until spa_deactivate() requests
 * teardown, at which point it exits via lwp_exit().
 */
73735a5a358SJonathan Adams static void
73835a5a358SJonathan Adams spa_thread(void *arg)
73935a5a358SJonathan Adams {
74035a5a358SJonathan Adams 	callb_cpr_t cprinfo;
7412e0c549eSJonathan Adams 
74235a5a358SJonathan Adams 	spa_t *spa = arg;
74335a5a358SJonathan Adams 	user_t *pu = PTOU(curproc);
7442e0c549eSJonathan Adams 
	/* Register with the CPR (suspend/resume) callback framework. */
74535a5a358SJonathan Adams 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
74635a5a358SJonathan Adams 	    spa->spa_name);
7472e0c549eSJonathan Adams 
	/* Give the process a recognizable name for ps(1) et al. */
74835a5a358SJonathan Adams 	ASSERT(curproc != &p0);
74935a5a358SJonathan Adams 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
75035a5a358SJonathan Adams 	    "zpool-%s", spa->spa_name);
75135a5a358SJonathan Adams 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
7522e0c549eSJonathan Adams 
75335a5a358SJonathan Adams 	/* bind this thread to the requested psrset */
75435a5a358SJonathan Adams 	if (zio_taskq_psrset_bind != PS_NONE) {
		/* lock ordering required by cpupart_bind_thread() */
75535a5a358SJonathan Adams 		pool_lock();
75635a5a358SJonathan Adams 		mutex_enter(&cpu_lock);
75735a5a358SJonathan Adams 		mutex_enter(&pidlock);
75835a5a358SJonathan Adams 		mutex_enter(&curproc->p_lock);
75980eb36f2SGeorge Wilson 
76035a5a358SJonathan Adams 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
76135a5a358SJonathan Adams 		    0, NULL, NULL) == 0)  {
76235a5a358SJonathan Adams 			curthread->t_bind_pset = zio_taskq_psrset_bind;
76335a5a358SJonathan Adams 		} else {
			/* binding failure is non-fatal; just warn */
76435a5a358SJonathan Adams 			cmn_err(CE_WARN,
76535a5a358SJonathan Adams 			    "Couldn't bind process for zfs pool \"%s\" to "
76635a5a358SJonathan Adams 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
76735a5a358SJonathan Adams 		}
76835a5a358SJonathan Adams 
76935a5a358SJonathan Adams 		mutex_exit(&curproc->p_lock);
77035a5a358SJonathan Adams 		mutex_exit(&pidlock);
77135a5a358SJonathan Adams 		mutex_exit(&cpu_lock);
77235a5a358SJonathan Adams 		pool_unlock();
77335a5a358SJonathan Adams 	}
77435a5a358SJonathan Adams 
	/* Opt in to sysdc (duty-cycle) scheduling if enabled. */
77535a5a358SJonathan Adams 	if (zio_taskq_sysdc) {
77635a5a358SJonathan Adams 		sysdc_thread_enter(curthread, 100, 0);
77735a5a358SJonathan Adams 	}
77835a5a358SJonathan Adams 
	/* Publish our identity so spa_activate()/spa_deactivate() can see it. */
77935a5a358SJonathan Adams 	spa->spa_proc = curproc;
78035a5a358SJonathan Adams 	spa->spa_did = curthread->t_did;
78135a5a358SJonathan Adams 
78235a5a358SJonathan Adams 	spa_create_zio_taskqs(spa);
78335a5a358SJonathan Adams 
78435a5a358SJonathan Adams 	mutex_enter(&spa->spa_proc_lock);
78535a5a358SJonathan Adams 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
78635a5a358SJonathan Adams 
	/* Wake spa_activate(), which is waiting for CREATED -> ACTIVE. */
78735a5a358SJonathan Adams 	spa->spa_proc_state = SPA_PROC_ACTIVE;
78835a5a358SJonathan Adams 	cv_broadcast(&spa->spa_proc_cv);
78935a5a358SJonathan Adams 
	/* Park here until spa_deactivate() moves us to DEACTIVATE. */
79035a5a358SJonathan Adams 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
79135a5a358SJonathan Adams 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
79235a5a358SJonathan Adams 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
79335a5a358SJonathan Adams 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
79435a5a358SJonathan Adams 
	/* Acknowledge teardown: DEACTIVATE -> GONE, then exit the LWP. */
79535a5a358SJonathan Adams 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
79635a5a358SJonathan Adams 	spa->spa_proc_state = SPA_PROC_GONE;
79735a5a358SJonathan Adams 	spa->spa_proc = &p0;
79835a5a358SJonathan Adams 	cv_broadcast(&spa->spa_proc_cv);
79935a5a358SJonathan Adams 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
80035a5a358SJonathan Adams 
80135a5a358SJonathan Adams 	mutex_enter(&curproc->p_lock);
80235a5a358SJonathan Adams 	lwp_exit();
80335a5a358SJonathan Adams }
80435a5a358SJonathan Adams #endif
80535a5a358SJonathan Adams 
80635a5a358SJonathan Adams /*
80735a5a358SJonathan Adams  * Activate an uninitialized pool.
80835a5a358SJonathan Adams  */
/*
 * Activate an uninitialized pool: set up metaslab classes, optionally
 * spawn a dedicated "zpool-<name>" process (spa_thread) that owns the
 * zio taskqs, and initialize the dirty lists, txg list and errlists.
 */
80935a5a358SJonathan Adams static void
81035a5a358SJonathan Adams spa_activate(spa_t *spa, int mode)
81135a5a358SJonathan Adams {
81235a5a358SJonathan Adams 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
81335a5a358SJonathan Adams 
81435a5a358SJonathan Adams 	spa->spa_state = POOL_STATE_ACTIVE;
81535a5a358SJonathan Adams 	spa->spa_mode = mode;
81635a5a358SJonathan Adams 
81735a5a358SJonathan Adams 	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
81835a5a358SJonathan Adams 	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
81935a5a358SJonathan Adams 
82035a5a358SJonathan Adams 	/* Try to create a covering process */
82135a5a358SJonathan Adams 	mutex_enter(&spa->spa_proc_lock);
82235a5a358SJonathan Adams 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
82335a5a358SJonathan Adams 	ASSERT(spa->spa_proc == &p0);
82435a5a358SJonathan Adams 	spa->spa_did = 0;
82535a5a358SJonathan Adams 
82635a5a358SJonathan Adams 	/* Only create a process if we're going to be around a while. */
82735a5a358SJonathan Adams 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
82835a5a358SJonathan Adams 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
82935a5a358SJonathan Adams 		    NULL, 0) == 0) {
			/* Wait for spa_thread() to report CREATED -> ACTIVE. */
83035a5a358SJonathan Adams 			spa->spa_proc_state = SPA_PROC_CREATED;
83135a5a358SJonathan Adams 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
83235a5a358SJonathan Adams 				cv_wait(&spa->spa_proc_cv,
83335a5a358SJonathan Adams 				    &spa->spa_proc_lock);
8342e0c549eSJonathan Adams 			}
83535a5a358SJonathan Adams 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
83635a5a358SJonathan Adams 			ASSERT(spa->spa_proc != &p0);
83735a5a358SJonathan Adams 			ASSERT(spa->spa_did != 0);
83835a5a358SJonathan Adams 		} else {
			/* newproc() failure is non-fatal; fall back below */
83935a5a358SJonathan Adams #ifdef _KERNEL
84035a5a358SJonathan Adams 			cmn_err(CE_WARN,
84135a5a358SJonathan Adams 			    "Couldn't create process for zfs pool \"%s\"\n",
84235a5a358SJonathan Adams 			    spa->spa_name);
84335a5a358SJonathan Adams #endif
844e14bb325SJeff Bonwick 		}
845fa9e4066Sahrens 	}
84635a5a358SJonathan Adams 	mutex_exit(&spa->spa_proc_lock);
84735a5a358SJonathan Adams 
84835a5a358SJonathan Adams 	/* If we didn't create a process, we need to create our taskqs. */
84935a5a358SJonathan Adams 	if (spa->spa_proc == &p0) {
85035a5a358SJonathan Adams 		spa_create_zio_taskqs(spa);
85135a5a358SJonathan Adams 	}
852fa9e4066Sahrens 
853e14bb325SJeff Bonwick 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
854e14bb325SJeff Bonwick 	    offsetof(vdev_t, vdev_config_dirty_node));
855e14bb325SJeff Bonwick 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
856e14bb325SJeff Bonwick 	    offsetof(vdev_t, vdev_state_dirty_node));
857fa9e4066Sahrens 
858fa9e4066Sahrens 	txg_list_create(&spa->spa_vdev_txg_list,
859fa9e4066Sahrens 	    offsetof(struct vdev, vdev_txg_node));
860ea8dc4b6Seschrock 
	/* Empty error trees; spa_get_errlists() swaps these out later. */
861ea8dc4b6Seschrock 	avl_create(&spa->spa_errlist_scrub,
862ea8dc4b6Seschrock 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
863ea8dc4b6Seschrock 	    offsetof(spa_error_entry_t, se_avl));
864ea8dc4b6Seschrock 	avl_create(&spa->spa_errlist_last,
865ea8dc4b6Seschrock 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
866ea8dc4b6Seschrock 	    offsetof(spa_error_entry_t, se_avl));
867fa9e4066Sahrens }
868fa9e4066Sahrens 
869fa9e4066Sahrens /*
870fa9e4066Sahrens  * Opposite of spa_activate().
871fa9e4066Sahrens  */
872fa9e4066Sahrens static void
873fa9e4066Sahrens spa_deactivate(spa_t *spa)
874fa9e4066Sahrens {
	/* The pool must already be unloaded (no sync, no DSL, no vdevs). */
875fa9e4066Sahrens 	ASSERT(spa->spa_sync_on == B_FALSE);
876fa9e4066Sahrens 	ASSERT(spa->spa_dsl_pool == NULL);
877fa9e4066Sahrens 	ASSERT(spa->spa_root_vdev == NULL);
87825f89ee2SJeff Bonwick 	ASSERT(spa->spa_async_zio_root == NULL);
879fa9e4066Sahrens 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
880fa9e4066Sahrens 
881fa9e4066Sahrens 	txg_list_destroy(&spa->spa_vdev_txg_list);
882fa9e4066Sahrens 
883e14bb325SJeff Bonwick 	list_destroy(&spa->spa_config_dirty_list);
884e14bb325SJeff Bonwick 	list_destroy(&spa->spa_state_dirty_list);
885fa9e4066Sahrens 
	/* Tear down every taskq created by spa_create_zio_taskqs(). */
886e14bb325SJeff Bonwick 	for (int t = 0; t < ZIO_TYPES; t++) {
887e14bb325SJeff Bonwick 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
88880eb36f2SGeorge Wilson 			if (spa->spa_zio_taskq[t][q] != NULL)
88980eb36f2SGeorge Wilson 				taskq_destroy(spa->spa_zio_taskq[t][q]);
890e14bb325SJeff Bonwick 			spa->spa_zio_taskq[t][q] = NULL;
891e14bb325SJeff Bonwick 		}
892fa9e4066Sahrens 	}
893fa9e4066Sahrens 
894fa9e4066Sahrens 	metaslab_class_destroy(spa->spa_normal_class);
895fa9e4066Sahrens 	spa->spa_normal_class = NULL;
896fa9e4066Sahrens 
8978654d025Sperrin 	metaslab_class_destroy(spa->spa_log_class);
8988654d025Sperrin 	spa->spa_log_class = NULL;
8998654d025Sperrin 
900ea8dc4b6Seschrock 	/*
901ea8dc4b6Seschrock 	 * If this was part of an import or the open otherwise failed, we may
902ea8dc4b6Seschrock 	 * still have errors left in the queues.  Empty them just in case.
903ea8dc4b6Seschrock 	 */
904ea8dc4b6Seschrock 	spa_errlog_drain(spa);
905ea8dc4b6Seschrock 
906ea8dc4b6Seschrock 	avl_destroy(&spa->spa_errlist_scrub);
907ea8dc4b6Seschrock 	avl_destroy(&spa->spa_errlist_last);
908ea8dc4b6Seschrock 
909fa9e4066Sahrens 	spa->spa_state = POOL_STATE_UNINITIALIZED;
91035a5a358SJonathan Adams 
	/*
	 * If spa_thread() is running, ask it to exit (ACTIVE ->
	 * DEACTIVATE) and wait until it acknowledges with GONE.
	 */
91135a5a358SJonathan Adams 	mutex_enter(&spa->spa_proc_lock);
91235a5a358SJonathan Adams 	if (spa->spa_proc_state != SPA_PROC_NONE) {
91335a5a358SJonathan Adams 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
91435a5a358SJonathan Adams 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
91535a5a358SJonathan Adams 		cv_broadcast(&spa->spa_proc_cv);
91635a5a358SJonathan Adams 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
91735a5a358SJonathan Adams 			ASSERT(spa->spa_proc != &p0);
91835a5a358SJonathan Adams 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
91935a5a358SJonathan Adams 		}
92035a5a358SJonathan Adams 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
92135a5a358SJonathan Adams 		spa->spa_proc_state = SPA_PROC_NONE;
92235a5a358SJonathan Adams 	}
92335a5a358SJonathan Adams 	ASSERT(spa->spa_proc == &p0);
92435a5a358SJonathan Adams 	mutex_exit(&spa->spa_proc_lock);
92535a5a358SJonathan Adams 
92635a5a358SJonathan Adams 	/*
92735a5a358SJonathan Adams 	 * We want to make sure spa_thread() has actually exited the ZFS
92835a5a358SJonathan Adams 	 * module, so that the module can't be unloaded out from underneath
92935a5a358SJonathan Adams 	 * it.
93035a5a358SJonathan Adams 	 */
93135a5a358SJonathan Adams 	if (spa->spa_did != 0) {
93235a5a358SJonathan Adams 		thread_join(spa->spa_did);
93335a5a358SJonathan Adams 		spa->spa_did = 0;
93435a5a358SJonathan Adams 	}
935fa9e4066Sahrens }
936fa9e4066Sahrens 
937fa9e4066Sahrens /*
938fa9e4066Sahrens  * Verify a pool configuration, and construct the vdev tree appropriately.  This
939fa9e4066Sahrens  * will create all the necessary vdevs in the appropriate layout, with each vdev
940fa9e4066Sahrens  * in the CLOSED state.  This will prep the pool before open/creation/import.
941fa9e4066Sahrens  * All vdev validation is done by the vdev_alloc() routine.
942fa9e4066Sahrens  */
94399653d4eSeschrock static int
94499653d4eSeschrock spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
94599653d4eSeschrock     uint_t id, int atype)
946fa9e4066Sahrens {
947fa9e4066Sahrens 	nvlist_t **child;
948573ca77eSGeorge Wilson 	uint_t children;
94999653d4eSeschrock 	int error;
950fa9e4066Sahrens 
	/* Allocate this vdev; on success *vdp is the new (CLOSED) vdev. */
95199653d4eSeschrock 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
95299653d4eSeschrock 		return (error);
953fa9e4066Sahrens 
	/* Leaf vdevs have no children to recurse into. */
95499653d4eSeschrock 	if ((*vdp)->vdev_ops->vdev_op_leaf)
95599653d4eSeschrock 		return (0);
956fa9e4066Sahrens 
957e14bb325SJeff Bonwick 	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
958e14bb325SJeff Bonwick 	    &child, &children);
959e14bb325SJeff Bonwick 
	/* An interior vdev with no children array is acceptable. */
960e14bb325SJeff Bonwick 	if (error == ENOENT)
961e14bb325SJeff Bonwick 		return (0);
962e14bb325SJeff Bonwick 
	/* Any other lookup failure: free the partial tree and bail. */
963e14bb325SJeff Bonwick 	if (error) {
96499653d4eSeschrock 		vdev_free(*vdp);
96599653d4eSeschrock 		*vdp = NULL;
96699653d4eSeschrock 		return (EINVAL);
967fa9e4066Sahrens 	}
968fa9e4066Sahrens 
	/* Recurse; vdev_free(*vdp) also frees children added so far. */
969573ca77eSGeorge Wilson 	for (int c = 0; c < children; c++) {
97099653d4eSeschrock 		vdev_t *vd;
97199653d4eSeschrock 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
97299653d4eSeschrock 		    atype)) != 0) {
97399653d4eSeschrock 			vdev_free(*vdp);
97499653d4eSeschrock 			*vdp = NULL;
97599653d4eSeschrock 			return (error);
976fa9e4066Sahrens 		}
977fa9e4066Sahrens 	}
978fa9e4066Sahrens 
97999653d4eSeschrock 	ASSERT(*vdp != NULL);
98099653d4eSeschrock 
98199653d4eSeschrock 	return (0);
982fa9e4066Sahrens }
983fa9e4066Sahrens 
984fa9e4066Sahrens /*
985fa9e4066Sahrens  * Opposite of spa_load().
986fa9e4066Sahrens  */
987fa9e4066Sahrens static void
988fa9e4066Sahrens spa_unload(spa_t *spa)
989fa9e4066Sahrens {
99099653d4eSeschrock 	int i;
99199653d4eSeschrock 
992e14bb325SJeff Bonwick 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
993e14bb325SJeff Bonwick 
994ea8dc4b6Seschrock 	/*
995ea8dc4b6Seschrock 	 * Stop async tasks.
996ea8dc4b6Seschrock 	 */
997ea8dc4b6Seschrock 	spa_async_suspend(spa);
998ea8dc4b6Seschrock 
999fa9e4066Sahrens 	/*
1000fa9e4066Sahrens 	 * Stop syncing.
1001fa9e4066Sahrens 	 */
1002fa9e4066Sahrens 	if (spa->spa_sync_on) {
1003fa9e4066Sahrens 		txg_sync_stop(spa->spa_dsl_pool);
1004fa9e4066Sahrens 		spa->spa_sync_on = B_FALSE;
1005fa9e4066Sahrens 	}
1006fa9e4066Sahrens 
1007fa9e4066Sahrens 	/*
1008e14bb325SJeff Bonwick 	 * Wait for any outstanding async I/O to complete.
1009fa9e4066Sahrens 	 */
101054d692b7SGeorge Wilson 	if (spa->spa_async_zio_root != NULL) {
101154d692b7SGeorge Wilson 		(void) zio_wait(spa->spa_async_zio_root);
101254d692b7SGeorge Wilson 		spa->spa_async_zio_root = NULL;
101354d692b7SGeorge Wilson 	}
1014fa9e4066Sahrens 
1015cde58dbcSMatthew Ahrens 	bpobj_close(&spa->spa_deferred_bpobj);
1016cde58dbcSMatthew Ahrens 
1017fa9e4066Sahrens 	/*
1018fa9e4066Sahrens 	 * Close the dsl pool.
1019fa9e4066Sahrens 	 */
1020fa9e4066Sahrens 	if (spa->spa_dsl_pool) {
1021fa9e4066Sahrens 		dsl_pool_close(spa->spa_dsl_pool);
1022fa9e4066Sahrens 		spa->spa_dsl_pool = NULL;
1023afee20e4SGeorge Wilson 		spa->spa_meta_objset = NULL;
1024fa9e4066Sahrens 	}
1025fa9e4066Sahrens 
1026b24ab676SJeff Bonwick 	ddt_unload(spa);
1027b24ab676SJeff Bonwick 
	/* Everything below manipulates the vdev tree: take all locks. */
10288ad4d6ddSJeff Bonwick 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
10298ad4d6ddSJeff Bonwick 
10308ad4d6ddSJeff Bonwick 	/*
10318ad4d6ddSJeff Bonwick 	 * Drop and purge level 2 cache
10328ad4d6ddSJeff Bonwick 	 */
10338ad4d6ddSJeff Bonwick 	spa_l2cache_drop(spa);
10348ad4d6ddSJeff Bonwick 
1035fa9e4066Sahrens 	/*
1036fa9e4066Sahrens 	 * Close all vdevs.
1037fa9e4066Sahrens 	 */
10380e34b6a7Sbonwick 	if (spa->spa_root_vdev)
1039fa9e4066Sahrens 		vdev_free(spa->spa_root_vdev);
	/* vdev_free() of the root clears spa_root_vdev as a side effect. */
10400e34b6a7Sbonwick 	ASSERT(spa->spa_root_vdev == NULL);
1041ea8dc4b6Seschrock 
	/* Free the spare vdevs, their pointer array, and cached config. */
1042fa94a07fSbrendan 	for (i = 0; i < spa->spa_spares.sav_count; i++)
1043fa94a07fSbrendan 		vdev_free(spa->spa_spares.sav_vdevs[i]);
1044fa94a07fSbrendan 	if (spa->spa_spares.sav_vdevs) {
1045fa94a07fSbrendan 		kmem_free(spa->spa_spares.sav_vdevs,
1046fa94a07fSbrendan 		    spa->spa_spares.sav_count * sizeof (void *));
1047fa94a07fSbrendan 		spa->spa_spares.sav_vdevs = NULL;
104899653d4eSeschrock 	}
1049fa94a07fSbrendan 	if (spa->spa_spares.sav_config) {
1050fa94a07fSbrendan 		nvlist_free(spa->spa_spares.sav_config);
1051fa94a07fSbrendan 		spa->spa_spares.sav_config = NULL;
1052fa94a07fSbrendan 	}
10532ce8af81SEric Schrock 	spa->spa_spares.sav_count = 0;
1054fa94a07fSbrendan 
	/* Same teardown for the l2cache device list. */
1055fa94a07fSbrendan 	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
1056fa94a07fSbrendan 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1057fa94a07fSbrendan 	if (spa->spa_l2cache.sav_vdevs) {
1058fa94a07fSbrendan 		kmem_free(spa->spa_l2cache.sav_vdevs,
1059fa94a07fSbrendan 		    spa->spa_l2cache.sav_count * sizeof (void *));
1060fa94a07fSbrendan 		spa->spa_l2cache.sav_vdevs = NULL;
1061fa94a07fSbrendan 	}
1062fa94a07fSbrendan 	if (spa->spa_l2cache.sav_config) {
1063fa94a07fSbrendan 		nvlist_free(spa->spa_l2cache.sav_config);
1064fa94a07fSbrendan 		spa->spa_l2cache.sav_config = NULL;
106599653d4eSeschrock 	}
10662ce8af81SEric Schrock 	spa->spa_l2cache.sav_count = 0;
106799653d4eSeschrock 
1068ea8dc4b6Seschrock 	spa->spa_async_suspended = 0;
10698ad4d6ddSJeff Bonwick 
1070*8704186eSDan McDonald 	if (spa->spa_comment != NULL) {
1071*8704186eSDan McDonald 		spa_strfree(spa->spa_comment);
1072*8704186eSDan McDonald 		spa->spa_comment = NULL;
1073*8704186eSDan McDonald 	}
1074*8704186eSDan McDonald 
10758ad4d6ddSJeff Bonwick 	spa_config_exit(spa, SCL_ALL, FTAG);
1076fa9e4066Sahrens }
1077fa9e4066Sahrens 
107899653d4eSeschrock /*
107999653d4eSeschrock  * Load (or re-load) the current list of vdevs describing the active spares for
108099653d4eSeschrock  * this pool.  When this is called, we have some form of basic information in
1081fa94a07fSbrendan  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
1082fa94a07fSbrendan  * then re-generate a more complete list including status information.
108399653d4eSeschrock  */
108499653d4eSeschrock static void
108599653d4eSeschrock spa_load_spares(spa_t *spa)
108699653d4eSeschrock {
108799653d4eSeschrock 	nvlist_t **spares;
108899653d4eSeschrock 	uint_t nspares;
108999653d4eSeschrock 	int i;
109039c23413Seschrock 	vdev_t *vd, *tvd;
109199653d4eSeschrock 
	/* All config locks must be held as writer to rebuild the list. */
1092e14bb325SJeff Bonwick 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1093e14bb325SJeff Bonwick 
109499653d4eSeschrock 	/*
109599653d4eSeschrock 	 * First, close and free any existing spare vdevs.
109699653d4eSeschrock 	 */
1097fa94a07fSbrendan 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1098fa94a07fSbrendan 		vd = spa->spa_spares.sav_vdevs[i];
109939c23413Seschrock 
110039c23413Seschrock 		/* Undo the call to spa_activate() below */
1101c5904d13Seschrock 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1102c5904d13Seschrock 		    B_FALSE)) != NULL && tvd->vdev_isspare)
110339c23413Seschrock 			spa_spare_remove(tvd);
110439c23413Seschrock 		vdev_close(vd);
110539c23413Seschrock 		vdev_free(vd);
110699653d4eSeschrock 	}
110739c23413Seschrock 
1108fa94a07fSbrendan 	if (spa->spa_spares.sav_vdevs)
1109fa94a07fSbrendan 		kmem_free(spa->spa_spares.sav_vdevs,
1110fa94a07fSbrendan 		    spa->spa_spares.sav_count * sizeof (void *));
111199653d4eSeschrock 
	/*
	 * Pull the spare list out of the cached config; 'spares' points
	 * into sav_config, so it stays valid for the duration below.
	 */
1112fa94a07fSbrendan 	if (spa->spa_spares.sav_config == NULL)
111399653d4eSeschrock 		nspares = 0;
111499653d4eSeschrock 	else
1115fa94a07fSbrendan 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
111699653d4eSeschrock 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
111799653d4eSeschrock 
1118fa94a07fSbrendan 	spa->spa_spares.sav_count = (int)nspares;
1119fa94a07fSbrendan 	spa->spa_spares.sav_vdevs = NULL;
112099653d4eSeschrock 
112199653d4eSeschrock 	if (nspares == 0)
112299653d4eSeschrock 		return;
112399653d4eSeschrock 
112499653d4eSeschrock 	/*
112599653d4eSeschrock 	 * Construct the array of vdevs, opening them to get status in the
112639c23413Seschrock 	 * process.   For each spare, there is potentially two different vdev_t
112739c23413Seschrock 	 * structures associated with it: one in the list of spares (used only
112839c23413Seschrock 	 * for basic validation purposes) and one in the active vdev
112939c23413Seschrock 	 * configuration (if it's spared in).  During this phase we open and
113039c23413Seschrock 	 * validate each vdev on the spare list.  If the vdev also exists in the
113139c23413Seschrock 	 * active configuration, then we also mark this vdev as an active spare.
113299653d4eSeschrock 	 */
1133fa94a07fSbrendan 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1134fa94a07fSbrendan 	    KM_SLEEP);
1135fa94a07fSbrendan 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
113699653d4eSeschrock 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
113799653d4eSeschrock 		    VDEV_ALLOC_SPARE) == 0);
113899653d4eSeschrock 		ASSERT(vd != NULL);
113999653d4eSeschrock 
1140fa94a07fSbrendan 		spa->spa_spares.sav_vdevs[i] = vd;
114199653d4eSeschrock 
1142c5904d13Seschrock 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1143c5904d13Seschrock 		    B_FALSE)) != NULL) {
114439c23413Seschrock 			if (!tvd->vdev_isspare)
114539c23413Seschrock 				spa_spare_add(tvd);
114639c23413Seschrock 
114739c23413Seschrock 			/*
114839c23413Seschrock 			 * We only mark the spare active if we were successfully
114939c23413Seschrock 			 * able to load the vdev.  Otherwise, importing a pool
115039c23413Seschrock 			 * with a bad active spare would result in strange
115139c23413Seschrock 			 * behavior, because multiple pool would think the spare
115239c23413Seschrock 			 * is actively in use.
115339c23413Seschrock 			 *
115439c23413Seschrock 			 * There is a vulnerability here to an equally bizarre
115539c23413Seschrock 			 * circumstance, where a dead active spare is later
115639c23413Seschrock 			 * brought back to life (onlined or otherwise).  Given
115739c23413Seschrock 			 * the rarity of this scenario, and the extra complexity
115839c23413Seschrock 			 * it adds, we ignore the possibility.
115939c23413Seschrock 			 */
116039c23413Seschrock 			if (!vdev_is_dead(tvd))
116139c23413Seschrock 				spa_spare_activate(tvd);
116239c23413Seschrock 		}
116339c23413Seschrock 
1164e14bb325SJeff Bonwick 		vd->vdev_top = vd;
11656809eb4eSEric Schrock 		vd->vdev_aux = &spa->spa_spares;
1166e14bb325SJeff Bonwick 
		/* A spare that fails to open stays in the list, unregistered. */
116799653d4eSeschrock 		if (vdev_open(vd) != 0)
116899653d4eSeschrock 			continue;
116999653d4eSeschrock 
1170fa94a07fSbrendan 		if (vdev_validate_aux(vd) == 0)
1171fa94a07fSbrendan 			spa_spare_add(vd);
117299653d4eSeschrock 	}
117399653d4eSeschrock 
117499653d4eSeschrock 	/*
117599653d4eSeschrock 	 * Recompute the stashed list of spares, with status information
117699653d4eSeschrock 	 * this time.
117799653d4eSeschrock 	 */
1178fa94a07fSbrendan 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
117999653d4eSeschrock 	    DATA_TYPE_NVLIST_ARRAY) == 0);
118099653d4eSeschrock 
1181fa94a07fSbrendan 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1182fa94a07fSbrendan 	    KM_SLEEP);
1183fa94a07fSbrendan 	for (i = 0; i < spa->spa_spares.sav_count; i++)
1184fa94a07fSbrendan 		spares[i] = vdev_config_generate(spa,
11853f9d6ad7SLin Ling 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1186fa94a07fSbrendan 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1187fa94a07fSbrendan 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	/* nvlist_add_nvlist_array() copied these, so free our temporaries. */
1188fa94a07fSbrendan 	for (i = 0; i < spa->spa_spares.sav_count; i++)
118999653d4eSeschrock 		nvlist_free(spares[i]);
1190fa94a07fSbrendan 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1191fa94a07fSbrendan }
1192fa94a07fSbrendan 
1193fa94a07fSbrendan /*
1194fa94a07fSbrendan  * Load (or re-load) the current list of vdevs describing the active l2cache for
1195fa94a07fSbrendan  * this pool.  When this is called, we have some form of basic information in
1196fa94a07fSbrendan  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1197fa94a07fSbrendan  * then re-generate a more complete list including status information.
1198fa94a07fSbrendan  * Devices which are already active have their details maintained, and are
1199fa94a07fSbrendan  * not re-opened.
1200fa94a07fSbrendan  */
1201fa94a07fSbrendan static void
1202fa94a07fSbrendan spa_load_l2cache(spa_t *spa)
1203fa94a07fSbrendan {
1204fa94a07fSbrendan 	nvlist_t **l2cache;
1205fa94a07fSbrendan 	uint_t nl2cache;
1206fa94a07fSbrendan 	int i, j, oldnvdevs;
1207573ca77eSGeorge Wilson 	uint64_t guid;
1208fa94a07fSbrendan 	vdev_t *vd, **oldvdevs, **newvdevs;
1209fa94a07fSbrendan 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1210fa94a07fSbrendan 
	/* All config locks must be held as writer to rebuild the list. */
1211e14bb325SJeff Bonwick 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1212e14bb325SJeff Bonwick 
1213fa94a07fSbrendan 	if (sav->sav_config != NULL) {
1214fa94a07fSbrendan 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1215fa94a07fSbrendan 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1216fa94a07fSbrendan 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1217fa94a07fSbrendan 	} else {
1218fa94a07fSbrendan 		nl2cache = 0;
1219fa94a07fSbrendan 	}
1220fa94a07fSbrendan 
	/* Detach the old list; matched entries migrate to newvdevs. */
1221fa94a07fSbrendan 	oldvdevs = sav->sav_vdevs;
1222fa94a07fSbrendan 	oldnvdevs = sav->sav_count;
1223fa94a07fSbrendan 	sav->sav_vdevs = NULL;
1224fa94a07fSbrendan 	sav->sav_count = 0;
1225fa94a07fSbrendan 
1226fa94a07fSbrendan 	/*
1227fa94a07fSbrendan 	 * Process new nvlist of vdevs.
1228fa94a07fSbrendan 	 */
1229fa94a07fSbrendan 	for (i = 0; i < nl2cache; i++) {
1230fa94a07fSbrendan 		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1231fa94a07fSbrendan 		    &guid) == 0);
1232fa94a07fSbrendan 
1233fa94a07fSbrendan 		newvdevs[i] = NULL;
1234fa94a07fSbrendan 		for (j = 0; j < oldnvdevs; j++) {
1235fa94a07fSbrendan 			vd = oldvdevs[j];
1236fa94a07fSbrendan 			if (vd != NULL && guid == vd->vdev_guid) {
1237fa94a07fSbrendan 				/*
1238fa94a07fSbrendan 				 * Retain previous vdev for add/remove ops.
1239fa94a07fSbrendan 				 */
1240fa94a07fSbrendan 				newvdevs[i] = vd;
				/* NULL it so the purge loop below skips it */
1241fa94a07fSbrendan 				oldvdevs[j] = NULL;
1242fa94a07fSbrendan 				break;
1243fa94a07fSbrendan 			}
1244fa94a07fSbrendan 		}
1245fa94a07fSbrendan 
1246fa94a07fSbrendan 		if (newvdevs[i] == NULL) {
1247fa94a07fSbrendan 			/*
1248fa94a07fSbrendan 			 * Create new vdev
1249fa94a07fSbrendan 			 */
1250fa94a07fSbrendan 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1251fa94a07fSbrendan 			    VDEV_ALLOC_L2CACHE) == 0);
1252fa94a07fSbrendan 			ASSERT(vd != NULL);
1253fa94a07fSbrendan 			newvdevs[i] = vd;
1254fa94a07fSbrendan 
1255fa94a07fSbrendan 			/*
1256fa94a07fSbrendan 			 * Commit this vdev as an l2cache device,
1257fa94a07fSbrendan 			 * even if it fails to open.
1258fa94a07fSbrendan 			 */
1259fa94a07fSbrendan 			spa_l2cache_add(vd);
1260fa94a07fSbrendan 
1261c5904d13Seschrock 			vd->vdev_top = vd;
1262c5904d13Seschrock 			vd->vdev_aux = sav;
1263c5904d13Seschrock 
1264c5904d13Seschrock 			spa_l2cache_activate(vd);
1265c5904d13Seschrock 
1266fa94a07fSbrendan 			if (vdev_open(vd) != 0)
1267fa94a07fSbrendan 				continue;
1268fa94a07fSbrendan 
1269fa94a07fSbrendan 			(void) vdev_validate_aux(vd);
1270fa94a07fSbrendan 
			/* Register healthy devices with the L2ARC proper. */
1271573ca77eSGeorge Wilson 			if (!vdev_is_dead(vd))
1272573ca77eSGeorge Wilson 				l2arc_add_vdev(spa, vd);
1273fa94a07fSbrendan 		}
1274fa94a07fSbrendan 	}
1275fa94a07fSbrendan 
1276fa94a07fSbrendan 	/*
1277fa94a07fSbrendan 	 * Purge vdevs that were dropped
1278fa94a07fSbrendan 	 */
1279fa94a07fSbrendan 	for (i = 0; i < oldnvdevs; i++) {
1280fa94a07fSbrendan 		uint64_t pool;
1281fa94a07fSbrendan 
1282fa94a07fSbrendan 		vd = oldvdevs[i];
1283fa94a07fSbrendan 		if (vd != NULL) {
12848ad4d6ddSJeff Bonwick 			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
12858ad4d6ddSJeff Bonwick 			    pool != 0ULL && l2arc_vdev_present(vd))
1286fa94a07fSbrendan 				l2arc_remove_vdev(vd);
1287fa94a07fSbrendan 			(void) vdev_close(vd);
1288fa94a07fSbrendan 			spa_l2cache_remove(vd);
1289fa94a07fSbrendan 		}
1290fa94a07fSbrendan 	}
1291fa94a07fSbrendan 
1292fa94a07fSbrendan 	if (oldvdevs)
1293fa94a07fSbrendan 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1294fa94a07fSbrendan 
	/* No cached config: nothing to regenerate (sav_count is 0). */
1295fa94a07fSbrendan 	if (sav->sav_config == NULL)
1296fa94a07fSbrendan 		goto out;
1297fa94a07fSbrendan 
1298fa94a07fSbrendan 	sav->sav_vdevs = newvdevs;
1299fa94a07fSbrendan 	sav->sav_count = (int)nl2cache;
1300fa94a07fSbrendan 
1301fa94a07fSbrendan 	/*
1302fa94a07fSbrendan 	 * Recompute the stashed list of l2cache devices, with status
1303fa94a07fSbrendan 	 * information this time.
1304fa94a07fSbrendan 	 */
1305fa94a07fSbrendan 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1306fa94a07fSbrendan 	    DATA_TYPE_NVLIST_ARRAY) == 0);
1307fa94a07fSbrendan 
1308fa94a07fSbrendan 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1309fa94a07fSbrendan 	for (i = 0; i < sav->sav_count; i++)
1310fa94a07fSbrendan 		l2cache[i] = vdev_config_generate(spa,
13113f9d6ad7SLin Ling 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1312fa94a07fSbrendan 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1313fa94a07fSbrendan 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1314fa94a07fSbrendan out:
	/* On the goto path sav_count is 0, so these loops are no-ops. */
1315fa94a07fSbrendan 	for (i = 0; i < sav->sav_count; i++)
1316fa94a07fSbrendan 		nvlist_free(l2cache[i]);
1317fa94a07fSbrendan 	if (sav->sav_count)
1318fa94a07fSbrendan 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
1319fa94a07fSbrendan }
132099653d4eSeschrock 
/*
 * Read a packed nvlist from the MOS object 'obj' (its bonus buffer
 * holds the packed size) and unpack it into *value.  On success the
 * caller owns *value and must nvlist_free() it; on failure *value is
 * left NULL and the dmu/nvlist error is returned.
 */
132199653d4eSeschrock static int
132299653d4eSeschrock load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
132399653d4eSeschrock {
132499653d4eSeschrock 	dmu_buf_t *db;
132599653d4eSeschrock 	char *packed = NULL;
132699653d4eSeschrock 	size_t nvsize = 0;
132799653d4eSeschrock 	int error;
132899653d4eSeschrock 	*value = NULL;
132999653d4eSeschrock 
	/* The object's bonus buffer holds the packed nvlist's size. */
133099653d4eSeschrock 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
133199653d4eSeschrock 	nvsize = *(uint64_t *)db->db_data;
133299653d4eSeschrock 	dmu_buf_rele(db, FTAG);
133399653d4eSeschrock 
133499653d4eSeschrock 	packed = kmem_alloc(nvsize, KM_SLEEP);
13357bfdf011SNeil Perrin 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
13367bfdf011SNeil Perrin 	    DMU_READ_PREFETCH);
133799653d4eSeschrock 	if (error == 0)
133899653d4eSeschrock 		error = nvlist_unpack(packed, nvsize, value, 0);
133999653d4eSeschrock 	kmem_free(packed, nvsize);
134099653d4eSeschrock 
134199653d4eSeschrock 	return (error);
134299653d4eSeschrock }
134399653d4eSeschrock 
13443d7072f8Seschrock /*
13453d7072f8Seschrock  * Checks to see if the given vdev could not be opened, in which case we post a
13463d7072f8Seschrock  * sysevent to notify the autoreplace code that the device has been removed.
13473d7072f8Seschrock  */
13483d7072f8Seschrock static void
13493d7072f8Seschrock spa_check_removed(vdev_t *vd)
13503d7072f8Seschrock {
1351573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++)
13523d7072f8Seschrock 		spa_check_removed(vd->vdev_child[c]);
13533d7072f8Seschrock 
13543d7072f8Seschrock 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
13553d7072f8Seschrock 		zfs_post_autoreplace(vd->vdev_spa, vd);
13563d7072f8Seschrock 		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
13573d7072f8Seschrock 	}
13583d7072f8Seschrock }
13593d7072f8Seschrock 
1360e6ca193dSGeorge Wilson /*
13614b964adaSGeorge Wilson  * Validate the current config against the MOS config
1362e6ca193dSGeorge Wilson  */
13634b964adaSGeorge Wilson static boolean_t
13644b964adaSGeorge Wilson spa_config_valid(spa_t *spa, nvlist_t *config)
1365e6ca193dSGeorge Wilson {
13664b964adaSGeorge Wilson 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
13674b964adaSGeorge Wilson 	nvlist_t *nv;
13684b964adaSGeorge Wilson 
13694b964adaSGeorge Wilson 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
13704b964adaSGeorge Wilson 
13714b964adaSGeorge Wilson 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
13724b964adaSGeorge Wilson 	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
13734b964adaSGeorge Wilson 
13744b964adaSGeorge Wilson 	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1375e6ca193dSGeorge Wilson 
137688ecc943SGeorge Wilson 	/*
13774b964adaSGeorge Wilson 	 * If we're doing a normal import, then build up any additional
13784b964adaSGeorge Wilson 	 * diagnostic information about missing devices in this config.
13794b964adaSGeorge Wilson 	 * We'll pass this up to the user for further processing.
138088ecc943SGeorge Wilson 	 */
13814b964adaSGeorge Wilson 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
13824b964adaSGeorge Wilson 		nvlist_t **child, *nv;
13834b964adaSGeorge Wilson 		uint64_t idx = 0;
13844b964adaSGeorge Wilson 
13854b964adaSGeorge Wilson 		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
13864b964adaSGeorge Wilson 		    KM_SLEEP);
13874b964adaSGeorge Wilson 		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1388e6ca193dSGeorge Wilson 
13894b964adaSGeorge Wilson 		for (int c = 0; c < rvd->vdev_children; c++) {
13904b964adaSGeorge Wilson 			vdev_t *tvd = rvd->vdev_child[c];
13914b964adaSGeorge Wilson 			vdev_t *mtvd  = mrvd->vdev_child[c];
13924b964adaSGeorge Wilson 
13934b964adaSGeorge Wilson 			if (tvd->vdev_ops == &vdev_missing_ops &&
13944b964adaSGeorge Wilson 			    mtvd->vdev_ops != &vdev_missing_ops &&
13954b964adaSGeorge Wilson 			    mtvd->vdev_islog)
13964b964adaSGeorge Wilson 				child[idx++] = vdev_config_generate(spa, mtvd,
13974b964adaSGeorge Wilson 				    B_FALSE, 0);
13984b964adaSGeorge Wilson 		}
13994b964adaSGeorge Wilson 
14004b964adaSGeorge Wilson 		if (idx) {
14014b964adaSGeorge Wilson 			VERIFY(nvlist_add_nvlist_array(nv,
14024b964adaSGeorge Wilson 			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
14034b964adaSGeorge Wilson 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
14044b964adaSGeorge Wilson 			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
14054b964adaSGeorge Wilson 
14064b964adaSGeorge Wilson 			for (int i = 0; i < idx; i++)
14074b964adaSGeorge Wilson 				nvlist_free(child[i]);
14084b964adaSGeorge Wilson 		}
14094b964adaSGeorge Wilson 		nvlist_free(nv);
14104b964adaSGeorge Wilson 		kmem_free(child, rvd->vdev_children * sizeof (char **));
14114b964adaSGeorge Wilson 	}
14124b964adaSGeorge Wilson 
14134b964adaSGeorge Wilson 	/*
14144b964adaSGeorge Wilson 	 * Compare the root vdev tree with the information we have
14154b964adaSGeorge Wilson 	 * from the MOS config (mrvd). Check each top-level vdev
14164b964adaSGeorge Wilson 	 * with the corresponding MOS config top-level (mtvd).
14174b964adaSGeorge Wilson 	 */
141888ecc943SGeorge Wilson 	for (int c = 0; c < rvd->vdev_children; c++) {
14194b964adaSGeorge Wilson 		vdev_t *tvd = rvd->vdev_child[c];
14204b964adaSGeorge Wilson 		vdev_t *mtvd  = mrvd->vdev_child[c];
14214b964adaSGeorge Wilson 
14224b964adaSGeorge Wilson 		/*
14234b964adaSGeorge Wilson 		 * Resolve any "missing" vdevs in the current configuration.
14244b964adaSGeorge Wilson 		 * If we find that the MOS config has more accurate information
14254b964adaSGeorge Wilson 		 * about the top-level vdev then use that vdev instead.
14264b964adaSGeorge Wilson 		 */
14274b964adaSGeorge Wilson 		if (tvd->vdev_ops == &vdev_missing_ops &&
14284b964adaSGeorge Wilson 		    mtvd->vdev_ops != &vdev_missing_ops) {
14294b964adaSGeorge Wilson 
14304b964adaSGeorge Wilson 			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
14314b964adaSGeorge Wilson 				continue;
14324b964adaSGeorge Wilson 
14334b964adaSGeorge Wilson 			/*
14344b964adaSGeorge Wilson 			 * Device specific actions.
14354b964adaSGeorge Wilson 			 */
14364b964adaSGeorge Wilson 			if (mtvd->vdev_islog) {
14374b964adaSGeorge Wilson 				spa_set_log_state(spa, SPA_LOG_CLEAR);
14384b964adaSGeorge Wilson 			} else {
14394b964adaSGeorge Wilson 				/*
14404b964adaSGeorge Wilson 				 * XXX - once we have 'readonly' pool
14414b964adaSGeorge Wilson 				 * support we should be able to handle
14424b964adaSGeorge Wilson 				 * missing data devices by transitioning
14434b964adaSGeorge Wilson 				 * the pool to readonly.
14444b964adaSGeorge Wilson 				 */
14454b964adaSGeorge Wilson 				continue;
14464b964adaSGeorge Wilson 			}
14474b964adaSGeorge Wilson 
14484b964adaSGeorge Wilson 			/*
14494b964adaSGeorge Wilson 			 * Swap the missing vdev with the data we were
14504b964adaSGeorge Wilson 			 * able to obtain from the MOS config.
14514b964adaSGeorge Wilson 			 */
14524b964adaSGeorge Wilson 			vdev_remove_child(rvd, tvd);
14534b964adaSGeorge Wilson 			vdev_remove_child(mrvd, mtvd);
14544b964adaSGeorge Wilson 
14554b964adaSGeorge Wilson 			vdev_add_child(rvd, mtvd);
14564b964adaSGeorge Wilson 			vdev_add_child(mrvd, tvd);
14574b964adaSGeorge Wilson 
14584b964adaSGeorge Wilson 			spa_config_exit(spa, SCL_ALL, FTAG);
14594b964adaSGeorge Wilson 			vdev_load(mtvd);
14604b964adaSGeorge Wilson 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
14614b964adaSGeorge Wilson 
14624b964adaSGeorge Wilson 			vdev_reopen(rvd);
14634b964adaSGeorge Wilson 		} else if (mtvd->vdev_islog) {
14644b964adaSGeorge Wilson 			/*
14654b964adaSGeorge Wilson 			 * Load the slog device's state from the MOS config
14664b964adaSGeorge Wilson 			 * since it's possible that the label does not
14674b964adaSGeorge Wilson 			 * contain the most up-to-date information.
14684b964adaSGeorge Wilson 			 */
14694b964adaSGeorge Wilson 			vdev_load_log_state(tvd, mtvd);
14704b964adaSGeorge Wilson 			vdev_reopen(tvd);
14714b964adaSGeorge Wilson 		}
1472e6ca193dSGeorge Wilson 	}
14734b964adaSGeorge Wilson 	vdev_free(mrvd);
147488ecc943SGeorge Wilson 	spa_config_exit(spa, SCL_ALL, FTAG);
14754b964adaSGeorge Wilson 
14764b964adaSGeorge Wilson 	/*
14774b964adaSGeorge Wilson 	 * Ensure we were able to validate the config.
14784b964adaSGeorge Wilson 	 */
14794b964adaSGeorge Wilson 	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1480e6ca193dSGeorge Wilson }
1481e6ca193dSGeorge Wilson 
1482b87f3af3Sperrin /*
1483b87f3af3Sperrin  * Check for missing log devices
1484b87f3af3Sperrin  */
14854b964adaSGeorge Wilson static int
1486b87f3af3Sperrin spa_check_logs(spa_t *spa)
1487b87f3af3Sperrin {
1488b87f3af3Sperrin 	switch (spa->spa_log_state) {
1489b87f3af3Sperrin 	case SPA_LOG_MISSING:
1490b87f3af3Sperrin 		/* need to recheck in case slog has been restored */
1491b87f3af3Sperrin 	case SPA_LOG_UNKNOWN:
1492b87f3af3Sperrin 		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
1493b87f3af3Sperrin 		    DS_FIND_CHILDREN)) {
14941195e687SMark J Musante 			spa_set_log_state(spa, SPA_LOG_MISSING);
1495b87f3af3Sperrin 			return (1);
1496b87f3af3Sperrin 		}
1497b87f3af3Sperrin 		break;
1498b87f3af3Sperrin 	}
1499b87f3af3Sperrin 	return (0);
1500b87f3af3Sperrin }
1501b87f3af3Sperrin 
15021195e687SMark J Musante static boolean_t
15031195e687SMark J Musante spa_passivate_log(spa_t *spa)
15041195e687SMark J Musante {
15051195e687SMark J Musante 	vdev_t *rvd = spa->spa_root_vdev;
15061195e687SMark J Musante 	boolean_t slog_found = B_FALSE;
15071195e687SMark J Musante 
15081195e687SMark J Musante 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
15091195e687SMark J Musante 
15101195e687SMark J Musante 	if (!spa_has_slogs(spa))
15111195e687SMark J Musante 		return (B_FALSE);
15121195e687SMark J Musante 
15131195e687SMark J Musante 	for (int c = 0; c < rvd->vdev_children; c++) {
15141195e687SMark J Musante 		vdev_t *tvd = rvd->vdev_child[c];
15151195e687SMark J Musante 		metaslab_group_t *mg = tvd->vdev_mg;
15161195e687SMark J Musante 
15171195e687SMark J Musante 		if (tvd->vdev_islog) {
15181195e687SMark J Musante 			metaslab_group_passivate(mg);
15191195e687SMark J Musante 			slog_found = B_TRUE;
15201195e687SMark J Musante 		}
15211195e687SMark J Musante 	}
15221195e687SMark J Musante 
15231195e687SMark J Musante 	return (slog_found);
15241195e687SMark J Musante }
15251195e687SMark J Musante 
15261195e687SMark J Musante static void
15271195e687SMark J Musante spa_activate_log(spa_t *spa)
15281195e687SMark J Musante {
15291195e687SMark J Musante 	vdev_t *rvd = spa->spa_root_vdev;
15301195e687SMark J Musante 
15311195e687SMark J Musante 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
15321195e687SMark J Musante 
15331195e687SMark J Musante 	for (int c = 0; c < rvd->vdev_children; c++) {
15341195e687SMark J Musante 		vdev_t *tvd = rvd->vdev_child[c];
15351195e687SMark J Musante 		metaslab_group_t *mg = tvd->vdev_mg;
15361195e687SMark J Musante 
15371195e687SMark J Musante 		if (tvd->vdev_islog)
15381195e687SMark J Musante 			metaslab_group_activate(mg);
15391195e687SMark J Musante 	}
15401195e687SMark J Musante }
15411195e687SMark J Musante 
15421195e687SMark J Musante int
15431195e687SMark J Musante spa_offline_log(spa_t *spa)
15441195e687SMark J Musante {
15451195e687SMark J Musante 	int error = 0;
15461195e687SMark J Musante 
15471195e687SMark J Musante 	if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
15481195e687SMark J Musante 	    NULL, DS_FIND_CHILDREN)) == 0) {
15491195e687SMark J Musante 
15501195e687SMark J Musante 		/*
15511195e687SMark J Musante 		 * We successfully offlined the log device, sync out the
15521195e687SMark J Musante 		 * current txg so that the "stubby" block can be removed
15531195e687SMark J Musante 		 * by zil_sync().
15541195e687SMark J Musante 		 */
15551195e687SMark J Musante 		txg_wait_synced(spa->spa_dsl_pool, 0);
15561195e687SMark J Musante 	}
15571195e687SMark J Musante 	return (error);
15581195e687SMark J Musante }
15591195e687SMark J Musante 
1560b693757aSEric Schrock static void
1561b693757aSEric Schrock spa_aux_check_removed(spa_aux_vdev_t *sav)
1562b693757aSEric Schrock {
1563b24ab676SJeff Bonwick 	for (int i = 0; i < sav->sav_count; i++)
1564b693757aSEric Schrock 		spa_check_removed(sav->sav_vdevs[i]);
1565b693757aSEric Schrock }
1566b693757aSEric Schrock 
1567b24ab676SJeff Bonwick void
1568b24ab676SJeff Bonwick spa_claim_notify(zio_t *zio)
1569b24ab676SJeff Bonwick {
1570b24ab676SJeff Bonwick 	spa_t *spa = zio->io_spa;
1571b24ab676SJeff Bonwick 
1572b24ab676SJeff Bonwick 	if (zio->io_error)
1573b24ab676SJeff Bonwick 		return;
1574b24ab676SJeff Bonwick 
1575b24ab676SJeff Bonwick 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
1576b24ab676SJeff Bonwick 	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1577b24ab676SJeff Bonwick 		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1578b24ab676SJeff Bonwick 	mutex_exit(&spa->spa_props_lock);
1579b24ab676SJeff Bonwick }
1580b24ab676SJeff Bonwick 
/*
 * Error tallies accumulated by spa_load_verify(): metadata and plain
 * data errors are counted separately so the rewind policy can apply
 * different thresholds (zrp_maxmeta / zrp_maxdata) to each.
 */
typedef struct spa_load_error {
	uint64_t	sle_meta_count;	/* errors in metadata blocks */
	uint64_t	sle_data_count;	/* errors in plain data blocks */
} spa_load_error_t;
1585468c413aSTim Haley 
1586468c413aSTim Haley static void
1587468c413aSTim Haley spa_load_verify_done(zio_t *zio)
1588468c413aSTim Haley {
1589468c413aSTim Haley 	blkptr_t *bp = zio->io_bp;
1590468c413aSTim Haley 	spa_load_error_t *sle = zio->io_private;
1591468c413aSTim Haley 	dmu_object_type_t type = BP_GET_TYPE(bp);
1592468c413aSTim Haley 	int error = zio->io_error;
1593468c413aSTim Haley 
1594468c413aSTim Haley 	if (error) {
1595468c413aSTim Haley 		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
1596468c413aSTim Haley 		    type != DMU_OT_INTENT_LOG)
1597c8ee1847SVictor Latushkin 			atomic_add_64(&sle->sle_meta_count, 1);
1598468c413aSTim Haley 		else
1599468c413aSTim Haley 			atomic_add_64(&sle->sle_data_count, 1);
1600468c413aSTim Haley 	}
1601468c413aSTim Haley 	zio_data_buf_free(zio->io_data, zio->io_size);
1602468c413aSTim Haley }
1603468c413aSTim Haley 
1604468c413aSTim Haley /*ARGSUSED*/
1605468c413aSTim Haley static int
1606b24ab676SJeff Bonwick spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
16073f9d6ad7SLin Ling     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1608468c413aSTim Haley {
1609468c413aSTim Haley 	if (bp != NULL) {
1610468c413aSTim Haley 		zio_t *rio = arg;
1611468c413aSTim Haley 		size_t size = BP_GET_PSIZE(bp);
1612468c413aSTim Haley 		void *data = zio_data_buf_alloc(size);
1613468c413aSTim Haley 
1614468c413aSTim Haley 		zio_nowait(zio_read(rio, spa, bp, data, size,
1615468c413aSTim Haley 		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1616468c413aSTim Haley 		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1617468c413aSTim Haley 		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1618468c413aSTim Haley 	}
1619468c413aSTim Haley 	return (0);
1620468c413aSTim Haley }
1621468c413aSTim Haley 
/*
 * Verify the pool's data by traversing it from spa_verify_min_txg and
 * issuing a scrub read for every block (see spa_load_verify_cb).  The
 * per-class error counts are compared against the rewind policy limits;
 * on success, rewind bookkeeping (load time, data loss estimate, data
 * error count) is recorded in spa_load_info.  Returns 0 if the pool
 * verified within policy, EIO/ENXIO otherwise.
 */
static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	/* If rewind was never requested, skip verification entirely. */
	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	/* Root zio that parents all the async verify reads. */
	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	/* Wait for all the reads kicked off by the traversal. */
	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		/*
		 * Record how far back in time this txg is relative to
		 * the last sync, along with the data error count, for
		 * reporting back to the user.
		 */
		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		/* Remember the newest txg tried, for further rewinding. */
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		/* Normalize unexpected traversal errors to EIO. */
		if (error != ENXIO && error != EIO)
			error = EIO;
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}
1674468c413aSTim Haley 
16751195e687SMark J Musante /*
16761195e687SMark J Musante  * Find a value in the pool props object.
16771195e687SMark J Musante  */
16781195e687SMark J Musante static void
16791195e687SMark J Musante spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
16801195e687SMark J Musante {
16811195e687SMark J Musante 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
16821195e687SMark J Musante 	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
16831195e687SMark J Musante }
16841195e687SMark J Musante 
16851195e687SMark J Musante /*
16861195e687SMark J Musante  * Find a value in the pool directory object.
16871195e687SMark J Musante  */
16881195e687SMark J Musante static int
16891195e687SMark J Musante spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
16901195e687SMark J Musante {
16911195e687SMark J Musante 	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
16921195e687SMark J Musante 	    name, sizeof (uint64_t), 1, val));
16931195e687SMark J Musante }
16941195e687SMark J Musante 
/*
 * Mark the given vdev CANT_OPEN with the supplied aux reason and pass
 * the error code through, so load-failure paths can do both in one
 * return statement.
 */
static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (err);
}
17011195e687SMark J Musante 
/*
 * Fix up config after a partly-completed split.  This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
 *
 * This function determines what to do with that list: either rejoin
 * all the disks to the pool, or complete the splitting process.  To attempt
 * the rejoin, each disk that is offlined is marked online again, and
 * we do a reopen() call.  If the vdev label for every disk that was
 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 * then we call vdev_split() on each disk, and complete the split.
 *
 * Otherwise we leave the config alone, with all the vdevs in place in
 * the original pool.
 */
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
	uint_t extracted;
	uint64_t *glist;
	uint_t i, gcount;
	nvlist_t *nvl;
	vdev_t **vd;
	boolean_t attempt_reopen;

	/* No split in progress: nothing to repair. */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
		return;

	/* check that the config is complete */
	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) != 0)
		return;

	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

	/* attempt to online all the vdevs & validate */
	attempt_reopen = B_TRUE;
	for (i = 0; i < gcount; i++) {
		if (glist[i] == 0)	/* vdev is hole */
			continue;

		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
		if (vd[i] == NULL) {
			/*
			 * Don't bother attempting to reopen the disks;
			 * just do the split.
			 */
			attempt_reopen = B_FALSE;
		} else {
			/* attempt to re-online it */
			vd[i]->vdev_offline = B_FALSE;
		}
	}

	if (attempt_reopen) {
		vdev_reopen(spa->spa_root_vdev);

		/* check each device to see what state it's in */
		for (extracted = 0, i = 0; i < gcount; i++) {
			if (vd[i] != NULL &&
			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
				break;
			++extracted;
		}
	}

	/*
	 * If every disk has been moved to the new pool, or if we never
	 * even attempted to look at them, then we split them off for
	 * good.
	 */
	if (!attempt_reopen || gcount == extracted) {
		for (i = 0; i < gcount; i++)
			if (vd[i] != NULL)
				vdev_split(vd[i]);
		vdev_reopen(spa->spa_root_vdev);
	}

	kmem_free(vd, gcount * sizeof (vdev_t *));
}
17831195e687SMark J Musante 
/*
 * Top-level pool load entry point: extract identity information from
 * spa_config (guid, comment, version, txg, split info), guard against
 * importing a pool guid that already exists, then hand off to
 * spa_load_impl().  On failure, posts an FMA ereport (except for EBADF)
 * and clears the load timestamp (except for EEXIST).  Returns 0 on
 * success or an errno.
 */
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
    boolean_t mosconfig)
{
	nvlist_t *config = spa->spa_config;
	char *ereport = FM_EREPORT_ZFS_POOL;
	char *comment;
	int error;
	uint64_t pool_guid;
	nvlist_t *nvl;

	/* A config without a pool guid is unusable. */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	/* Capture the pool comment, if the config carries one. */
	ASSERT(spa->spa_comment == NULL);
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
		spa->spa_comment = spa_strdup(comment);

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		/* Refuse to import a guid that is already active. */
		error = EEXIST;
	} else {
		spa->spa_config_guid = pool_guid;

		/* Preserve any in-progress split info for spa_try_repair(). */
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
		    &nvl) == 0) {
			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
			    KM_SLEEP) == 0);
		}

		gethrestime(&spa->spa_loaded_ts);
		error = spa_load_impl(spa, pool_guid, config, state, type,
		    mosconfig, &ereport);
	}

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error) {
		if (error != EEXIST) {
			/* EEXIST keeps the original load timestamp. */
			spa->spa_loaded_ts.tv_sec = 0;
			spa->spa_loaded_ts.tv_nsec = 0;
		}
		if (error != EBADF) {
			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
		}
	}
	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}
18451195e687SMark J Musante 
1846fa9e4066Sahrens /*
1847fa9e4066Sahrens  * Load an existing storage pool, using the pool's builtin spa_config as a
1848ea8dc4b6Seschrock  * source of configuration information.
1849fa9e4066Sahrens  */
1850fa9e4066Sahrens static int
18511195e687SMark J Musante spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
18521195e687SMark J Musante     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
18531195e687SMark J Musante     char **ereport)
1854fa9e4066Sahrens {
1855fa9e4066Sahrens 	int error = 0;
1856871a9500SMark J Musante 	nvlist_t *nvroot = NULL;
1857fa9e4066Sahrens 	vdev_t *rvd;
1858fa9e4066Sahrens 	uberblock_t *ub = &spa->spa_uberblock;
18594b964adaSGeorge Wilson 	uint64_t children, config_cache_txg = spa->spa_config_txg;
18608ad4d6ddSJeff Bonwick 	int orig_mode = spa->spa_mode;
18611195e687SMark J Musante 	int parse;
1862cde58dbcSMatthew Ahrens 	uint64_t obj;
1863fa9e4066Sahrens 
18648ad4d6ddSJeff Bonwick 	/*
18658ad4d6ddSJeff Bonwick 	 * If this is an untrusted config, access the pool in read-only mode.
18668ad4d6ddSJeff Bonwick 	 * This prevents things like resilvering recently removed devices.
18678ad4d6ddSJeff Bonwick 	 */
18688ad4d6ddSJeff Bonwick 	if (!mosconfig)
18698ad4d6ddSJeff Bonwick 		spa->spa_mode = FREAD;
18708ad4d6ddSJeff Bonwick 
1871e14bb325SJeff Bonwick 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1872e14bb325SJeff Bonwick 
1873ea8dc4b6Seschrock 	spa->spa_load_state = state;
18740373e76bSbonwick 
18751195e687SMark J Musante 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
18761195e687SMark J Musante 		return (EINVAL);
1877fa9e4066Sahrens 
18781195e687SMark J Musante 	parse = (type == SPA_IMPORT_EXISTING ?
18791195e687SMark J Musante 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1880b5989ec7Seschrock 
188154d692b7SGeorge Wilson 	/*
188254d692b7SGeorge Wilson 	 * Create "The Godfather" zio to hold all async IOs
188354d692b7SGeorge Wilson 	 */
188425f89ee2SJeff Bonwick 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
188525f89ee2SJeff Bonwick 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
188654d692b7SGeorge Wilson 
1887fa9e4066Sahrens 	/*
188899653d4eSeschrock 	 * Parse the configuration into a vdev tree.  We explicitly set the
188999653d4eSeschrock 	 * value that will be returned by spa_version() since parsing the
189099653d4eSeschrock 	 * configuration requires knowing the version number.
1891fa9e4066Sahrens 	 */
1892e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
18931195e687SMark J Musante 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
1894e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALL, FTAG);
1895fa9e4066Sahrens 
189699653d4eSeschrock 	if (error != 0)
18971195e687SMark J Musante 		return (error);
1898fa9e4066Sahrens 
18990e34b6a7Sbonwick 	ASSERT(spa->spa_root_vdev == rvd);
19001195e687SMark J Musante 
19011195e687SMark J Musante 	if (type != SPA_IMPORT_ASSEMBLE) {
19021195e687SMark J Musante 		ASSERT(spa_guid(spa) == pool_guid);
19031195e687SMark J Musante 	}
1904fa9e4066Sahrens 
1905fa9e4066Sahrens 	/*
1906fa9e4066Sahrens 	 * Try to open all vdevs, loading each label in the process.
1907fa9e4066Sahrens 	 */
1908e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
19090bf246f5Smc 	error = vdev_open(rvd);
1910e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALL, FTAG);
19110bf246f5Smc 	if (error != 0)
19121195e687SMark J Musante 		return (error);
1913fa9e4066Sahrens 
1914560e6e96Seschrock 	/*
191577e3a39cSMark J Musante 	 * We need to validate the vdev labels against the configuration that
191677e3a39cSMark J Musante 	 * we have in hand, which is dependent on the setting of mosconfig. If
191777e3a39cSMark J Musante 	 * mosconfig is true then we're validating the vdev labels based on
19181195e687SMark J Musante 	 * that config.  Otherwise, we're validating against the cached config
191977e3a39cSMark J Musante 	 * (zpool.cache) that was read when we loaded the zfs module, and then
192077e3a39cSMark J Musante 	 * later we will recursively call spa_load() and validate against
192177e3a39cSMark J Musante 	 * the vdev config.
19221195e687SMark J Musante 	 *
19231195e687SMark J Musante 	 * If we're assembling a new pool that's been split off from an
19241195e687SMark J Musante 	 * existing pool, the labels haven't yet been updated so we skip
19251195e687SMark J Musante 	 * validation for now.
1926560e6e96Seschrock 	 */
19271195e687SMark J Musante 	if (type != SPA_IMPORT_ASSEMBLE) {
19281195e687SMark J Musante 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
19291195e687SMark J Musante 		error = vdev_validate(rvd);
19301195e687SMark J Musante 		spa_config_exit(spa, SCL_ALL, FTAG);
1931560e6e96Seschrock 
19321195e687SMark J Musante 		if (error != 0)
19331195e687SMark J Musante 			return (error);
19341195e687SMark J Musante 
19351195e687SMark J Musante 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
19361195e687SMark J Musante 			return (ENXIO);
1937560e6e96Seschrock 	}
1938560e6e96Seschrock 
1939fa9e4066Sahrens 	/*
1940fa9e4066Sahrens 	 * Find the best uberblock.
1941fa9e4066Sahrens 	 */
1942e14bb325SJeff Bonwick 	vdev_uberblock_load(NULL, rvd, ub);
1943fa9e4066Sahrens 
1944fa9e4066Sahrens 	/*
1945fa9e4066Sahrens 	 * If we weren't able to find a single valid uberblock, return failure.
1946fa9e4066Sahrens 	 */
19471195e687SMark J Musante 	if (ub->ub_txg == 0)
19481195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
1949ea8dc4b6Seschrock 
1950ea8dc4b6Seschrock 	/*
1951ea8dc4b6Seschrock 	 * If the pool is newer than the code, we can't open it.
1952ea8dc4b6Seschrock 	 */
19531195e687SMark J Musante 	if (ub->ub_version > SPA_VERSION)
19541195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
1955fa9e4066Sahrens 
1956fa9e4066Sahrens 	/*
1957fa9e4066Sahrens 	 * If the vdev guid sum doesn't match the uberblock, we have an
19584b964adaSGeorge Wilson 	 * incomplete configuration.  We first check to see if the pool
19594b964adaSGeorge Wilson 	 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
19604b964adaSGeorge Wilson 	 * If it is, defer the vdev_guid_sum check till later so we
19614b964adaSGeorge Wilson 	 * can handle missing vdevs.
1962fa9e4066Sahrens 	 */
19634b964adaSGeorge Wilson 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
19644b964adaSGeorge Wilson 	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
19651195e687SMark J Musante 	    rvd->vdev_guid_sum != ub->ub_guid_sum)
19661195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
19671195e687SMark J Musante 
19681195e687SMark J Musante 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
19691195e687SMark J Musante 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
19701195e687SMark J Musante 		spa_try_repair(spa, config);
19711195e687SMark J Musante 		spa_config_exit(spa, SCL_ALL, FTAG);
19721195e687SMark J Musante 		nvlist_free(spa->spa_config_splitting);
19731195e687SMark J Musante 		spa->spa_config_splitting = NULL;
1974fa9e4066Sahrens 	}
1975fa9e4066Sahrens 
1976fa9e4066Sahrens 	/*
1977fa9e4066Sahrens 	 * Initialize internal SPA structures.
1978fa9e4066Sahrens 	 */
1979fa9e4066Sahrens 	spa->spa_state = POOL_STATE_ACTIVE;
1980fa9e4066Sahrens 	spa->spa_ubsync = spa->spa_uberblock;
1981468c413aSTim Haley 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
1982c8ee1847SVictor Latushkin 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
1983468c413aSTim Haley 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
1984468c413aSTim Haley 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
1985b24ab676SJeff Bonwick 	spa->spa_claim_max_txg = spa->spa_first_txg;
19863f9d6ad7SLin Ling 	spa->spa_prev_software_version = ub->ub_software_version;
1987b24ab676SJeff Bonwick 
1988ea8dc4b6Seschrock 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
19891195e687SMark J Musante 	if (error)
19901195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1991fa9e4066Sahrens 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1992fa9e4066Sahrens 
19931195e687SMark J Musante 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
19941195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1995fa9e4066Sahrens 
1996fa9e4066Sahrens 	if (!mosconfig) {
199795173954Sek 		uint64_t hostid;
1998871a9500SMark J Musante 		nvlist_t *policy = NULL, *nvconfig;
1999871a9500SMark J Musante 
2000871a9500SMark J Musante 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2001871a9500SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2002fa9e4066Sahrens 
200388ecc943SGeorge Wilson 		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
200477650510SLin Ling 		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
200595173954Sek 			char *hostname;
200695173954Sek 			unsigned long myhostid = 0;
200795173954Sek 
200888ecc943SGeorge Wilson 			VERIFY(nvlist_lookup_string(nvconfig,
200995173954Sek 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
201095173954Sek 
20115679c89fSjv #ifdef	_KERNEL
20125679c89fSjv 			myhostid = zone_get_hostid(NULL);
20135679c89fSjv #else	/* _KERNEL */
20145679c89fSjv 			/*
20155679c89fSjv 			 * We're emulating the system's hostid in userland, so
20165679c89fSjv 			 * we can't use zone_get_hostid().
20175679c89fSjv 			 */
201895173954Sek 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
20195679c89fSjv #endif	/* _KERNEL */
202017194a52Slling 			if (hostid != 0 && myhostid != 0 &&
20215679c89fSjv 			    hostid != myhostid) {
2022871a9500SMark J Musante 				nvlist_free(nvconfig);
202395173954Sek 				cmn_err(CE_WARN, "pool '%s' could not be "
202495173954Sek 				    "loaded as it was last accessed by "
202577650510SLin Ling 				    "another system (host: %s hostid: 0x%lx). "
202695173954Sek 				    "See: http://www.sun.com/msg/ZFS-8000-EY",
2027e14bb325SJeff Bonwick 				    spa_name(spa), hostname,
202895173954Sek 				    (unsigned long)hostid);
20291195e687SMark J Musante 				return (EBADF);
203095173954Sek 			}
203195173954Sek 		}
2032c8ee1847SVictor Latushkin 		if (nvlist_lookup_nvlist(spa->spa_config,
2033c8ee1847SVictor Latushkin 		    ZPOOL_REWIND_POLICY, &policy) == 0)
2034c8ee1847SVictor Latushkin 			VERIFY(nvlist_add_nvlist(nvconfig,
2035c8ee1847SVictor Latushkin 			    ZPOOL_REWIND_POLICY, policy) == 0);
203695173954Sek 
203788ecc943SGeorge Wilson 		spa_config_set(spa, nvconfig);
2038fa9e4066Sahrens 		spa_unload(spa);
2039fa9e4066Sahrens 		spa_deactivate(spa);
20408ad4d6ddSJeff Bonwick 		spa_activate(spa, orig_mode);
2041fa9e4066Sahrens 
20421195e687SMark J Musante 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2043fa9e4066Sahrens 	}
2044fa9e4066Sahrens 
2045cde58dbcSMatthew Ahrens 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2046cde58dbcSMatthew Ahrens 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2047cde58dbcSMatthew Ahrens 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2048cde58dbcSMatthew Ahrens 	if (error != 0)
20491195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2050fa9e4066Sahrens 
205199653d4eSeschrock 	/*
205299653d4eSeschrock 	 * Load the bit that tells us to use the new accounting function
205399653d4eSeschrock 	 * (raid-z deflation).  If we have an older pool, this will not
205499653d4eSeschrock 	 * be present.
205599653d4eSeschrock 	 */
20561195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
20571195e687SMark J Musante 	if (error != 0 && error != ENOENT)
20581195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
205999653d4eSeschrock 
20603f9d6ad7SLin Ling 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
20613f9d6ad7SLin Ling 	    &spa->spa_creation_version);
20623f9d6ad7SLin Ling 	if (error != 0 && error != ENOENT)
20633f9d6ad7SLin Ling 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
20643f9d6ad7SLin Ling 
2065fa9e4066Sahrens 	/*
2066ea8dc4b6Seschrock 	 * Load the persistent error log.  If we have an older pool, this will
2067ea8dc4b6Seschrock 	 * not be present.
2068fa9e4066Sahrens 	 */
20691195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
20701195e687SMark J Musante 	if (error != 0 && error != ENOENT)
20711195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2072ea8dc4b6Seschrock 
20731195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
20741195e687SMark J Musante 	    &spa->spa_errlog_scrub);
20751195e687SMark J Musante 	if (error != 0 && error != ENOENT)
20761195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2077ea8dc4b6Seschrock 
207806eeb2adSek 	/*
207906eeb2adSek 	 * Load the history object.  If we have an older pool, this
208006eeb2adSek 	 * will not be present.
208106eeb2adSek 	 */
20821195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
20831195e687SMark J Musante 	if (error != 0 && error != ENOENT)
20841195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
20851195e687SMark J Musante 
20861195e687SMark J Musante 	/*
20871195e687SMark J Musante 	 * If we're assembling the pool from the split-off vdevs of
20881195e687SMark J Musante 	 * an existing pool, we don't want to attach the spares & cache
20891195e687SMark J Musante 	 * devices.
20901195e687SMark J Musante 	 */
209106eeb2adSek 
209299653d4eSeschrock 	/*
209399653d4eSeschrock 	 * Load any hot spares for this pool.
209499653d4eSeschrock 	 */
20951195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
20961195e687SMark J Musante 	if (error != 0 && error != ENOENT)
20971195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
20981195e687SMark J Musante 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2099e7437265Sahrens 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2100fa94a07fSbrendan 		if (load_nvlist(spa, spa->spa_spares.sav_object,
21011195e687SMark J Musante 		    &spa->spa_spares.sav_config) != 0)
21021195e687SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
210399653d4eSeschrock 
2104e14bb325SJeff Bonwick 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
210599653d4eSeschrock 		spa_load_spares(spa);
2106e14bb325SJeff Bonwick 		spa_config_exit(spa, SCL_ALL, FTAG);
21071195e687SMark J Musante 	} else if (error == 0) {
21081195e687SMark J Musante 		spa->spa_spares.sav_sync = B_TRUE;
210999653d4eSeschrock 	}
211099653d4eSeschrock 
2111fa94a07fSbrendan 	/*
2112fa94a07fSbrendan 	 * Load any level 2 ARC devices for this pool.
2113fa94a07fSbrendan 	 */
21141195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2115fa94a07fSbrendan 	    &spa->spa_l2cache.sav_object);
21161195e687SMark J Musante 	if (error != 0 && error != ENOENT)
21171195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
21181195e687SMark J Musante 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2119fa94a07fSbrendan 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2120fa94a07fSbrendan 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
21211195e687SMark J Musante 		    &spa->spa_l2cache.sav_config) != 0)
21221195e687SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2123fa94a07fSbrendan 
2124e14bb325SJeff Bonwick 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2125fa94a07fSbrendan 		spa_load_l2cache(spa);
2126e14bb325SJeff Bonwick 		spa_config_exit(spa, SCL_ALL, FTAG);
21271195e687SMark J Musante 	} else if (error == 0) {
21281195e687SMark J Musante 		spa->spa_l2cache.sav_sync = B_TRUE;
2129fa94a07fSbrendan 	}
2130fa94a07fSbrendan 
2131990b4856Slling 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2132ecd6cf80Smarks 
21331195e687SMark J Musante 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
21341195e687SMark J Musante 	if (error && error != ENOENT)
21351195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2136b1b8ab34Slling 
2137b1b8ab34Slling 	if (error == 0) {
21381195e687SMark J Musante 		uint64_t autoreplace;
21391195e687SMark J Musante 
21401195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
21411195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
21421195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
21431195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
21441195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
21451195e687SMark J Musante 		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
21461195e687SMark J Musante 		    &spa->spa_dedup_ditto);
21471195e687SMark J Musante 
2148b693757aSEric Schrock 		spa->spa_autoreplace = (autoreplace != 0);
2149b1b8ab34Slling 	}
2150b1b8ab34Slling 
21513d7072f8Seschrock 	/*
21523d7072f8Seschrock 	 * If the 'autoreplace' property is set, then post a resource notifying
21533d7072f8Seschrock 	 * the ZFS DE that it should not issue any faults for unopenable
21543d7072f8Seschrock 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
21553d7072f8Seschrock 	 * unopenable vdevs so that the normal autoreplace handler can take
21563d7072f8Seschrock 	 * over.
21573d7072f8Seschrock 	 */
2158b693757aSEric Schrock 	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
21593d7072f8Seschrock 		spa_check_removed(spa->spa_root_vdev);
2160b693757aSEric Schrock 		/*
2161b693757aSEric Schrock 		 * For the import case, this is done in spa_import(), because
2162b693757aSEric Schrock 		 * at this point we're using the spare definitions from
2163b693757aSEric Schrock 		 * the MOS config, not necessarily from the userland config.
2164b693757aSEric Schrock 		 */
2165b693757aSEric Schrock 		if (state != SPA_LOAD_IMPORT) {
2166b693757aSEric Schrock 			spa_aux_check_removed(&spa->spa_spares);
2167b693757aSEric Schrock 			spa_aux_check_removed(&spa->spa_l2cache);
2168b693757aSEric Schrock 		}
2169b693757aSEric Schrock 	}
21703d7072f8Seschrock 
2171ea8dc4b6Seschrock 	/*
2172560e6e96Seschrock 	 * Load the vdev state for all toplevel vdevs.
2173ea8dc4b6Seschrock 	 */
2174560e6e96Seschrock 	vdev_load(rvd);
21750373e76bSbonwick 
2176fa9e4066Sahrens 	/*
2177fa9e4066Sahrens 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
2178fa9e4066Sahrens 	 */
2179e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2180fa9e4066Sahrens 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2181e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALL, FTAG);
2182fa9e4066Sahrens 
2183b24ab676SJeff Bonwick 	/*
2184b24ab676SJeff Bonwick 	 * Load the DDTs (dedup tables).
2185b24ab676SJeff Bonwick 	 */
2186b24ab676SJeff Bonwick 	error = ddt_load(spa);
21871195e687SMark J Musante 	if (error != 0)
21881195e687SMark J Musante 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2189b24ab676SJeff Bonwick 
2190485bbbf5SGeorge Wilson 	spa_update_dspace(spa);
2191485bbbf5SGeorge Wilson 
2192b24ab676SJeff Bonwick 	/*
21934b964adaSGeorge Wilson 	 * Validate the config, using the MOS config to fill in any
21944b964adaSGeorge Wilson 	 * information which might be missing.  If we fail to validate
21954b964adaSGeorge Wilson 	 * the config then declare the pool unfit for use. If we're
21964b964adaSGeorge Wilson 	 * assembling a pool from a split, the log is not transferred
21974b964adaSGeorge Wilson 	 * over.
2198b24ab676SJeff Bonwick 	 */
21991195e687SMark J Musante 	if (type != SPA_IMPORT_ASSEMBLE) {
2200871a9500SMark J Musante 		nvlist_t *nvconfig;
2201871a9500SMark J Musante 
2202871a9500SMark J Musante 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2203871a9500SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2204871a9500SMark J Musante 
22054b964adaSGeorge Wilson 		if (!spa_config_valid(spa, nvconfig)) {
22064b964adaSGeorge Wilson 			nvlist_free(nvconfig);
22074b964adaSGeorge Wilson 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
22084b964adaSGeorge Wilson 			    ENXIO));
22094b964adaSGeorge Wilson 		}
22101195e687SMark J Musante 		nvlist_free(nvconfig);
22111195e687SMark J Musante 
22124b964adaSGeorge Wilson 		/*
22134b964adaSGeorge Wilson 		 * Now that we've validated the config, check the state of the
22144b964adaSGeorge Wilson 		 * root vdev.  If it can't be opened, it indicates one or
22154b964adaSGeorge Wilson 		 * more toplevel vdevs are faulted.
22164b964adaSGeorge Wilson 		 */
22174b964adaSGeorge Wilson 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
22184b964adaSGeorge Wilson 			return (ENXIO);
22194b964adaSGeorge Wilson 
22201195e687SMark J Musante 		if (spa_check_logs(spa)) {
22211195e687SMark J Musante 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
22221195e687SMark J Musante 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
22231195e687SMark J Musante 		}
2224b24ab676SJeff Bonwick 	}
2225b24ab676SJeff Bonwick 
22264b964adaSGeorge Wilson 	/*
22274b964adaSGeorge Wilson 	 * We've successfully opened the pool, verify that we're ready
22284b964adaSGeorge Wilson 	 * to start pushing transactions.
22294b964adaSGeorge Wilson 	 */
22304b964adaSGeorge Wilson 	if (state != SPA_LOAD_TRYIMPORT) {
22314b964adaSGeorge Wilson 		if (error = spa_load_verify(spa))
22324b964adaSGeorge Wilson 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
22334b964adaSGeorge Wilson 			    error));
22344b964adaSGeorge Wilson 	}
22354b964adaSGeorge Wilson 
2236468c413aSTim Haley 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2237468c413aSTim Haley 	    spa->spa_load_max_txg == UINT64_MAX)) {
22385dabedeeSbonwick 		dmu_tx_t *tx;
22390373e76bSbonwick 		int need_update = B_FALSE;
22408ad4d6ddSJeff Bonwick 
22418ad4d6ddSJeff Bonwick 		ASSERT(state != SPA_LOAD_TRYIMPORT);
22425dabedeeSbonwick 
22430373e76bSbonwick 		/*
22440373e76bSbonwick 		 * Claim log blocks that haven't been committed yet.
22450373e76bSbonwick 		 * This must all happen in a single txg.
2246b24ab676SJeff Bonwick 		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2247b24ab676SJeff Bonwick 		 * invoked from zil_claim_log_block()'s i/o done callback.
2248468c413aSTim Haley 		 * Price of rollback is that we abandon the log.
22490373e76bSbonwick 		 */
2250b24ab676SJeff Bonwick 		spa->spa_claiming = B_TRUE;
2251b24ab676SJeff Bonwick 
22525dabedeeSbonwick 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2253fa9e4066Sahrens 		    spa_first_txg(spa));
2254e14bb325SJeff Bonwick 		(void) dmu_objset_find(spa_name(spa),
22550b69c2f0Sahrens 		    zil_claim, tx, DS_FIND_CHILDREN);
2256fa9e4066Sahrens 		dmu_tx_commit(tx);
2257fa9e4066Sahrens 
2258b24ab676SJeff Bonwick 		spa->spa_claiming = B_FALSE;
2259b24ab676SJeff Bonwick 
22601195e687SMark J Musante 		spa_set_log_state(spa, SPA_LOG_GOOD);
2261fa9e4066Sahrens 		spa->spa_sync_on = B_TRUE;
2262fa9e4066Sahrens 		txg_sync_start(spa->spa_dsl_pool);
2263fa9e4066Sahrens 
2264fa9e4066Sahrens 		/*
2265b24ab676SJeff Bonwick 		 * Wait for all claims to sync.  We sync up to the highest
2266b24ab676SJeff Bonwick 		 * claimed log block birth time so that claimed log blocks
2267b24ab676SJeff Bonwick 		 * don't appear to be from the future.  spa_claim_max_txg
2268b24ab676SJeff Bonwick 		 * will have been set for us by either zil_check_log_chain()
2269b24ab676SJeff Bonwick 		 * (invoked from spa_check_logs()) or zil_claim() above.
2270fa9e4066Sahrens 		 */
2271b24ab676SJeff Bonwick 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
22720e34b6a7Sbonwick 
22730e34b6a7Sbonwick 		/*
22740373e76bSbonwick 		 * If the config cache is stale, or we have uninitialized
22750373e76bSbonwick 		 * metaslabs (see spa_vdev_add()), then update the config.
2276bc758434SLin Ling 		 *
22774b964adaSGeorge Wilson 		 * If this is a verbatim import, trust the current
2278bc758434SLin Ling 		 * in-core spa_config and update the disk labels.
22790e34b6a7Sbonwick 		 */
22800373e76bSbonwick 		if (config_cache_txg != spa->spa_config_txg ||
22814b964adaSGeorge Wilson 		    state == SPA_LOAD_IMPORT ||
22824b964adaSGeorge Wilson 		    state == SPA_LOAD_RECOVER ||
22834b964adaSGeorge Wilson 		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
22840373e76bSbonwick 			need_update = B_TRUE;
22850373e76bSbonwick 
22868ad4d6ddSJeff Bonwick 		for (int c = 0; c < rvd->vdev_children; c++)
22870373e76bSbonwick 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
22880373e76bSbonwick 				need_update = B_TRUE;
22890e34b6a7Sbonwick 
22900e34b6a7Sbonwick 		/*
22910373e76bSbonwick 		 * Update the config cache asynchronously in case we're the
22920373e76bSbonwick 		 * root pool, in which case the config cache isn't writable yet.
22930e34b6a7Sbonwick 		 */
22940373e76bSbonwick 		if (need_update)
22950373e76bSbonwick 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
22968ad4d6ddSJeff Bonwick 
22978ad4d6ddSJeff Bonwick 		/*
22988ad4d6ddSJeff Bonwick 		 * Check all DTLs to see if anything needs resilvering.
22998ad4d6ddSJeff Bonwick 		 */
23003f9d6ad7SLin Ling 		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
23013f9d6ad7SLin Ling 		    vdev_resilver_needed(rvd, NULL, NULL))
23028ad4d6ddSJeff Bonwick 			spa_async_request(spa, SPA_ASYNC_RESILVER);
2303503ad85cSMatthew Ahrens 
2304503ad85cSMatthew Ahrens 		/*
2305503ad85cSMatthew Ahrens 		 * Delete any inconsistent datasets.
2306503ad85cSMatthew Ahrens 		 */
2307503ad85cSMatthew Ahrens 		(void) dmu_objset_find(spa_name(spa),
2308503ad85cSMatthew Ahrens 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2309ca45db41SChris Kirby 
2310ca45db41SChris Kirby 		/*
2311ca45db41SChris Kirby 		 * Clean up any stale temporary dataset userrefs.
2312ca45db41SChris Kirby 		 */
2313ca45db41SChris Kirby 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2314fa9e4066Sahrens 	}
2315fa9e4066Sahrens 
23161195e687SMark J Musante 	return (0);
2317fa9e4066Sahrens }
2318fa9e4066Sahrens 
/*
 * Tear the pool down and retry the load with the rewind ceiling lowered
 * by one txg.
 *
 * Used by spa_load_best() to step backward through uberblocks during a
 * rewind import: the pool is fully unloaded and deactivated, the maximum
 * acceptable txg (spa_load_max_txg) is decremented, and the pool is
 * reactivated in its prior mode before spa_load() is attempted again
 * against the existing on-disk configuration.
 *
 * Returns 0 on success, or the errno-style error from spa_load().
 */
static int
spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
{
	int mode = spa->spa_mode;	/* save open mode across reactivation */

	spa_unload(spa);
	spa_deactivate(spa);

	/* Lower the ceiling so the next load picks an earlier uberblock. */
	spa->spa_load_max_txg--;

	spa_activate(spa, mode);
	/* Keep async activity quiesced while the retry load runs. */
	spa_async_suspend(spa);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
}
2334468c413aSTim Haley 
/*
 * Load the pool at the best available txg.
 *
 * First attempts a normal load at max_request (or at the explicitly
 * requested spa_load_txg when state is SPA_LOAD_RECOVER).  If that fails
 * and the rewind policy allows it, repeatedly retries via
 * spa_load_retry(), walking the txg ceiling backward while valid
 * uberblocks remain in range.  With ZPOOL_EXTREME_REWIND the range
 * extends all the way back to TXG_INITIAL; otherwise it stops
 * TXG_DEFER_SIZE txgs behind the last-synced uberblock.
 *
 * Returns the rewind result when recovering, otherwise the original
 * load error.  Unless a recovery succeeded, the config generated from
 * the failed load is stashed back in the spa for error reporting.
 */
static int
spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
    uint64_t max_request, int rewind_flags)
{
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	/*
	 * An explicit load txg (set up by a rewind request) overrides the
	 * caller-supplied ceiling; clearing the log state tells the load
	 * path that discarding the log is acceptable.
	 */
	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
	}

	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
	    mosconfig);
	if (load_error == 0)
		return (0);

	/* Capture the failing config for error reporting before we rewind. */
	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		return (load_error);
	}

	/* Price of rolling back is discarding txgs, including log */
	if (state == SPA_LOAD_RECOVER)
		spa_set_log_state(spa, SPA_LOG_CLEAR);

	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
	    TXG_INITIAL : safe_rewind_txg;

	/*
	 * Continue as long as we're finding errors, we're still within
	 * the acceptable rewind range, and we're still finding uberblocks
	 */
	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
		if (spa->spa_load_max_txg < safe_rewind_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state, mosconfig);
	}

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	/* Restore the saved config unless the recovery itself succeeded. */
	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);

	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
}
2395468c413aSTim Haley 
2396fa9e4066Sahrens /*
2397fa9e4066Sahrens  * Pool Open/Import
2398fa9e4066Sahrens  *
2399fa9e4066Sahrens  * The import case is identical to an open except that the configuration is sent
2400fa9e4066Sahrens  * down from userland, instead of grabbed from the configuration cache.  For the
2401fa9e4066Sahrens  * case of an open, the pool configuration will exist in the
24023d7072f8Seschrock  * POOL_STATE_UNINITIALIZED state.
2403fa9e4066Sahrens  *
2404fa9e4066Sahrens  * The stats information (gen/count/ustats) is used to gather vdev statistics at
2405fa9e4066Sahrens  * the same time we open the pool, without having to keep around the spa_t in
2406fa9e4066Sahrens  * some ambiguous state.
2407fa9e4066Sahrens  */
2408fa9e4066Sahrens static int
2409468c413aSTim Haley spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2410468c413aSTim Haley     nvlist_t **config)
2411fa9e4066Sahrens {
2412fa9e4066Sahrens 	spa_t *spa;
24134b964adaSGeorge Wilson 	spa_load_state_t state = SPA_LOAD_OPEN;
2414fa9e4066Sahrens 	int error;
2415fa9e4066Sahrens 	int locked = B_FALSE;
2416fa9e4066Sahrens 
2417fa9e4066Sahrens 	*spapp = NULL;
2418fa9e4066Sahrens 
2419fa9e4066Sahrens 	/*
2420fa9e4066Sahrens 	 * As disgusting as this is, we need to support recursive calls to this
2421fa9e4066Sahrens 	 * function because dsl_dir_open() is called during spa_load(), and ends
2422fa9e4066Sahrens 	 * up calling spa_open() again.  The real fix is to figure out how to
2423fa9e4066Sahrens 	 * avoid dsl_dir_open() calling this in the first place.
2424fa9e4066Sahrens 	 */
2425fa9e4066Sahrens 	if (mutex_owner(&spa_namespace_lock) != curthread) {
2426fa9e4066Sahrens 		mutex_enter(&spa_namespace_lock);
2427fa9e4066Sahrens 		locked = B_TRUE;
2428fa9e4066Sahrens 	}
2429fa9e4066Sahrens 
2430fa9e4066Sahrens 	if ((spa = spa_lookup(pool)) == NULL) {
2431fa9e4066Sahrens 		if (locked)
2432fa9e4066Sahrens 			mutex_exit(&spa_namespace_lock);
2433fa9e4066Sahrens 		return (ENOENT);
2434fa9e4066Sahrens 	}
2435468c413aSTim Haley 
2436fa9e4066Sahrens 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
24374b44c88cSTim Haley 		zpool_rewind_policy_t policy;
24384b44c88cSTim Haley 
24394b44c88cSTim Haley 		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
24404b44c88cSTim Haley 		    &policy);
24414b44c88cSTim Haley 		if (policy.zrp_request & ZPOOL_DO_REWIND)
24424b44c88cSTim Haley 			state = SPA_LOAD_RECOVER;
2443fa9e4066Sahrens 
24448ad4d6ddSJeff Bonwick 		spa_activate(spa, spa_mode_global);
2445fa9e4066Sahrens 
2446468c413aSTim Haley 		if (state != SPA_LOAD_RECOVER)
2447468c413aSTim Haley 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2448468c413aSTim Haley 
2449468c413aSTim Haley 		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2450c8ee1847SVictor Latushkin 		    policy.zrp_request);
2451fa9e4066Sahrens 
2452fa9e4066Sahrens 		if (error == EBADF) {
2453fa9e4066Sahrens 			/*
2454560e6e96Seschrock 			 * If vdev_validate() returns failure (indicated by
2455560e6e96Seschrock 			 * EBADF), it indicates that one of the vdevs indicates
2456560e6e96Seschrock 			 * that the pool has been exported or destroyed.  If
2457560e6e96Seschrock 			 * this is the case, the config cache is out of sync and
2458560e6e96Seschrock 			 * we should remove the pool from the namespace.
2459fa9e4066Sahrens 			 */
2460fa9e4066Sahrens 			spa_unload(spa);
2461fa9e4066Sahrens 			spa_deactivate(spa);
2462c5904d13Seschrock 			spa_config_sync(spa, B_TRUE, B_TRUE);
2463fa9e4066Sahrens 			spa_remove(spa);
2464fa9e4066Sahrens 			if (locked)
2465fa9e4066Sahrens 				mutex_exit(&spa_namespace_lock);
2466fa9e4066Sahrens 			return (ENOENT);
2467ea8dc4b6Seschrock 		}
2468ea8dc4b6Seschrock 
2469ea8dc4b6Seschrock 		if (error) {
2470fa9e4066Sahrens 			/*
2471fa9e4066Sahrens 			 * We can't open the pool, but we still have useful
2472fa9e4066Sahrens 			 * information: the state of each vdev after the
2473fa9e4066Sahrens 			 * attempted vdev_open().  Return this to the user.
2474fa9e4066Sahrens 			 */
24754b964adaSGeorge Wilson 			if (config != NULL && spa->spa_config) {
2476468c413aSTim Haley 				VERIFY(nvlist_dup(spa->spa_config, config,
2477468c413aSTim Haley 				    KM_SLEEP) == 0);
24784b964adaSGeorge Wilson 				VERIFY(nvlist_add_nvlist(*config,
24794b964adaSGeorge Wilson 				    ZPOOL_CONFIG_LOAD_INFO,
24804b964adaSGeorge Wilson 				    spa->spa_load_info) == 0);
24814b964adaSGeorge Wilson 			}
2482fa9e4066Sahrens 			spa_unload(spa);
2483fa9e4066Sahrens 			spa_deactivate(spa);
2484468c413aSTim Haley 			spa->spa_last_open_failed = error;
2485fa9e4066Sahrens 			if (locked)
2486fa9e4066Sahrens 				mutex_exit(&spa_namespace_lock);
2487fa9e4066Sahrens 			*spapp = NULL;
2488fa9e4066Sahrens 			return (error);
2489fa9e4066Sahrens 		}
2490fa9e4066Sahrens 	}
2491fa9e4066Sahrens 
2492fa9e4066Sahrens 	spa_open_ref(spa, tag);
24933d7072f8Seschrock 
2494468c413aSTim Haley 	if (config != NULL)
2495468c413aSTim Haley 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2496468c413aSTim Haley 
24974b964adaSGeorge Wilson 	/*
24984b964adaSGeorge Wilson 	 * If we've recovered the pool, pass back any information we
24994b964adaSGeorge Wilson 	 * gathered while doing the load.
25004b964adaSGeorge Wilson 	 */
25014b964adaSGeorge Wilson 	if (state == SPA_LOAD_RECOVER) {
25024b964adaSGeorge Wilson 		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
25034b964adaSGeorge Wilson 		    spa->spa_load_info) == 0);
25044b964adaSGeorge Wilson 	}
25054b964adaSGeorge Wilson 
2506a33cae98STim Haley 	if (locked) {
2507a33cae98STim Haley 		spa->spa_last_open_failed = 0;
2508a33cae98STim Haley 		spa->spa_last_ubsync_txg = 0;
2509a33cae98STim Haley 		spa->spa_load_txg = 0;
2510fa9e4066Sahrens 		mutex_exit(&spa_namespace_lock);
2511a33cae98STim Haley 	}
2512fa9e4066Sahrens 
2513fa9e4066Sahrens 	*spapp = spa;
2514fa9e4066Sahrens 
2515fa9e4066Sahrens 	return (0);
2516fa9e4066Sahrens }
2517fa9e4066Sahrens 
/*
 * Open the named pool, permitting rewind recovery as directed by
 * 'policy' (a ZPOOL_REWIND_POLICY nvlist); the generated config,
 * including any recovery information, is returned in *config.
 */
int
spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
    nvlist_t **config)
{
	return (spa_open_common(name, spapp, tag, policy, config));
}
2524468c413aSTim Haley 
/*
 * Standard pool open: no explicit rewind policy and no config passed
 * back.  On success an open reference is held on behalf of 'tag'.
 */
int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL, NULL));
}
2530fa9e4066Sahrens 
2531ea8dc4b6Seschrock /*
2532ea8dc4b6Seschrock  * Lookup the given spa_t, incrementing the inject count in the process,
2533ea8dc4b6Seschrock  * preventing it from being exported or destroyed.
2534ea8dc4b6Seschrock  */
2535ea8dc4b6Seschrock spa_t *
2536ea8dc4b6Seschrock spa_inject_addref(char *name)
2537ea8dc4b6Seschrock {
2538ea8dc4b6Seschrock 	spa_t *spa;
2539ea8dc4b6Seschrock 
2540ea8dc4b6Seschrock 	mutex_enter(&spa_namespace_lock);
2541ea8dc4b6Seschrock 	if ((spa = spa_lookup(name)) == NULL) {
2542ea8dc4b6Seschrock 		mutex_exit(&spa_namespace_lock);
2543ea8dc4b6Seschrock 		return (NULL);
2544ea8dc4b6Seschrock 	}
2545ea8dc4b6Seschrock 	spa->spa_inject_ref++;
2546ea8dc4b6Seschrock 	mutex_exit(&spa_namespace_lock);
2547ea8dc4b6Seschrock 
2548ea8dc4b6Seschrock 	return (spa);
2549ea8dc4b6Seschrock }
2550ea8dc4b6Seschrock 
/*
 * Drop an inject reference previously taken by spa_inject_addref(),
 * again under the namespace lock.
 */
void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}
2558ea8dc4b6Seschrock 
/*
 * Add spares device information to the nvlist.
 *
 * The spare list held in spa_spares.sav_config is copied into the
 * ZPOOL_CONFIG_VDEV_TREE nvlist of 'config'.  Any spare that
 * spa_spare_exists() reports as belonging to an active pool has its
 * vdev stats rewritten to VDEV_STATE_CANT_OPEN / VDEV_AUX_SPARED so
 * userland can see it is currently unavailable.
 *
 * Caller must hold SCL_CONFIG as reader.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		/*
		 * nvlist_add_nvlist_array() stores copies, so re-look up
		 * the array from 'nvroot' to get pointers to those copies;
		 * the in-place status edits below must land in the caller's
		 * config, not in sav_config.
		 */
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}
260799653d4eSeschrock 
/*
 * Add l2cache device information to the nvlist, including vdev stats.
 *
 * Copies the cache-device list from spa_l2cache.sav_config into the
 * ZPOOL_CONFIG_VDEV_TREE nvlist of 'config' and refreshes each entry's
 * ZPOOL_CONFIG_VDEV_STATS array from the corresponding live vdev.
 *
 * Caller must hold SCL_CONFIG as reader.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		/* Re-look up so we update the copies now owned by 'nvroot'. */
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */

		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			/*
			 * Find the live vdev with this guid; sav_config and
			 * sav_vdevs are expected to stay in sync, hence the
			 * ASSERT below.
			 */
			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
			    == 0);
			vdev_get_stats(vd, vs);
		}
	}
}
2662fa94a07fSbrendan 
/*
 * Open the named pool and return its configuration nvlist augmented with
 * load times, error counts, suspension state, and spare/l2cache status.
 *
 * On success *config is set to the pool config (caller frees).  Even when
 * the open fails, if 'altroot' is non-NULL it is filled in with the pool's
 * alternate root (or the empty string if the pool is unknown), so faulted
 * pools still report it.  Returns the error from spa_open_common().
 */
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, NULL, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			uint64_t loadtimes[2];

			/* Record when this pool was loaded (sec, nsec). */
			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
			VERIFY(nvlist_add_uint64_array(*config,
			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);

			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	/* Drop the config lock and the open reference taken above. */
	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}
2728fa9e4066Sahrens 
/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 *
 * 'config' names the nvlist array to validate (ZPOOL_CONFIG_SPARES or
 * ZPOOL_CONFIG_L2CACHE), 'version' is the minimum pool version that
 * supports that device class, and 'label' is the vdev label type to
 * initialize on each device.  Caller must hold all of SCL_ALL as writer.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	/* A present-but-empty array, however, is an error. */
	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		/* Auxiliary devices must be leaves (no mirror/raidz). */
		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		/*
		 * Open and label the device; on success record its guid
		 * back into the caller's nvlist entry.
		 */
		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		/*
		 * For spare/l2cache allocation modes, open/label failures
		 * are tolerated (see block comment above); otherwise they
		 * abort validation.
		 */
		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	/* Always clear the pending list, even on error. */
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}
281499653d4eSeschrock 
2815fa94a07fSbrendan static int
2816fa94a07fSbrendan spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
2817fa94a07fSbrendan {
2818fa94a07fSbrendan 	int error;
2819fa94a07fSbrendan 
2820e14bb325SJeff Bonwick 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2821e14bb325SJeff Bonwick 
2822fa94a07fSbrendan 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2823fa94a07fSbrendan 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
2824fa94a07fSbrendan 	    VDEV_LABEL_SPARE)) != 0) {
2825fa94a07fSbrendan 		return (error);
2826fa94a07fSbrendan 	}
2827fa94a07fSbrendan 
2828fa94a07fSbrendan 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2829fa94a07fSbrendan 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
2830fa94a07fSbrendan 	    VDEV_LABEL_L2CACHE));
2831fa94a07fSbrendan }
2832fa94a07fSbrendan 
/*
 * Replace the 'config' nvlist array in sav->sav_config with the
 * concatenation of any existing entries and the 'ndevs' entries in
 * 'devs'.  Existing entries come first.  All entries are copied into
 * sav_config, so the caller retains ownership of 'devs'.
 */
static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		/* Duplicate the existing entries first, then the new ones. */
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		/* Swap the old array out for the combined one. */
		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		/*
		 * nvlist_add_nvlist_array() stored copies, so the temporary
		 * duplicates can be freed now.
		 */
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}
2878fa94a07fSbrendan 
2879fa94a07fSbrendan /*
2880fa94a07fSbrendan  * Stop and drop level 2 ARC devices
2881fa94a07fSbrendan  */
2882fa94a07fSbrendan void
2883fa94a07fSbrendan spa_l2cache_drop(spa_t *spa)
2884fa94a07fSbrendan {
2885fa94a07fSbrendan 	vdev_t *vd;
2886fa94a07fSbrendan 	int i;
2887fa94a07fSbrendan 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
2888fa94a07fSbrendan 
2889fa94a07fSbrendan 	for (i = 0; i < sav->sav_count; i++) {
2890fa94a07fSbrendan 		uint64_t pool;
2891fa94a07fSbrendan 
2892fa94a07fSbrendan 		vd = sav->sav_vdevs[i];
2893fa94a07fSbrendan 		ASSERT(vd != NULL);
2894fa94a07fSbrendan 
28958ad4d6ddSJeff Bonwick 		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
28968ad4d6ddSJeff Bonwick 		    pool != 0ULL && l2arc_vdev_present(vd))
2897fa94a07fSbrendan 			l2arc_remove_vdev(vd);
2898fa94a07fSbrendan 		if (vd->vdev_isl2cache)
2899fa94a07fSbrendan 			spa_l2cache_remove(vd);
2900fa94a07fSbrendan 		vdev_clear_stats(vd);
2901fa94a07fSbrendan 		(void) vdev_close(vd);
2902fa94a07fSbrendan 	}
2903fa94a07fSbrendan }
2904fa94a07fSbrendan 
/*
 * Pool Creation
 *
 * Create a brand-new pool named 'pool' from the vdev description in
 * 'nvroot'.  'props' (optional) supplies initial pool properties,
 * 'history_str' (optional) is logged as the creating command, and
 * 'zplprops' carries properties for the root dataset.  Returns 0 on
 * success or an errno (e.g. EEXIST if the pool name is already taken).
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version, obj;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	/* Validate the requested properties before going any further. */
	if (props && (error = spa_prop_validate(spa, props))) {
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/* Default to the current on-disk version unless one was requested. */
	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
	    &version) != 0)
		version = SPA_VERSION;
	ASSERT(version <= SPA_VERSION);

	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	/*
	 * Create the vdevs, validate the aux devices, and size/expand the
	 * metaslabs on each top-level child.
	 */
	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_metaslab_set_size(rvd->vdev_child[c]);
			vdev_expand(rvd->vdev_child[c], txg);
		}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	/* Any failure above tears the half-built pool back down. */
	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/* Create the DSL pool and take its meta-objset for the setup below. */
	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	/*
	 * Create DDTs (dedup tables).
	 */
	ddt_create(spa);

	spa_update_dspace(spa);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Record the version the pool was created with. */
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	    sizeof (uint64_t), 1, &version, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bpobj.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, obj,
	    ZIO_COMPRESS_OFF, tx);
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
	    sizeof (uint64_t), 1, &obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bpobj");
	}
	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
	    spa->spa_meta_objset, obj));

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(spa, props, tx);
	}

	dmu_tx_commit(tx);

	/* Start the sync thread for the new pool. */
	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	/* Write this pool's entry into the config cache. */
	spa_config_sync(spa, B_FALSE, B_TRUE);

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
	spa_history_log_version(spa, LOG_POOL_CREATE);

	/* Snapshot the current refcount as this pool's minimum reference. */
	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
}
3125fa9e4066Sahrens 
3126e7cbe64fSgw #ifdef _KERNEL
/*
 * Get the root pool information from the root disk, then import the root pool
 * during the system boot up time.
 */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

/*
 * Read the vdev label from the given boot device and turn it into a
 * complete pool configuration: the label's vdev_tree entry is wrapped
 * in a newly built root vdev and substituted back into the config.
 * The label's ZPOOL_CONFIG_GUID is returned via '*guid'.  Returns the
 * config nvlist (caller frees), or NULL if the label cannot be read.
 */
static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}
3171e7cbe64fSgw 
3172e7cbe64fSgw /*
317321ecdf64SLin Ling  * Walk the vdev tree and see if we can find a device with "better"
317421ecdf64SLin Ling  * configuration. A configuration is "better" if the label on that
317521ecdf64SLin Ling  * device has a more recent txg.
3176051aabe6Staylor  */
317721ecdf64SLin Ling static void
317821ecdf64SLin Ling spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3179051aabe6Staylor {
3180573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++)
318121ecdf64SLin Ling 		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3182051aabe6Staylor 
318321ecdf64SLin Ling 	if (vd->vdev_ops->vdev_op_leaf) {
318421ecdf64SLin Ling 		nvlist_t *label;
318521ecdf64SLin Ling 		uint64_t label_txg;
3186051aabe6Staylor 
318721ecdf64SLin Ling 		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
318821ecdf64SLin Ling 		    &label) != 0)
318921ecdf64SLin Ling 			return;
3190051aabe6Staylor 
319121ecdf64SLin Ling 		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
319221ecdf64SLin Ling 		    &label_txg) == 0);
3193051aabe6Staylor 
319421ecdf64SLin Ling 		/*
319521ecdf64SLin Ling 		 * Do we have a better boot device?
319621ecdf64SLin Ling 		 */
319721ecdf64SLin Ling 		if (label_txg > *txg) {
319821ecdf64SLin Ling 			*txg = label_txg;
319921ecdf64SLin Ling 			*avd = vd;
3200051aabe6Staylor 		}
320121ecdf64SLin Ling 		nvlist_free(label);
3202051aabe6Staylor 	}
3203051aabe6Staylor }
3204051aabe6Staylor 
3205e7cbe64fSgw /*
3206e7cbe64fSgw  * Import a root pool.
3207e7cbe64fSgw  *
3208051aabe6Staylor  * For x86. devpath_list will consist of devid and/or physpath name of
3209051aabe6Staylor  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3210051aabe6Staylor  * The GRUB "findroot" command will return the vdev we should boot.
3211e7cbe64fSgw  *
3212e7cbe64fSgw  * For Sparc, devpath_list consists the physpath name of the booting device
3213e7cbe64fSgw  * no matter the rootpool is a single device pool or a mirrored pool.
3214e7cbe64fSgw  * e.g.
3215e7cbe64fSgw  *	"/pci@1f,0/ide@d/disk@0,0:a"
3216e7cbe64fSgw  */
/*
 * Returns 0 on success, or an errno value:
 *	EIO	- the pool label could not be read from 'devpath'
 *	ENOENT	- the label's config has no vdev matching the label's guid
 *	EINVAL	- a better (higher-txg) boot device exists, or the boot
 *		  device is an inactive spare
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
	if (config == NULL) {
		if (strstr(devpath, "/iscsi/ssd") != NULL) {
			/* iscsi boot */
			get_iscsi_bootpath_phy(devpath);
			config = spa_generate_rootconf(devpath, devid, &guid);
		}
	}
#endif
	if (config == NULL) {
		cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
		    devpath);
		return (EIO);
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	/* Insert verbatim: use the label's config as-is, as at boot. */
	spa = spa_add(pname, config, NULL);
	spa->spa_is_root = B_TRUE;
	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		/*
		 * NOTE(review): on this path the spa added above stays in
		 * the namespace unactivated; a subsequent import attempt
		 * removes it via the spa_lookup()/spa_remove() above --
		 * confirm this is intended.
		 */
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = ENOENT;
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = EINVAL;
		goto out;
	}

	/*
	 * If the boot device is part of a spare vdev then ensure that
	 * we're booting off the active spare.
	 */
	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    !bvd->vdev_isspare) {
		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
		    "try booting from '%s'",
		    bvd->vdev_parent->
		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
		error = EINVAL;
		goto out;
	}

	error = 0;
	spa_history_log_version(spa, LOG_POOL_IMPORT);
out:
	/*
	 * The vdev tree parsed from the label was only needed for the
	 * checks above; tear it down again (vdev_free() requires SCL_ALL).
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_free(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	mutex_exit(&spa_namespace_lock);

	nvlist_free(config);
	return (error);
}
332721ecdf64SLin Ling 
3328e7cbe64fSgw #endif
3329e7cbe64fSgw 
/*
 * Import a non-root pool into the system.
 *
 * 'config' is the pool configuration supplied by the caller; it is the one
 * trusted while loading.  'props' optionally carries pool properties to
 * apply (e.g. altroot, readonly).  'flags' are ZFS_IMPORT_* flags;
 * ZFS_IMPORT_VERBATIM inserts the pool into the namespace without loading
 * it at all.
 *
 * Returns 0 on success, EEXIST if a pool with this name is already
 * imported, or the error from loading the pool / setting properties.
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
	spa_t *spa;
	char *altroot = NULL;
	spa_load_state_t state = SPA_LOAD_IMPORT;
	zpool_rewind_policy_t policy;
	uint64_t mode = spa_mode_global;
	uint64_t readonly = B_FALSE;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	(void) nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
	if (readonly)
		mode = FREAD;
	spa = spa_add(pool, config, altroot);
	spa->spa_import_flags = flags;

	/*
	 * Verbatim import - Take a pool and insert it into the namespace
	 * as if it had been loaded at boot.
	 */
	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
		if (props != NULL)
			spa_configfile_set(spa, props, B_FALSE);

		spa_config_sync(spa, B_FALSE, B_TRUE);

		mutex_exit(&spa_namespace_lock);
		spa_history_log_version(spa, LOG_POOL_IMPORT);

		return (0);
	}

	spa_activate(spa, mode);

	/*
	 * Don't start async tasks until we know everything is healthy.
	 */
	spa_async_suspend(spa);

	zpool_get_rewind_policy(config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;

	/*
	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
	 * because the user-supplied config is actually the one to trust when
	 * doing an import.
	 */
	if (state != SPA_LOAD_RECOVER)
		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
	    policy.zrp_request);

	/*
	 * Propagate anything learned while loading the pool and pass it
	 * back to caller (i.e. rewind info, missing devices, etc).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
	    spa->spa_load_info) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
		spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
		spa_load_l2cache(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	/* Validate the caller-supplied spares/l2cache only if the load succeeded. */
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	if (error != 0 || (props && spa_writeable(spa) &&
	    (error = spa_prop_set(spa, props)))) {
		/* Load or property application failed: tear everything down. */
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa_async_resume(spa);

	/*
	 * Override any spares and level 2 cache devices as specified by
	 * the user, as these may have correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_spares.sav_config)
			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		if (spa->spa_l2cache.sav_config)
			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * Check for any removed devices.
	 */
	if (spa->spa_autoreplace) {
		spa_aux_check_removed(&spa->spa_spares);
		spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
		/*
		 * Update the config cache to include the newly-imported pool.
		 */
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	spa_history_log_version(spa, LOG_POOL_IMPORT);

	return (0);
}
3514c5904d13Seschrock 
/*
 * Probe the pool described by 'tryconfig' without really importing it:
 * the pool is loaded read-only under the reserved TRYIMPORT_NAME, its
 * current config is generated, and the spa is immediately unloaded and
 * removed from the namespace again.
 *
 * Returns a newly allocated config (augmented with the real pool
 * name/state, the uberblock timestamp, the bootfs dataset name when
 * applicable, and the spare/l2cache lists), or NULL if 'tryconfig' lacks
 * a pool name or state, or was not even parsable (no root vdev built).
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				/* Rewrite "$TRYIMPORT_NAME/..." as "$poolname/...". */
				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}
3604fa9e4066Sahrens 
3605fa9e4066Sahrens /*
3606fa9e4066Sahrens  * Pool export/destroy
3607fa9e4066Sahrens  *
3608fa9e4066Sahrens  * The act of destroying or exporting a pool is very simple.  We make sure there
3609fa9e4066Sahrens  * is no more pending I/O and any references to the pool are gone.  Then, we
3610fa9e4066Sahrens  * update the pool state and sync all the labels to disk, removing the
3611394ab0cbSGeorge Wilson  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
3612394ab0cbSGeorge Wilson  * we don't sync the labels or remove the configuration cache.
3613fa9e4066Sahrens  */
/*
 * Common implementation for spa_export(), spa_destroy() and spa_reset().
 *
 * 'new_state' is the pool state to leave behind: POOL_STATE_EXPORTED,
 * POOL_STATE_DESTROYED, or POOL_STATE_UNINITIALIZED for spa_reset(),
 * which unloads the pool but keeps it in the namespace.  If 'oldconfig'
 * is non-NULL it receives a copy of the pool's config.
 *
 * Returns 0 on success, or EROFS (global read-only mode), ENOENT (no such
 * pool), EBUSY (active references), or EXDEV (active shared spare and
 * !force).
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool. At user's own will, such pool can
		 * be forcedly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EXDEV);
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	/* Posted for export, destroy and reset alike. */
	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}
3713fa9e4066Sahrens 
3714fa9e4066Sahrens /*
3715fa9e4066Sahrens  * Destroy a storage pool.
3716fa9e4066Sahrens  */
3717fa9e4066Sahrens int
3718fa9e4066Sahrens spa_destroy(char *pool)
3719fa9e4066Sahrens {
3720394ab0cbSGeorge Wilson 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
3721394ab0cbSGeorge Wilson 	    B_FALSE, B_FALSE));
3722fa9e4066Sahrens }
3723fa9e4066Sahrens 
3724fa9e4066Sahrens /*
3725fa9e4066Sahrens  * Export a storage pool.
3726fa9e4066Sahrens  */
3727fa9e4066Sahrens int
3728394ab0cbSGeorge Wilson spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
3729394ab0cbSGeorge Wilson     boolean_t hardforce)
3730fa9e4066Sahrens {
3731394ab0cbSGeorge Wilson 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
3732394ab0cbSGeorge Wilson 	    force, hardforce));
3733fa9e4066Sahrens }
3734fa9e4066Sahrens 
3735ea8dc4b6Seschrock /*
3736ea8dc4b6Seschrock  * Similar to spa_export(), this unloads the spa_t without actually removing it
3737ea8dc4b6Seschrock  * from the namespace in any way.
3738ea8dc4b6Seschrock  */
3739ea8dc4b6Seschrock int
3740ea8dc4b6Seschrock spa_reset(char *pool)
3741ea8dc4b6Seschrock {
374289a89ebfSlling 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
3743394ab0cbSGeorge Wilson 	    B_FALSE, B_FALSE));
3744ea8dc4b6Seschrock }
3745ea8dc4b6Seschrock 
3746fa9e4066Sahrens /*
3747fa9e4066Sahrens  * ==========================================================================
3748fa9e4066Sahrens  * Device manipulation
3749fa9e4066Sahrens  * ==========================================================================
3750fa9e4066Sahrens  */
3751fa9e4066Sahrens 
/*
 * Add a device to a storage pool.
 *
 * 'nvroot' describes the new top-level vdevs and/or spare and l2cache
 * devices to add.  All modification happens between spa_vdev_enter() and
 * spa_vdev_exit(); afterwards the config cache is refreshed under
 * spa_namespace_lock.  Returns 0, EINVAL if 'nvroot' names nothing to
 * add, or the error from config parsing/creation/validation.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	/* Nothing to add at all is an error. */
	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists.
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		/* If no hole was found, 'id' == rvd->vdev_children (append). */
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
3853fa9e4066Sahrens 
3854fa9e4066Sahrens /*
3855fa9e4066Sahrens  * Attach a device to a mirror.  The arguments are the path to any device
3856fa9e4066Sahrens  * in the mirror, and the nvroot for the new device.  If the path specifies
3857fa9e4066Sahrens  * a device that is not mirrored, we automatically insert the mirror vdev.
3858fa9e4066Sahrens  *
3859fa9e4066Sahrens  * If 'replacing' is specified, the new device is intended to replace the
3860fa9e4066Sahrens  * existing device; in this case the two devices are made into their own
38613d7072f8Seschrock  * mirror using the 'replacing' vdev, which is functionally identical to
3862fa9e4066Sahrens  * the mirror vdev (it actually reuses all the same ops) but has a few
3863fa9e4066Sahrens  * extra rules: you can't attach to it after it's been created, and upon
3864fa9e4066Sahrens  * completion of resilvering, the first disk (the one being replaced)
3865fa9e4066Sahrens  * is automatically detached.
 *
 * 'guid' identifies the existing leaf vdev being attached to (or replaced);
 * 'nvroot' must describe exactly one new leaf vdev (EINVAL otherwise).
 * Returns 0 on success or an errno value; every exit path, success or
 * failure, unwinds through spa_vdev_exit() to release the config lock
 * taken by spa_vdev_enter() below.
3866fa9e4066Sahrens  */
3867fa9e4066Sahrens int
3868ea8dc4b6Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
3869fa9e4066Sahrens {
38703f9d6ad7SLin Ling 	uint64_t txg, dtl_max_txg;
3871fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
3872fa9e4066Sahrens 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
387399653d4eSeschrock 	vdev_ops_t *pvops;
38749b3f6b42SEric Kustarz 	char *oldvdpath, *newvdpath;
38759b3f6b42SEric Kustarz 	int newvd_isspare;
38769b3f6b42SEric Kustarz 	int error;
3877fa9e4066Sahrens 
3878f9af39baSGeorge Wilson 	ASSERT(spa_writeable(spa));
3879f9af39baSGeorge Wilson 
	/* Take the config lock; all returns below must go via spa_vdev_exit(). */
3880fa9e4066Sahrens 	txg = spa_vdev_enter(spa);
3881fa9e4066Sahrens 
3882c5904d13Seschrock 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
3883fa9e4066Sahrens 
3884fa9e4066Sahrens 	if (oldvd == NULL)
3885fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3886fa9e4066Sahrens 
	/* Attach/replace only makes sense for leaf vdevs (actual devices). */
38870e34b6a7Sbonwick 	if (!oldvd->vdev_ops->vdev_op_leaf)
38880e34b6a7Sbonwick 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
38890e34b6a7Sbonwick 
3890fa9e4066Sahrens 	pvd = oldvd->vdev_parent;
3891fa9e4066Sahrens 
389299653d4eSeschrock 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
38933d7072f8Seschrock 	    VDEV_ALLOC_ADD)) != 0)
38943d7072f8Seschrock 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
38953d7072f8Seschrock 
	/* The caller-supplied nvroot must contain exactly one new vdev. */
38963d7072f8Seschrock 	if (newrootvd->vdev_children != 1)
3897fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3898fa9e4066Sahrens 
3899fa9e4066Sahrens 	newvd = newrootvd->vdev_child[0];
3900fa9e4066Sahrens 
3901fa9e4066Sahrens 	if (!newvd->vdev_ops->vdev_op_leaf)
3902fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3903fa9e4066Sahrens 
390499653d4eSeschrock 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
3905fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, error));
3906fa9e4066Sahrens 
39078654d025Sperrin 	/*
39088654d025Sperrin 	 * Spares can't replace logs
39098654d025Sperrin 	 */
3910ee0eb9f2SEric Schrock 	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
39118654d025Sperrin 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
39128654d025Sperrin 
391399653d4eSeschrock 	if (!replacing) {
391499653d4eSeschrock 		/*
391599653d4eSeschrock 		 * For attach, the only allowable parent is a mirror or the root
391699653d4eSeschrock 		 * vdev.
391799653d4eSeschrock 		 */
391899653d4eSeschrock 		if (pvd->vdev_ops != &vdev_mirror_ops &&
391999653d4eSeschrock 		    pvd->vdev_ops != &vdev_root_ops)
392099653d4eSeschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
392199653d4eSeschrock 
392299653d4eSeschrock 		pvops = &vdev_mirror_ops;
392399653d4eSeschrock 	} else {
392499653d4eSeschrock 		/*
392599653d4eSeschrock 		 * Active hot spares can only be replaced by inactive hot
392699653d4eSeschrock 		 * spares.
392799653d4eSeschrock 		 */
392899653d4eSeschrock 		if (pvd->vdev_ops == &vdev_spare_ops &&
3929cb04b873SMark J Musante 		    oldvd->vdev_isspare &&
393099653d4eSeschrock 		    !spa_has_spare(spa, newvd->vdev_guid))
393199653d4eSeschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
393299653d4eSeschrock 
393399653d4eSeschrock 		/*
393499653d4eSeschrock 		 * If the source is a hot spare, and the parent isn't already a
393599653d4eSeschrock 		 * spare, then we want to create a new hot spare.  Otherwise, we
393699653d4eSeschrock 		 * want to create a replacing vdev.  The user is not allowed to
393739c23413Seschrock 		 * attach to a spared vdev child unless the 'isspare' state is
393839c23413Seschrock 		 * the same (spare replaces spare, non-spare replaces
393939c23413Seschrock 		 * non-spare).
394099653d4eSeschrock 		 */
		/*
		 * Nesting one replacing vdev inside another requires the
		 * SPA_VERSION_MULTI_REPLACE on-disk feature; older pools
		 * get ENOTSUP here.
		 */
3941cb04b873SMark J Musante 		if (pvd->vdev_ops == &vdev_replacing_ops &&
3942cb04b873SMark J Musante 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
394399653d4eSeschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3944cb04b873SMark J Musante 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
3945cb04b873SMark J Musante 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
394639c23413Seschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3947cb04b873SMark J Musante 		}
3948cb04b873SMark J Musante 
3949cb04b873SMark J Musante 		if (newvd->vdev_isspare)
395099653d4eSeschrock 			pvops = &vdev_spare_ops;
395199653d4eSeschrock 		else
395299653d4eSeschrock 			pvops = &vdev_replacing_ops;
395399653d4eSeschrock 	}
395499653d4eSeschrock 
39552a79c5feSlling 	/*
3956573ca77eSGeorge Wilson 	 * Make sure the new device is big enough.
39572a79c5feSlling 	 */
3958573ca77eSGeorge Wilson 	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
3959fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
3960fa9e4066Sahrens 
3961ecc2d604Sbonwick 	/*
3962ecc2d604Sbonwick 	 * The new device cannot have a higher alignment requirement
3963ecc2d604Sbonwick 	 * than the top-level vdev.
3964ecc2d604Sbonwick 	 */
3965ecc2d604Sbonwick 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
3966fa9e4066Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
3967fa9e4066Sahrens 
3968fa9e4066Sahrens 	/*
3969fa9e4066Sahrens 	 * If this is an in-place replacement, update oldvd's path and devid
3970fa9e4066Sahrens 	 * to make it distinguishable from newvd, and unopenable from now on.
3971fa9e4066Sahrens 	 */
	/*
	 * NOTE(review): both vdev_path pointers are assumed non-NULL here;
	 * leaves looked up by guid normally carry a path, but confirm for
	 * device types that may lack one before relying on this.
	 */
3972fa9e4066Sahrens 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
3973fa9e4066Sahrens 		spa_strfree(oldvd->vdev_path);
		/* "+ 5" = '/' + "old" + terminating NUL for the sprintf below. */
3974fa9e4066Sahrens 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
3975fa9e4066Sahrens 		    KM_SLEEP);
3976fa9e4066Sahrens 		(void) sprintf(oldvd->vdev_path, "%s/%s",
3977fa9e4066Sahrens 		    newvd->vdev_path, "old");
3978fa9e4066Sahrens 		if (oldvd->vdev_devid != NULL) {
3979fa9e4066Sahrens 			spa_strfree(oldvd->vdev_devid);
3980fa9e4066Sahrens 			oldvd->vdev_devid = NULL;
3981fa9e4066Sahrens 		}
3982fa9e4066Sahrens 	}
3983fa9e4066Sahrens 
3984cb04b873SMark J Musante 	/* mark the device being resilvered */
3985cb04b873SMark J Musante 	newvd->vdev_resilvering = B_TRUE;
3986cb04b873SMark J Musante 
3987fa9e4066Sahrens 	/*
398899653d4eSeschrock 	 * If the parent is not a mirror, or if we're replacing, insert the new
398999653d4eSeschrock 	 * mirror/replacing/spare vdev above oldvd.
3990fa9e4066Sahrens 	 */
3991fa9e4066Sahrens 	if (pvd->vdev_ops != pvops)
3992fa9e4066Sahrens 		pvd = vdev_add_parent(oldvd, pvops);
3993fa9e4066Sahrens 
3994fa9e4066Sahrens 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
3995fa9e4066Sahrens 	ASSERT(pvd->vdev_ops == pvops);
3996fa9e4066Sahrens 	ASSERT(oldvd->vdev_parent == pvd);
3997fa9e4066Sahrens 
3998fa9e4066Sahrens 	/*
3999fa9e4066Sahrens 	 * Extract the new device from its root and add it to pvd.
4000fa9e4066Sahrens 	 */
4001fa9e4066Sahrens 	vdev_remove_child(newrootvd, newvd);
4002fa9e4066Sahrens 	newvd->vdev_id = pvd->vdev_children;
	/*
	 * NOTE(review): newvd inherits oldvd's creation txg -- presumably so
	 * that txg-based accounting treats the replacement like the original
	 * device; confirm against vdev_crtxg consumers.
	 */
400388ecc943SGeorge Wilson 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
4004fa9e4066Sahrens 	vdev_add_child(pvd, newvd);
4005fa9e4066Sahrens 
4006fa9e4066Sahrens 	tvd = newvd->vdev_top;
4007fa9e4066Sahrens 	ASSERT(pvd->vdev_top == tvd);
4008fa9e4066Sahrens 	ASSERT(tvd->vdev_parent == rvd);
4009fa9e4066Sahrens 
4010fa9e4066Sahrens 	vdev_config_dirty(tvd);
4011fa9e4066Sahrens 
4012fa9e4066Sahrens 	/*
40133f9d6ad7SLin Ling 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
40143f9d6ad7SLin Ling 	 * for any dmu_sync-ed blocks.  It will propagate upward when
40153f9d6ad7SLin Ling 	 * spa_vdev_exit() calls vdev_dtl_reassess().
4016fa9e4066Sahrens 	 */
40173f9d6ad7SLin Ling 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4018fa9e4066Sahrens 
40193f9d6ad7SLin Ling 	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
40203f9d6ad7SLin Ling 	    dtl_max_txg - TXG_INITIAL);
4021fa9e4066Sahrens 
40226809eb4eSEric Schrock 	if (newvd->vdev_isspare) {
402339c23413Seschrock 		spa_spare_activate(newvd);
40246809eb4eSEric Schrock 		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
40256809eb4eSEric Schrock 	}
40266809eb4eSEric Schrock 
	/*
	 * Duplicate the paths now: spa_vdev_exit() below may tear down state,
	 * so the history log after it must not reference the vdevs directly.
	 */
4027e14bb325SJeff Bonwick 	oldvdpath = spa_strdup(oldvd->vdev_path);
4028e14bb325SJeff Bonwick 	newvdpath = spa_strdup(newvd->vdev_path);
40299b3f6b42SEric Kustarz 	newvd_isspare = newvd->vdev_isspare;
4030ea8dc4b6Seschrock 
4031fa9e4066Sahrens 	/*
4032fa9e4066Sahrens 	 * Mark newvd's DTL dirty in this txg.
4033fa9e4066Sahrens 	 */
4034ecc2d604Sbonwick 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
4035fa9e4066Sahrens 
40363f9d6ad7SLin Ling 	/*
40373f9d6ad7SLin Ling 	 * Restart the resilver
40383f9d6ad7SLin Ling 	 */
40393f9d6ad7SLin Ling 	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
40403f9d6ad7SLin Ling 
40413f9d6ad7SLin Ling 	/*
40423f9d6ad7SLin Ling 	 * Commit the config
40433f9d6ad7SLin Ling 	 */
40443f9d6ad7SLin Ling 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4045fa9e4066Sahrens 
40463f9d6ad7SLin Ling 	spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
40473f9d6ad7SLin Ling 	    "%s vdev=%s %s vdev=%s",
4048c8e1f6d2SMark J Musante 	    replacing && newvd_isspare ? "spare in" :
4049c8e1f6d2SMark J Musante 	    replacing ? "replace" : "attach", newvdpath,
4050c8e1f6d2SMark J Musante 	    replacing ? "for" : "to", oldvdpath);
40519b3f6b42SEric Kustarz 
40529b3f6b42SEric Kustarz 	spa_strfree(oldvdpath);
40539b3f6b42SEric Kustarz 	spa_strfree(newvdpath);
40549b3f6b42SEric Kustarz 
4055943e9869SLori Alt 	if (spa->spa_bootfs)
4056943e9869SLori Alt 		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4057943e9869SLori Alt 
4058fa9e4066Sahrens 	return (0);
4059fa9e4066Sahrens }
4060fa9e4066Sahrens 
4061fa9e4066Sahrens /*
4062fa9e4066Sahrens  * Detach a device from a mirror or replacing vdev.
4063fa9e4066Sahrens  * If 'replace_done' is specified, only detach if the parent
4064fa9e4066Sahrens  * is a replacing vdev.
 *
 * 'guid' names the leaf vdev to detach.  'pguid', if nonzero, must match
 * the current parent's guid -- this guards against racing topology changes
 * (see the M(A,R(B,C)) discussion below).  Returns 0 on success or an
 * errno value; all paths release the config lock via spa_vdev_exit().
4065fa9e4066Sahrens  */
4066fa9e4066Sahrens int
40678ad4d6ddSJeff Bonwick spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4068fa9e4066Sahrens {
4069fa9e4066Sahrens 	uint64_t txg;
40708ad4d6ddSJeff Bonwick 	int error;
4071fa9e4066Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
4072fa9e4066Sahrens 	vdev_t *vd, *pvd, *cvd, *tvd;
407399653d4eSeschrock 	boolean_t unspare = B_FALSE;
407499653d4eSeschrock 	uint64_t unspare_guid;
40751195e687SMark J Musante 	char *vdpath;
4076fa9e4066Sahrens 
4077f9af39baSGeorge Wilson 	ASSERT(spa_writeable(spa));
4078f9af39baSGeorge Wilson 
4079fa9e4066Sahrens 	txg = spa_vdev_enter(spa);
4080fa9e4066Sahrens 
4081c5904d13Seschrock 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4082fa9e4066Sahrens 
4083fa9e4066Sahrens 	if (vd == NULL)
4084fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4085fa9e4066Sahrens 
	/* Only leaf vdevs (actual devices) can be detached. */
40860e34b6a7Sbonwick 	if (!vd->vdev_ops->vdev_op_leaf)
40870e34b6a7Sbonwick 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
40880e34b6a7Sbonwick 
4089fa9e4066Sahrens 	pvd = vd->vdev_parent;
4090fa9e4066Sahrens 
40918ad4d6ddSJeff Bonwick 	/*
40928ad4d6ddSJeff Bonwick 	 * If the parent/child relationship is not as expected, don't do it.
40938ad4d6ddSJeff Bonwick 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
40948ad4d6ddSJeff Bonwick 	 * vdev that's replacing B with C.  The user's intent in replacing
40958ad4d6ddSJeff Bonwick 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
40968ad4d6ddSJeff Bonwick 	 * the replace by detaching C, the expected behavior is to end up
40978ad4d6ddSJeff Bonwick 	 * M(A,B).  But suppose that right after deciding to detach C,
40988ad4d6ddSJeff Bonwick 	 * the replacement of B completes.  We would have M(A,C), and then
40998ad4d6ddSJeff Bonwick 	 * ask to detach C, which would leave us with just A -- not what
41008ad4d6ddSJeff Bonwick 	 * the user wanted.  To prevent this, we make sure that the
41018ad4d6ddSJeff Bonwick 	 * parent/child relationship hasn't changed -- in this example,
41028ad4d6ddSJeff Bonwick 	 * that C's parent is still the replacing vdev R.
41038ad4d6ddSJeff Bonwick 	 */
41048ad4d6ddSJeff Bonwick 	if (pvd->vdev_guid != pguid && pguid != 0)
41058ad4d6ddSJeff Bonwick 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
41068ad4d6ddSJeff Bonwick 
4107fa9e4066Sahrens 	/*
4108cb04b873SMark J Musante 	 * Only 'replacing' or 'spare' vdevs can be replaced.
410999653d4eSeschrock 	 */
4110cb04b873SMark J Musante 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4111cb04b873SMark J Musante 	    pvd->vdev_ops != &vdev_spare_ops)
4112cb04b873SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
411399653d4eSeschrock 
411499653d4eSeschrock 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4115e7437265Sahrens 	    spa_version(spa) >= SPA_VERSION_SPARES);
4116fa9e4066Sahrens 
4117fa9e4066Sahrens 	/*
411899653d4eSeschrock 	 * Only mirror, replacing, and spare vdevs support detach.
4119fa9e4066Sahrens 	 */
4120fa9e4066Sahrens 	if (pvd->vdev_ops != &vdev_replacing_ops &&
412199653d4eSeschrock 	    pvd->vdev_ops != &vdev_mirror_ops &&
412299653d4eSeschrock 	    pvd->vdev_ops != &vdev_spare_ops)
4123fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4124fa9e4066Sahrens 
4125fa9e4066Sahrens 	/*
41268ad4d6ddSJeff Bonwick 	 * If this device has the only valid copy of some data,
41278ad4d6ddSJeff Bonwick 	 * we cannot safely detach it.
4128fa9e4066Sahrens 	 */
41298ad4d6ddSJeff Bonwick 	if (vdev_dtl_required(vd))
4130fa9e4066Sahrens 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4131fa9e4066Sahrens 
41328ad4d6ddSJeff Bonwick 	ASSERT(pvd->vdev_children >= 2);
4133fa9e4066Sahrens 
4134bf82a41bSeschrock 	/*
4135bf82a41bSeschrock 	 * If we are detaching the second disk from a replacing vdev, then
4136bf82a41bSeschrock 	 * check to see if we changed the original vdev's path to have "/old"
4137bf82a41bSeschrock 	 * at the end in spa_vdev_attach().  If so, undo that change now.
4138bf82a41bSeschrock 	 */
4139cb04b873SMark J Musante 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4140cb04b873SMark J Musante 	    vd->vdev_path != NULL) {
4141cb04b873SMark J Musante 		size_t len = strlen(vd->vdev_path);
4142cb04b873SMark J Musante 
4143cb04b873SMark J Musante 		for (int c = 0; c < pvd->vdev_children; c++) {
4144cb04b873SMark J Musante 			cvd = pvd->vdev_child[c];
4145cb04b873SMark J Musante 
4146cb04b873SMark J Musante 			if (cvd == vd || cvd->vdev_path == NULL)
4147cb04b873SMark J Musante 				continue;
4148cb04b873SMark J Musante 
			/* Sibling path is "<vd's path>/old": restore the original name. */
4149cb04b873SMark J Musante 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4150cb04b873SMark J Musante 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
4151cb04b873SMark J Musante 				spa_strfree(cvd->vdev_path);
4152cb04b873SMark J Musante 				cvd->vdev_path = spa_strdup(vd->vdev_path);
4153cb04b873SMark J Musante 				break;
4154cb04b873SMark J Musante 			}
4155bf82a41bSeschrock 		}
4156bf82a41bSeschrock 	}
4157bf82a41bSeschrock 
415899653d4eSeschrock 	/*
415999653d4eSeschrock 	 * If we are detaching the original disk from a spare, then it implies
416099653d4eSeschrock 	 * that the spare should become a real disk, and be removed from the
416199653d4eSeschrock 	 * active spare list for the pool.
416299653d4eSeschrock 	 */
416399653d4eSeschrock 	if (pvd->vdev_ops == &vdev_spare_ops &&
4164cb04b873SMark J Musante 	    vd->vdev_id == 0 &&
4165cb04b873SMark J Musante 	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
416699653d4eSeschrock 		unspare = B_TRUE;
416799653d4eSeschrock 
4168fa9e4066Sahrens 	/*
4169fa9e4066Sahrens 	 * Erase the disk labels so the disk can be used for other things.
4170fa9e4066Sahrens 	 * This must be done after all other error cases are handled,
4171fa9e4066Sahrens 	 * but before we disembowel vd (so we can still do I/O to it).
4172fa9e4066Sahrens 	 * But if we can't do it, don't treat the error as fatal --
4173fa9e4066Sahrens 	 * it may be that the unwritability of the disk is the reason
4174fa9e4066Sahrens 	 * it's being detached!
4175fa9e4066Sahrens 	 */
	/* Deliberately best-effort: the result is ignored, per the comment above. */
417639c23413Seschrock 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4177fa9e4066Sahrens 
4178fa9e4066Sahrens 	/*
4179fa9e4066Sahrens 	 * Remove vd from its parent and compact the parent's children.
4180fa9e4066Sahrens 	 */
4181fa9e4066Sahrens 	vdev_remove_child(pvd, vd);
4182fa9e4066Sahrens 	vdev_compact_children(pvd);
4183fa9e4066Sahrens 
4184fa9e4066Sahrens 	/*
4185fa9e4066Sahrens 	 * Remember one of the remaining children so we can get tvd below.
4186fa9e4066Sahrens 	 */
4187cb04b873SMark J Musante 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
4188fa9e4066Sahrens 
418999653d4eSeschrock 	/*
419099653d4eSeschrock 	 * If we need to remove the remaining child from the list of hot spares,
41918ad4d6ddSJeff Bonwick 	 * do it now, marking the vdev as no longer a spare in the process.
41928ad4d6ddSJeff Bonwick 	 * We must do this before vdev_remove_parent(), because that can
41938ad4d6ddSJeff Bonwick 	 * change the GUID if it creates a new toplevel GUID.  For a similar
41948ad4d6ddSJeff Bonwick 	 * reason, we must remove the spare now, in the same txg as the detach;
41958ad4d6ddSJeff Bonwick 	 * otherwise someone could attach a new sibling, change the GUID, and
41968ad4d6ddSJeff Bonwick 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
419799653d4eSeschrock 	 */
419899653d4eSeschrock 	if (unspare) {
419999653d4eSeschrock 		ASSERT(cvd->vdev_isspare);
420039c23413Seschrock 		spa_spare_remove(cvd);
420199653d4eSeschrock 		unspare_guid = cvd->vdev_guid;
42028ad4d6ddSJeff Bonwick 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4203cb04b873SMark J Musante 		cvd->vdev_unspare = B_TRUE;
420499653d4eSeschrock 	}
420599653d4eSeschrock 
4206fa9e4066Sahrens 	/*
4207fa9e4066Sahrens 	 * If the parent mirror/replacing vdev only has one child,
4208fa9e4066Sahrens 	 * the parent is no longer needed.  Remove it from the tree.
4209fa9e4066Sahrens 	 */
4210cb04b873SMark J Musante 	if (pvd->vdev_children == 1) {
4211cb04b873SMark J Musante 		if (pvd->vdev_ops == &vdev_spare_ops)
4212cb04b873SMark J Musante 			cvd->vdev_unspare = B_FALSE;
4213fa9e4066Sahrens 		vdev_remove_parent(cvd);
4214cb04b873SMark J Musante 		cvd->vdev_resilvering = B_FALSE;
4215cb04b873SMark J Musante 	}
4216cb04b873SMark J Musante 
4217fa9e4066Sahrens 
4218fa9e4066Sahrens 	/*
4219fa9e4066Sahrens 	 * We don't set tvd until now because the parent we just removed
4220fa9e4066Sahrens 	 * may have been the previous top-level vdev.
4221fa9e4066Sahrens 	 */
4222fa9e4066Sahrens 	tvd = cvd->vdev_top;
4223fa9e4066Sahrens 	ASSERT(tvd->vdev_parent == rvd);
4224fa9e4066Sahrens 
4225fa9e4066Sahrens 	/*
422639c23413Seschrock 	 * Reevaluate the parent vdev state.
4227fa9e4066Sahrens 	 */
42283d7072f8Seschrock 	vdev_propagate_state(cvd);
4229fa9e4066Sahrens 
4230fa9e4066Sahrens 	/*
4231573ca77eSGeorge Wilson 	 * If the 'autoexpand' property is set on the pool then automatically
4232573ca77eSGeorge Wilson 	 * try to expand the size of the pool. For example if the device we
4233573ca77eSGeorge Wilson 	 * just detached was smaller than the others, it may be possible to
4234573ca77eSGeorge Wilson 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4235573ca77eSGeorge Wilson 	 * first so that we can obtain the updated sizes of the leaf vdevs.
4236fa9e4066Sahrens 	 */
4237573ca77eSGeorge Wilson 	if (spa->spa_autoexpand) {
4238573ca77eSGeorge Wilson 		vdev_reopen(tvd);
4239573ca77eSGeorge Wilson 		vdev_expand(tvd, txg);
4240573ca77eSGeorge Wilson 	}
4241fa9e4066Sahrens 
4242fa9e4066Sahrens 	vdev_config_dirty(tvd);
4243fa9e4066Sahrens 
4244fa9e4066Sahrens 	/*
424539c23413Seschrock 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
424639c23413Seschrock 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
424739c23413Seschrock 	 * But first make sure we're not on any *other* txg's DTL list, to
424839c23413Seschrock 	 * prevent vd from being accessed after it's freed.
4249fa9e4066Sahrens 	 */
	/* Copy the path now; vd may be freed once this txg syncs. */
42501195e687SMark J Musante 	vdpath = spa_strdup(vd->vdev_path);
42518ad4d6ddSJeff Bonwick 	for (int t = 0; t < TXG_SIZE; t++)
4252fa9e4066Sahrens 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4253ecc2d604Sbonwick 	vd->vdev_detached = B_TRUE;
4254ecc2d604Sbonwick 	vdev_dirty(tvd, VDD_DTL, vd, txg);
4255fa9e4066Sahrens 
42563d7072f8Seschrock 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
42573d7072f8Seschrock 
4258cb04b873SMark J Musante 	/* hang on to the spa before we release the lock */
4259cb04b873SMark J Musante 	spa_open_ref(spa, FTAG);
4260cb04b873SMark J Musante 
426199653d4eSeschrock 	error = spa_vdev_exit(spa, vd, txg, 0);
426299653d4eSeschrock 
42633f9d6ad7SLin Ling 	spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
42641195e687SMark J Musante 	    "vdev=%s", vdpath);
42651195e687SMark J Musante 	spa_strfree(vdpath);
42661195e687SMark J Musante 
426799653d4eSeschrock 	/*
426839c23413Seschrock 	 * If this was the removal of the original device in a hot spare vdev,
426939c23413Seschrock 	 * then we want to go through and remove the device from the hot spare
427039c23413Seschrock 	 * list of every other pool.
427199653d4eSeschrock 	 */
427299653d4eSeschrock 	if (unspare) {
4273cb04b873SMark J Musante 		spa_t *altspa = NULL;
4274cb04b873SMark J Musante 
427599653d4eSeschrock 		mutex_enter(&spa_namespace_lock);
4276cb04b873SMark J Musante 		while ((altspa = spa_next(altspa)) != NULL) {
4277cb04b873SMark J Musante 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
4278cb04b873SMark J Musante 			    altspa == spa)
427999653d4eSeschrock 				continue;
4280cb04b873SMark J Musante 
			/*
			 * Hold altspa across the namespace-lock drop so it
			 * can't disappear while we remove the spare from it.
			 */
4281cb04b873SMark J Musante 			spa_open_ref(altspa, FTAG);
42829af0a4dfSJeff Bonwick 			mutex_exit(&spa_namespace_lock);
4283cb04b873SMark J Musante 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
42849af0a4dfSJeff Bonwick 			mutex_enter(&spa_namespace_lock);
4285cb04b873SMark J Musante 			spa_close(altspa, FTAG);
428699653d4eSeschrock 		}
428799653d4eSeschrock 		mutex_exit(&spa_namespace_lock);
4288cb04b873SMark J Musante 
4289cb04b873SMark J Musante 		/* search the rest of the vdevs for spares to remove */
4290cb04b873SMark J Musante 		spa_vdev_resilver_done(spa);
429199653d4eSeschrock 	}
429299653d4eSeschrock 
4293cb04b873SMark J Musante 	/* all done with the spa; OK to release */
4294cb04b873SMark J Musante 	mutex_enter(&spa_namespace_lock);
4295cb04b873SMark J Musante 	spa_close(spa, FTAG);
4296cb04b873SMark J Musante 	mutex_exit(&spa_namespace_lock);
4297cb04b873SMark J Musante 
429899653d4eSeschrock 	return (error);
429999653d4eSeschrock }
430099653d4eSeschrock 
43011195e687SMark J Musante /*
43021195e687SMark J Musante  * Split a set of devices from their mirrors, and create a new pool from them.
43031195e687SMark J Musante  */
43041195e687SMark J Musante int
43051195e687SMark J Musante spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
43061195e687SMark J Musante     nvlist_t *props, boolean_t exp)
43071195e687SMark J Musante {
43081195e687SMark J Musante 	int error = 0;
43091195e687SMark J Musante 	uint64_t txg, *glist;
43101195e687SMark J Musante 	spa_t *newspa;
43111195e687SMark J Musante 	uint_t c, children, lastlog;
43121195e687SMark J Musante 	nvlist_t **child, *nvl, *tmp;
43131195e687SMark J Musante 	dmu_tx_t *tx;
43141195e687SMark J Musante 	char *altroot = NULL;
43151195e687SMark J Musante 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
43161195e687SMark J Musante 	boolean_t activate_slog;
43171195e687SMark J Musante 
4318f9af39baSGeorge Wilson 	ASSERT(spa_writeable(spa));
43191195e687SMark J Musante 
43201195e687SMark J Musante 	txg = spa_vdev_enter(spa);
43211195e687SMark J Musante 
43221195e687SMark J Musante 	/* clear the log and flush everything up to now */
43231195e687SMark J Musante 	activate_slog = spa_passivate_log(spa);
43241195e687SMark J Musante 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
43251195e687SMark J Musante 	error = spa_offline_log(spa);
43261195e687SMark J Musante 	txg = spa_vdev_config_enter(spa);
43271195e687SMark J Musante 
43281195e687SMark J Musante 	if (activate_slog)
43291195e687SMark J Musante 		spa_activate_log(spa);
43301195e687SMark J Musante 
43311195e687SMark J Musante 	if (error != 0)
43321195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, error));
43331195e687SMark J Musante 
43341195e687SMark J Musante 	/* check new spa name before going any further */
43351195e687SMark J Musante 	if (spa_lookup(newname) != NULL)
43361195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
43371195e687SMark J Musante 
43381195e687SMark J Musante 	/*
43391195e687SMark J Musante 	 * scan through all the children to ensure they're all mirrors
43401195e687SMark J Musante 	 */
43411195e687SMark J Musante 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
43421195e687SMark J Musante 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
43431195e687SMark J Musante 	    &children) != 0)
43441195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
43451195e687SMark J Musante 
43461195e687SMark J Musante 	/* first, check to ensure we've got the right child count */
43471195e687SMark J Musante 	rvd = spa->spa_root_vdev;
43481195e687SMark J Musante 	lastlog = 0;
43491195e687SMark J Musante 	for (c = 0; c < rvd->vdev_children; c++) {
43501195e687SMark J Musante 		vdev_t *vd = rvd->vdev_child[c];
43511195e687SMark J Musante 
43521195e687SMark J Musante 		/* don't count the holes & logs as children */
43531195e687SMark J Musante 		if (vd->vdev_islog || vd->vdev_ishole) {
43541195e687SMark J Musante 			if (lastlog == 0)
43551195e687SMark J Musante 				lastlog = c;
43561195e687SMark J Musante 			continue;
43571195e687SMark J Musante 		}
43581195e687SMark J Musante 
43591195e687SMark J Musante 		lastlog = 0;
43601195e687SMark J Musante 	}
43611195e687SMark J Musante 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
43621195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
43631195e687SMark J Musante 
43641195e687SMark J Musante 	/* next, ensure no spare or cache devices are part of the split */
43651195e687SMark J Musante 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
43661195e687SMark J Musante 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
43671195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
43681195e687SMark J Musante 
43691195e687SMark J Musante 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
43701195e687SMark J Musante 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
43711195e687SMark J Musante 
43721195e687SMark J Musante 	/* then, loop over each vdev and validate it */
43731195e687SMark J Musante 	for (c = 0; c < children; c++) {
43741195e687SMark J Musante 		uint64_t is_hole = 0;
43751195e687SMark J Musante 
43761195e687SMark J Musante 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
43771195e687SMark J Musante 		    &is_hole);
43781195e687SMark J Musante 
43791195e687SMark J Musante 		if (is_hole != 0) {
43801195e687SMark J Musante 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
43811195e687SMark J Musante 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
43821195e687SMark J Musante 				continue;
43831195e687SMark J Musante 			} else {
43841195e687SMark J Musante 				error = EINVAL;
43851195e687SMark J Musante 				break;
43861195e687SMark J Musante 			}
43871195e687SMark J Musante 		}
43881195e687SMark J Musante 
43891195e687SMark J Musante 		/* which disk is going to be split? */
43901195e687SMark J Musante 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
43911195e687SMark J Musante 		    &glist[c]) != 0) {
43921195e687SMark J Musante 			error = EINVAL;
43931195e687SMark J Musante 			break;
43941195e687SMark J Musante 		}
43951195e687SMark J Musante 
43961195e687SMark J Musante 		/* look it up in the spa */
43971195e687SMark J Musante 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
43981195e687SMark J Musante 		if (vml[c] == NULL) {
43991195e687SMark J Musante 			error = ENODEV;
44001195e687SMark J Musante 			break;
44011195e687SMark J Musante 		}
44021195e687SMark J Musante 
44031195e687SMark J Musante 		/* make sure there's nothing stopping the split */
44041195e687SMark J Musante 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
44051195e687SMark J Musante 		    vml[c]->vdev_islog ||
44061195e687SMark J Musante 		    vml[c]->vdev_ishole ||
44071195e687SMark J Musante 		    vml[c]->vdev_isspare ||
44081195e687SMark J Musante 		    vml[c]->vdev_isl2cache ||
44091195e687SMark J Musante 		    !vdev_writeable(vml[c]) ||
4410d41c4376SMark J Musante 		    vml[c]->vdev_children != 0 ||
44111195e687SMark J Musante 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
44121195e687SMark J Musante 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
44131195e687SMark J Musante 			error = EINVAL;
44141195e687SMark J Musante 			break;
44151195e687SMark J Musante 		}
44161195e687SMark J Musante 
44171195e687SMark J Musante 		if (vdev_dtl_required(vml[c])) {
44181195e687SMark J Musante 			error = EBUSY;
44191195e687SMark J Musante 			break;
44201195e687SMark J Musante 		}
44211195e687SMark J Musante 
44221195e687SMark J Musante 		/* we need certain info from the top level */
44231195e687SMark J Musante 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
44241195e687SMark J Musante 		    vml[c]->vdev_top->vdev_ms_array) == 0);
44251195e687SMark J Musante 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
44261195e687SMark J Musante 		    vml[c]->vdev_top->vdev_ms_shift) == 0);
44271195e687SMark J Musante 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
44281195e687SMark J Musante 		    vml[c]->vdev_top->vdev_asize) == 0);
44291195e687SMark J Musante 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
44301195e687SMark J Musante 		    vml[c]->vdev_top->vdev_ashift) == 0);
44311195e687SMark J Musante 	}
44321195e687SMark J Musante 
44331195e687SMark J Musante 	if (error != 0) {
44341195e687SMark J Musante 		kmem_free(vml, children * sizeof (vdev_t *));
44351195e687SMark J Musante 		kmem_free(glist, children * sizeof (uint64_t));
44361195e687SMark J Musante 		return (spa_vdev_exit(spa, NULL, txg, error));
44371195e687SMark J Musante 	}
44381195e687SMark J Musante 
44391195e687SMark J Musante 	/* stop writers from using the disks */
44401195e687SMark J Musante 	for (c = 0; c < children; c++) {
44411195e687SMark J Musante 		if (vml[c] != NULL)
44421195e687SMark J Musante 			vml[c]->vdev_offline = B_TRUE;
44431195e687SMark J Musante 	}
44441195e687SMark J Musante 	vdev_reopen(spa->spa_root_vdev);
44451195e687SMark J Musante 
44461195e687SMark J Musante 	/*
44471195e687SMark J Musante 	 * Temporarily record the splitting vdevs in the spa config.  This
44481195e687SMark J Musante 	 * will disappear once the config is regenerated.
44491195e687SMark J Musante 	 */
44501195e687SMark J Musante 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
44511195e687SMark J Musante 	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
44521195e687SMark J Musante 	    glist, children) == 0);
44531195e687SMark J Musante 	kmem_free(glist, children * sizeof (uint64_t));
44541195e687SMark J Musante 
445598295d61SMark J Musante 	mutex_enter(&spa->spa_props_lock);
44561195e687SMark J Musante 	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
44571195e687SMark J Musante 	    nvl) == 0);
445898295d61SMark J Musante 	mutex_exit(&spa->spa_props_lock);
44591195e687SMark J Musante 	spa->spa_config_splitting = nvl;
44601195e687SMark J Musante 	vdev_config_dirty(spa->spa_root_vdev);
44611195e687SMark J Musante 
44621195e687SMark J Musante 	/* configure and create the new pool */
44631195e687SMark J Musante 	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
44641195e687SMark J Musante 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
44651195e687SMark J Musante 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
44661195e687SMark J Musante 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
44671195e687SMark J Musante 	    spa_version(spa)) == 0);
44681195e687SMark J Musante 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
44691195e687SMark J Musante 	    spa->spa_config_txg) == 0);
44701195e687SMark J Musante 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
44711195e687SMark J Musante 	    spa_generate_guid(NULL)) == 0);
44721195e687SMark J Musante 	(void) nvlist_lookup_string(props,
44731195e687SMark J Musante 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
44741195e687SMark J Musante 
4475d41c4376SMark J Musante 	/* add the new pool to the namespace */
44761195e687SMark J Musante 	newspa = spa_add(newname, config, altroot);
44771195e687SMark J Musante 	newspa->spa_config_txg = spa->spa_config_txg;
44781195e687SMark J Musante 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
44791195e687SMark J Musante 
44801195e687SMark J Musante 	/* release the spa config lock, retaining the namespace lock */
44811195e687SMark J Musante 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
44821195e687SMark J Musante 
44831195e687SMark J Musante 	if (zio_injection_enabled)
44841195e687SMark J Musante 		zio_handle_panic_injection(spa, FTAG, 1);
44851195e687SMark J Musante 
44861195e687SMark J Musante 	spa_activate(newspa, spa_mode_global);
44871195e687SMark J Musante 	spa_async_suspend(newspa);
44881195e687SMark J Musante 
44891195e687SMark J Musante 	/* create the new pool from the disks of the original pool */
44901195e687SMark J Musante 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
44911195e687SMark J Musante 	if (error)
44921195e687SMark J Musante 		goto out;
44931195e687SMark J Musante 
44941195e687SMark J Musante 	/* if that worked, generate a real config for the new pool */
44951195e687SMark J Musante 	if (newspa->spa_root_vdev != NULL) {
44961195e687SMark J Musante 		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
44971195e687SMark J Musante 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
44981195e687SMark J Musante 		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
44991195e687SMark J Musante 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
45001195e687SMark J Musante 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
45011195e687SMark J Musante 		    B_TRUE));
45021195e687SMark J Musante 	}
45031195e687SMark J Musante 
45041195e687SMark J Musante 	/* set the props */
45051195e687SMark J Musante 	if (props != NULL) {
45061195e687SMark J Musante 		spa_configfile_set(newspa, props, B_FALSE);
45071195e687SMark J Musante 		error = spa_prop_set(newspa, props);
45081195e687SMark J Musante 		if (error)
45091195e687SMark J Musante 			goto out;
45101195e687SMark J Musante 	}
45111195e687SMark J Musante 
45121195e687SMark J Musante 	/* flush everything */
45131195e687SMark J Musante 	txg = spa_vdev_config_enter(newspa);
45141195e687SMark J Musante 	vdev_config_dirty(newspa->spa_root_vdev);
45151195e687SMark J Musante 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
45161195e687SMark J Musante 
45171195e687SMark J Musante 	if (zio_injection_enabled)
45181195e687SMark J Musante 		zio_handle_panic_injection(spa, FTAG, 2);
45191195e687SMark J Musante 
45201195e687SMark J Musante 	spa_async_resume(newspa);
45211195e687SMark J Musante 
45221195e687SMark J Musante 	/* finally, update the original pool's config */
45231195e687SMark J Musante 	txg = spa_vdev_config_enter(spa);
45241195e687SMark J Musante 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
45251195e687SMark J Musante 	error = dmu_tx_assign(tx, TXG_WAIT);
45261195e687SMark J Musante 	if (error != 0)
45271195e687SMark J Musante 		dmu_tx_abort(tx);
45281195e687SMark J Musante 	for (c = 0; c < children; c++) {
45291195e687SMark J Musante 		if (vml[c] != NULL) {
45301195e687SMark J Musante 			vdev_split(vml[c]);
45311195e687SMark J Musante 			if (error == 0)
45323f9d6ad7SLin Ling 				spa_history_log_internal(LOG_POOL_VDEV_DETACH,
45333f9d6ad7SLin Ling 				    spa, tx, "vdev=%s",
45341195e687SMark J Musante 				    vml[c]->vdev_path);
45351195e687SMark J Musante 			vdev_free(vml[c]);
45361195e687SMark J Musante 		}
45371195e687SMark J Musante 	}
45381195e687SMark J Musante 	vdev_config_dirty(spa->spa_root_vdev);
45391195e687SMark J Musante 	spa->spa_config_splitting = NULL;
45401195e687SMark J Musante 	nvlist_free(nvl);
45411195e687SMark J Musante 	if (error == 0)
45421195e687SMark J Musante 		dmu_tx_commit(tx);
45431195e687SMark J Musante 	(void) spa_vdev_exit(spa, NULL, txg, 0);
45441195e687SMark J Musante 
45451195e687SMark J Musante 	if (zio_injection_enabled)
45461195e687SMark J Musante 		zio_handle_panic_injection(spa, FTAG, 3);
45471195e687SMark J Musante 
45481195e687SMark J Musante 	/* split is complete; log a history record */
45493f9d6ad7SLin Ling 	spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
45501195e687SMark J Musante 	    "split new pool %s from pool %s", newname, spa_name(spa));
45511195e687SMark J Musante 
45521195e687SMark J Musante 	kmem_free(vml, children * sizeof (vdev_t *));
45531195e687SMark J Musante 
45541195e687SMark J Musante 	/* if we're not going to mount the filesystems in userland, export */
45551195e687SMark J Musante 	if (exp)
45561195e687SMark J Musante 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
45571195e687SMark J Musante 		    B_FALSE, B_FALSE);
45581195e687SMark J Musante 
45591195e687SMark J Musante 	return (error);
45601195e687SMark J Musante 
45611195e687SMark J Musante out:
45621195e687SMark J Musante 	spa_unload(newspa);
45631195e687SMark J Musante 	spa_deactivate(newspa);
45641195e687SMark J Musante 	spa_remove(newspa);
45651195e687SMark J Musante 
45661195e687SMark J Musante 	txg = spa_vdev_config_enter(spa);
456798295d61SMark J Musante 
456898295d61SMark J Musante 	/* re-online all offlined disks */
456998295d61SMark J Musante 	for (c = 0; c < children; c++) {
457098295d61SMark J Musante 		if (vml[c] != NULL)
457198295d61SMark J Musante 			vml[c]->vdev_offline = B_FALSE;
457298295d61SMark J Musante 	}
457398295d61SMark J Musante 	vdev_reopen(spa->spa_root_vdev);
457498295d61SMark J Musante 
45751195e687SMark J Musante 	nvlist_free(spa->spa_config_splitting);
45761195e687SMark J Musante 	spa->spa_config_splitting = NULL;
4577d41c4376SMark J Musante 	(void) spa_vdev_exit(spa, NULL, txg, error);
45781195e687SMark J Musante 
45791195e687SMark J Musante 	kmem_free(vml, children * sizeof (vdev_t *));
45801195e687SMark J Musante 	return (error);
45811195e687SMark J Musante }
45821195e687SMark J Musante 
4583e14bb325SJeff Bonwick static nvlist_t *
4584e14bb325SJeff Bonwick spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
458599653d4eSeschrock {
4586e14bb325SJeff Bonwick 	for (int i = 0; i < count; i++) {
4587e14bb325SJeff Bonwick 		uint64_t guid;
458899653d4eSeschrock 
4589e14bb325SJeff Bonwick 		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
4590e14bb325SJeff Bonwick 		    &guid) == 0);
459199653d4eSeschrock 
4592e14bb325SJeff Bonwick 		if (guid == target_guid)
4593e14bb325SJeff Bonwick 			return (nvpp[i]);
459499653d4eSeschrock 	}
459599653d4eSeschrock 
4596e14bb325SJeff Bonwick 	return (NULL);
4597fa94a07fSbrendan }
4598fa94a07fSbrendan 
4599e14bb325SJeff Bonwick static void
4600e14bb325SJeff Bonwick spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
4601e14bb325SJeff Bonwick 	nvlist_t *dev_to_remove)
4602fa94a07fSbrendan {
4603e14bb325SJeff Bonwick 	nvlist_t **newdev = NULL;
4604fa94a07fSbrendan 
4605e14bb325SJeff Bonwick 	if (count > 1)
4606e14bb325SJeff Bonwick 		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
4607fa94a07fSbrendan 
4608e14bb325SJeff Bonwick 	for (int i = 0, j = 0; i < count; i++) {
4609e14bb325SJeff Bonwick 		if (dev[i] == dev_to_remove)
4610e14bb325SJeff Bonwick 			continue;
4611e14bb325SJeff Bonwick 		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
4612fa94a07fSbrendan 	}
4613fa94a07fSbrendan 
4614e14bb325SJeff Bonwick 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
4615e14bb325SJeff Bonwick 	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
4616fa94a07fSbrendan 
4617e14bb325SJeff Bonwick 	for (int i = 0; i < count - 1; i++)
4618e14bb325SJeff Bonwick 		nvlist_free(newdev[i]);
4619fa94a07fSbrendan 
4620e14bb325SJeff Bonwick 	if (count > 1)
4621e14bb325SJeff Bonwick 		kmem_free(newdev, (count - 1) * sizeof (void *));
4622fa94a07fSbrendan }
4623fa94a07fSbrendan 
/*
 * Evacuate the device so that it can be removed from the pool.
 *
 * Only log devices (slogs) can currently be evacuated; any other vdev
 * type fails with ENOTSUP.  Returns 0 on success or an errno on failure.
 */
static int
spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
{
	uint64_t txg;
	int error = 0;

	/* Caller holds the namespace lock but NOT the config lock. */
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Evacuate the device.  We don't hold the config lock as writer
	 * since we need to do I/O but we do keep the
	 * spa_namespace_lock held.  Once this completes the device
	 * should no longer have any blocks allocated on it.
	 */
	if (vd->vdev_islog) {
		/* Nothing to move if the log is already empty. */
		if (vd->vdev_stat.vs_alloc != 0)
			error = spa_offline_log(spa);
	} else {
		/* Non-log vdevs are not evacuable (no bp-rewrite yet). */
		error = ENOTSUP;
	}

	if (error)
		return (error);

	/*
	 * The evacuation succeeded.  Remove any remaining MOS metadata
	 * associated with this vdev, and wait for these changes to sync.
	 */
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
	txg = spa_vdev_config_enter(spa);
	vd->vdev_removing = B_TRUE;
	vdev_dirty(vd, 0, NULL, txg);
	vdev_config_dirty(vd);
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	return (0);
}
466688ecc943SGeorge Wilson 
/*
 * Complete the removal by cleaning up the namespace.
 *
 * Detach the (already evacuated) top-level vdev 'vd' from the root vdev.
 * Unless it is the last child, its slot is backfilled with a hole vdev
 * so that the ids of the remaining top-level vdevs are preserved.
 * Caller must hold the namespace lock and all of the config lock as
 * writer (see the ASSERTs below).
 */
static void
spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t id = vd->vdev_id;
	boolean_t last_vdev = (id == (rvd->vdev_children - 1));

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Only remove any devices which are empty.
	 */
	if (vd->vdev_stat.vs_alloc != 0)
		return;

	/* Rewrite the labels to reflect that this device was removed. */
	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/* Take the vdev off the dirty lists before it is freed. */
	if (list_link_active(&vd->vdev_state_dirty_node))
		vdev_state_clean(vd);
	if (list_link_active(&vd->vdev_config_dirty_node))
		vdev_config_clean(vd);

	vdev_free(vd);

	/*
	 * If this was the highest-numbered child we can simply compact
	 * the children array; otherwise plug the gap with a hole vdev.
	 */
	if (last_vdev) {
		vdev_compact_children(rvd);
	} else {
		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
		vdev_add_child(rvd, vd);
	}
	vdev_config_dirty(rvd);

	/*
	 * Reassess the health of our root vdev.
	 */
	vdev_reopen(rvd);
}
470988ecc943SGeorge Wilson 
/*
 * Remove a device from the pool -
 *
 * Removing a device from the vdev namespace requires several steps
 * and can take a significant amount of time.  As a result we use
 * the spa_vdev_config_[enter/exit] functions which allow us to
 * grab and release the spa_config_lock while still holding the namespace
 * lock.  During each step the configuration is synced out.
 */

/*
 * Remove a device from the pool.  Currently, this supports removing only hot
 * spares, slogs, and level 2 ARC devices.
 *
 * 'guid' identifies the device to remove; 'unspare' allows removal of a
 * hot spare even when it is active in this pool.  Returns 0 on success,
 * or EBUSY / ENOTSUP / ENOENT (among others) on failure.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	metaslab_group_t *mg;
	nvlist_t **spares, **l2cache, *nv;
	uint64_t txg = 0;
	uint_t nspares, nl2cache;
	int error = 0;
	/* If the caller already holds the namespace lock, don't re-enter. */
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);

	ASSERT(spa_writeable(spa));

	if (!locked)
		txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.  (vd == NULL means the spare is inactive.)
		 */
		if (vd == NULL || unspare) {
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			error = EBUSY;
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		/*
		 * Cache devices can always be removed.
		 */
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL && vd->vdev_islog) {
		ASSERT(!locked);
		ASSERT(vd == vd->vdev_top);

		/*
		 * XXX - Once we have bp-rewrite this should
		 * become the common case.
		 */

		mg = vd->vdev_mg;

		/*
		 * Stop allocating from this vdev.
		 */
		metaslab_group_passivate(mg);

		/*
		 * Wait for the youngest allocations and frees to sync,
		 * and then wait for the deferral of those frees to finish.
		 */
		spa_vdev_config_exit(spa, NULL,
		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

		/*
		 * Attempt to evacuate the vdev.
		 */
		error = spa_vdev_remove_evacuate(spa, vd);

		txg = spa_vdev_config_enter(spa);

		/*
		 * If we couldn't evacuate the vdev, unwind.
		 */
		if (error) {
			metaslab_group_activate(mg);
			return (spa_vdev_exit(spa, NULL, txg, error));
		}

		/*
		 * Clean up the vdev namespace.
		 */
		spa_vdev_remove_from_namespace(spa, vd);

	} else if (vd != NULL) {
		/*
		 * Normal vdevs cannot be removed (yet).
		 */
		error = ENOTSUP;
	} else {
		/*
		 * There is no vdev of any kind with the specified guid.
		 */
		error = ENOENT;
	}

	if (!locked)
		return (spa_vdev_exit(spa, NULL, txg, error));

	return (error);
}
4829fa9e4066Sahrens 
/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.  Returns the vdev to detach, or
 * NULL if nothing in the subtree rooted at 'vd' is ready.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;

	/* Depth-first: a candidate found in a child subtree wins. */
	for (int c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.  We always consider the first
	 * vdev in the list to be the oldest vdev, and the last one to be
	 * the newest (see spa_vdev_attach() for how that works).  In
	 * the case where the newest vdev is faulted, we will not automatically
	 * remove it after a resilver completes.  This is OK as it will require
	 * user intervention to determine which disk the admin wishes to keep.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops) {
		ASSERT(vd->vdev_children > 1);

		newvd = vd->vdev_child[vd->vdev_children - 1];
		oldvd = vd->vdev_child[0];

		/* Detach the old disk only if the new one is fully caught up. */
		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops) {
		vdev_t *first = vd->vdev_child[0];
		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];

		/* Whichever end carries 'unspare' is the one to detach. */
		if (last->vdev_unspare) {
			oldvd = first;
			newvd = last;
		} else if (first->vdev_unspare) {
			oldvd = last;
			newvd = first;
		} else {
			oldvd = NULL;
		}

		if (oldvd != NULL &&
		    vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);

		/*
		 * If there are more than two spares attached to a disk,
		 * and those spares are not required, then we want to
		 * attempt to free them up now so that they can be used
		 * by other pools.  Once we're back down to a single
		 * disk+spare, we stop removing them.
		 */
		if (vd->vdev_children > 2) {
			newvd = vd->vdev_child[1];

			if (newvd->vdev_isspare && last->vdev_isspare &&
			    vdev_dtl_empty(last, DTL_MISSING) &&
			    vdev_dtl_empty(last, DTL_OUTAGE) &&
			    !vdev_dtl_required(newvd))
				return (newvd);
		}
	}

	return (NULL);
}
4908fa9e4066Sahrens 
/*
 * Detach every device in the pool whose resilver has completed (as found
 * by spa_vdev_resilver_done_hunt()), repeating until nothing is left to
 * detach.  For a finished hot-spare replacement, the original spare is
 * detached as well.
 */
static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		/* Capture guids now: the vdevs may go away once we unlock. */
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
		    ppvd->vdev_children == 2) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		/* Drop the config lock across the detach; stop on failure. */
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}
4944fa9e4066Sahrens 
4945c67d9675Seschrock /*
4946b3388e4fSEric Taylor  * Update the stored path or FRU for this vdev.
4947c67d9675Seschrock  */
4948c67d9675Seschrock int
49496809eb4eSEric Schrock spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
49506809eb4eSEric Schrock     boolean_t ispath)
4951c67d9675Seschrock {
4952c5904d13Seschrock 	vdev_t *vd;
4953208044b8SGeorge Wilson 	boolean_t sync = B_FALSE;
4954c67d9675Seschrock 
4955f9af39baSGeorge Wilson 	ASSERT(spa_writeable(spa));
4956f9af39baSGeorge Wilson 
4957b3388e4fSEric Taylor 	spa_vdev_state_enter(spa, SCL_ALL);
4958c67d9675Seschrock 
49596809eb4eSEric Schrock 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4960b3388e4fSEric Taylor 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
4961c67d9675Seschrock 
49620e34b6a7Sbonwick 	if (!vd->vdev_ops->vdev_op_leaf)
4963b3388e4fSEric Taylor 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
49640e34b6a7Sbonwick 
49656809eb4eSEric Schrock 	if (ispath) {
4966208044b8SGeorge Wilson 		if (strcmp(value, vd->vdev_path) != 0) {
4967208044b8SGeorge Wilson 			spa_strfree(vd->vdev_path);
4968208044b8SGeorge Wilson 			vd->vdev_path = spa_strdup(value);
4969208044b8SGeorge Wilson 			sync = B_TRUE;
4970208044b8SGeorge Wilson 		}
49716809eb4eSEric Schrock 	} else {
4972208044b8SGeorge Wilson 		if (vd->vdev_fru == NULL) {
4973208044b8SGeorge Wilson 			vd->vdev_fru = spa_strdup(value);
4974208044b8SGeorge Wilson 			sync = B_TRUE;
4975208044b8SGeorge Wilson 		} else if (strcmp(value, vd->vdev_fru) != 0) {
49766809eb4eSEric Schrock 			spa_strfree(vd->vdev_fru);
4977208044b8SGeorge Wilson 			vd->vdev_fru = spa_strdup(value);
4978208044b8SGeorge Wilson 			sync = B_TRUE;
4979208044b8SGeorge Wilson 		}
49806809eb4eSEric Schrock 	}
4981c67d9675Seschrock 
4982208044b8SGeorge Wilson 	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
4983c67d9675Seschrock }
4984c67d9675Seschrock 
49856809eb4eSEric Schrock int
49866809eb4eSEric Schrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
49876809eb4eSEric Schrock {
49886809eb4eSEric Schrock 	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
49896809eb4eSEric Schrock }
49906809eb4eSEric Schrock 
49916809eb4eSEric Schrock int
49926809eb4eSEric Schrock spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
49936809eb4eSEric Schrock {
49946809eb4eSEric Schrock 	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
49956809eb4eSEric Schrock }
49966809eb4eSEric Schrock 
4997fa9e4066Sahrens /*
4998fa9e4066Sahrens  * ==========================================================================
49993f9d6ad7SLin Ling  * SPA Scanning
5000fa9e4066Sahrens  * ==========================================================================
5001fa9e4066Sahrens  */
5002fa9e4066Sahrens 
5003ea8dc4b6Seschrock int
50043f9d6ad7SLin Ling spa_scan_stop(spa_t *spa)
5005fa9e4066Sahrens {
5006e14bb325SJeff Bonwick 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
50073f9d6ad7SLin Ling 	if (dsl_scan_resilvering(spa->spa_dsl_pool))
50083f9d6ad7SLin Ling 		return (EBUSY);
50093f9d6ad7SLin Ling 	return (dsl_scan_cancel(spa->spa_dsl_pool));
50103f9d6ad7SLin Ling }
5011bb8b5132Sek 
50123f9d6ad7SLin Ling int
50133f9d6ad7SLin Ling spa_scan(spa_t *spa, pool_scan_func_t func)
50143f9d6ad7SLin Ling {
50153f9d6ad7SLin Ling 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
50163f9d6ad7SLin Ling 
50173f9d6ad7SLin Ling 	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5018fa9e4066Sahrens 		return (ENOTSUP);
5019fa9e4066Sahrens 
5020fa9e4066Sahrens 	/*
5021088f3894Sahrens 	 * If a resilver was requested, but there is no DTL on a
5022088f3894Sahrens 	 * writeable leaf device, we have nothing to do.
5023fa9e4066Sahrens 	 */
50243f9d6ad7SLin Ling 	if (func == POOL_SCAN_RESILVER &&
5025088f3894Sahrens 	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5026088f3894Sahrens 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5027ea8dc4b6Seschrock 		return (0);
5028ea8dc4b6Seschrock 	}
5029fa9e4066Sahrens 
50303f9d6ad7SLin Ling 	return (dsl_scan(spa->spa_dsl_pool, func));
5031fa9e4066Sahrens }
5032fa9e4066Sahrens 
5033ea8dc4b6Seschrock /*
5034ea8dc4b6Seschrock  * ==========================================================================
5035ea8dc4b6Seschrock  * SPA async task processing
5036ea8dc4b6Seschrock  * ==========================================================================
5037ea8dc4b6Seschrock  */
5038ea8dc4b6Seschrock 
5039ea8dc4b6Seschrock static void
50403d7072f8Seschrock spa_async_remove(spa_t *spa, vdev_t *vd)
5041fa9e4066Sahrens {
504249cf58c0SBrendan Gregg - Sun Microsystems 	if (vd->vdev_remove_wanted) {
504398d1cbfeSGeorge Wilson 		vd->vdev_remove_wanted = B_FALSE;
504498d1cbfeSGeorge Wilson 		vd->vdev_delayed_close = B_FALSE;
504549cf58c0SBrendan Gregg - Sun Microsystems 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
50461d713200SEric Schrock 
50471d713200SEric Schrock 		/*
50481d713200SEric Schrock 		 * We want to clear the stats, but we don't want to do a full
50491d713200SEric Schrock 		 * vdev_clear() as that will cause us to throw away
50501d713200SEric Schrock 		 * degraded/faulted state as well as attempt to reopen the
50511d713200SEric Schrock 		 * device, all of which is a waste.
50521d713200SEric Schrock 		 */
50531d713200SEric Schrock 		vd->vdev_stat.vs_read_errors = 0;
50541d713200SEric Schrock 		vd->vdev_stat.vs_write_errors = 0;
50551d713200SEric Schrock 		vd->vdev_stat.vs_checksum_errors = 0;
50561d713200SEric Schrock 
5057e14bb325SJeff Bonwick 		vdev_state_dirty(vd->vdev_top);
5058ea8dc4b6Seschrock 	}
505949cf58c0SBrendan Gregg - Sun Microsystems 
5060e14bb325SJeff Bonwick 	for (int c = 0; c < vd->vdev_children; c++)
506149cf58c0SBrendan Gregg - Sun Microsystems 		spa_async_remove(spa, vd->vdev_child[c]);
5062ea8dc4b6Seschrock }
5063fa9e4066Sahrens 
5064e14bb325SJeff Bonwick static void
5065e14bb325SJeff Bonwick spa_async_probe(spa_t *spa, vdev_t *vd)
5066e14bb325SJeff Bonwick {
5067e14bb325SJeff Bonwick 	if (vd->vdev_probe_wanted) {
506898d1cbfeSGeorge Wilson 		vd->vdev_probe_wanted = B_FALSE;
5069e14bb325SJeff Bonwick 		vdev_reopen(vd);	/* vdev_open() does the actual probe */
5070e14bb325SJeff Bonwick 	}
5071e14bb325SJeff Bonwick 
5072e14bb325SJeff Bonwick 	for (int c = 0; c < vd->vdev_children; c++)
5073e14bb325SJeff Bonwick 		spa_async_probe(spa, vd->vdev_child[c]);
5074e14bb325SJeff Bonwick }
5075e14bb325SJeff Bonwick 
5076573ca77eSGeorge Wilson static void
5077573ca77eSGeorge Wilson spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5078573ca77eSGeorge Wilson {
5079573ca77eSGeorge Wilson 	sysevent_id_t eid;
5080573ca77eSGeorge Wilson 	nvlist_t *attr;
5081573ca77eSGeorge Wilson 	char *physpath;
5082573ca77eSGeorge Wilson 
5083573ca77eSGeorge Wilson 	if (!spa->spa_autoexpand)
5084573ca77eSGeorge Wilson 		return;
5085573ca77eSGeorge Wilson 
5086573ca77eSGeorge Wilson 	for (int c = 0; c < vd->vdev_children; c++) {
5087573ca77eSGeorge Wilson 		vdev_t *cvd = vd->vdev_child[c];
5088573ca77eSGeorge Wilson 		spa_async_autoexpand(spa, cvd);
5089573ca77eSGeorge Wilson 	}
5090573ca77eSGeorge Wilson 
5091573ca77eSGeorge Wilson 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5092573ca77eSGeorge Wilson 		return;
5093573ca77eSGeorge Wilson 
5094573ca77eSGeorge Wilson 	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5095573ca77eSGeorge Wilson 	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5096573ca77eSGeorge Wilson 
5097573ca77eSGeorge Wilson 	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5098573ca77eSGeorge Wilson 	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5099573ca77eSGeorge Wilson 
5100573ca77eSGeorge Wilson 	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5101573ca77eSGeorge Wilson 	    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
5102573ca77eSGeorge Wilson 
5103573ca77eSGeorge Wilson 	nvlist_free(attr);
5104573ca77eSGeorge Wilson 	kmem_free(physpath, MAXPATHLEN);
5105573ca77eSGeorge Wilson }
5106573ca77eSGeorge Wilson 
/*
 * Async worker thread: atomically snapshot and clear the pending task
 * mask, service each requested SPA_ASYNC_* task in turn, then announce
 * completion and exit.  At most one instance runs per pool; it is
 * created by spa_async_dispatch().
 */
static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	/* Grab and clear the pending task mask under the async lock. */
	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		mutex_enter(&spa_namespace_lock);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		new_space = metaslab_class_get_space(spa_normal_class(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
			    spa, NULL,
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), new_space, new_space - old_space);
		}
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 * This covers the root vdev tree plus the l2cache and spare aux
	 * vdev lists, which live outside the tree.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/* Autoexpand is skipped entirely while pool I/O is suspended. */
	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	 * Let the world know that we're done.  spa_async_suspend() waits
	 * on spa_async_cv for spa_async_thread to go to NULL.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}
5192ea8dc4b6Seschrock 
/*
 * Prevent any new async worker thread from being dispatched and wait for
 * an in-flight one (if any) to finish.  Suspensions nest via a counter;
 * each call must be balanced by a spa_async_resume().
 */
void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	/* Wait for the current worker (if any) to announce completion. */
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}
5202ea8dc4b6Seschrock 
/*
 * Drop one level of async suspension.  This does not itself dispatch any
 * pending tasks; they will run the next time spa_async_dispatch() is
 * called with the suspend count at zero.
 */
void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}
5211ea8dc4b6Seschrock 
/*
 * Create the async worker thread if (and only if) there are pending
 * tasks, async processing is not suspended, no worker is already
 * running, and the root vnode exists and is writable.
 */
static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}
5223ea8dc4b6Seschrock 
/*
 * Record a request (a bitmask of SPA_ASYNC_* flags) for the async worker.
 * The work itself is performed later, once spa_async_dispatch() starts a
 * worker thread (e.g. at the end of spa_sync()).
 */
void
spa_async_request(spa_t *spa, int task)
{
	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}
5232fa9e4066Sahrens 
5233fa9e4066Sahrens /*
5234fa9e4066Sahrens  * ==========================================================================
5235fa9e4066Sahrens  * SPA syncing routines
5236fa9e4066Sahrens  * ==========================================================================
5237fa9e4066Sahrens  */
5238fa9e4066Sahrens 
5239cde58dbcSMatthew Ahrens static int
5240cde58dbcSMatthew Ahrens bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5241cde58dbcSMatthew Ahrens {
5242cde58dbcSMatthew Ahrens 	bpobj_t *bpo = arg;
5243cde58dbcSMatthew Ahrens 	bpobj_enqueue(bpo, bp, tx);
5244cde58dbcSMatthew Ahrens 	return (0);
5245b24ab676SJeff Bonwick }
5246b24ab676SJeff Bonwick 
5247cde58dbcSMatthew Ahrens static int
5248cde58dbcSMatthew Ahrens spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5249b24ab676SJeff Bonwick {
5250b24ab676SJeff Bonwick 	zio_t *zio = arg;
5251b24ab676SJeff Bonwick 
5252b24ab676SJeff Bonwick 	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5253b24ab676SJeff Bonwick 	    zio->io_flags));
5254cde58dbcSMatthew Ahrens 	return (0);
5255fa9e4066Sahrens }
5256fa9e4066Sahrens 
5257fa9e4066Sahrens static void
525899653d4eSeschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5259fa9e4066Sahrens {
5260fa9e4066Sahrens 	char *packed = NULL;
5261f7991ba4STim Haley 	size_t bufsize;
5262fa9e4066Sahrens 	size_t nvsize = 0;
5263fa9e4066Sahrens 	dmu_buf_t *db;
5264fa9e4066Sahrens 
526599653d4eSeschrock 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5266fa9e4066Sahrens 
5267f7991ba4STim Haley 	/*
5268f7991ba4STim Haley 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5269f7991ba4STim Haley 	 * information.  This avoids the dbuf_will_dirty() path and
5270f7991ba4STim Haley 	 * saves us a pre-read to get data we don't actually care about.
5271f7991ba4STim Haley 	 */
5272f7991ba4STim Haley 	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
5273f7991ba4STim Haley 	packed = kmem_alloc(bufsize, KM_SLEEP);
5274fa9e4066Sahrens 
527599653d4eSeschrock 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5276ea8dc4b6Seschrock 	    KM_SLEEP) == 0);
5277f7991ba4STim Haley 	bzero(packed + nvsize, bufsize - nvsize);
5278fa9e4066Sahrens 
5279f7991ba4STim Haley 	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5280fa9e4066Sahrens 
5281f7991ba4STim Haley 	kmem_free(packed, bufsize);
5282fa9e4066Sahrens 
528399653d4eSeschrock 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5284fa9e4066Sahrens 	dmu_buf_will_dirty(db, tx);
5285fa9e4066Sahrens 	*(uint64_t *)db->db_data = nvsize;
5286ea8dc4b6Seschrock 	dmu_buf_rele(db, FTAG);
5287fa9e4066Sahrens }
5288fa9e4066Sahrens 
528999653d4eSeschrock static void
5290fa94a07fSbrendan spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5291fa94a07fSbrendan     const char *config, const char *entry)
529299653d4eSeschrock {
529399653d4eSeschrock 	nvlist_t *nvroot;
5294fa94a07fSbrendan 	nvlist_t **list;
529599653d4eSeschrock 	int i;
529699653d4eSeschrock 
5297fa94a07fSbrendan 	if (!sav->sav_sync)
529899653d4eSeschrock 		return;
529999653d4eSeschrock 
530099653d4eSeschrock 	/*
5301fa94a07fSbrendan 	 * Update the MOS nvlist describing the list of available devices.
5302fa94a07fSbrendan 	 * spa_validate_aux() will have already made sure this nvlist is
53033d7072f8Seschrock 	 * valid and the vdevs are labeled appropriately.
530499653d4eSeschrock 	 */
5305fa94a07fSbrendan 	if (sav->sav_object == 0) {
5306fa94a07fSbrendan 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5307fa94a07fSbrendan 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5308fa94a07fSbrendan 		    sizeof (uint64_t), tx);
530999653d4eSeschrock 		VERIFY(zap_update(spa->spa_meta_objset,
5310fa94a07fSbrendan 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5311fa94a07fSbrendan 		    &sav->sav_object, tx) == 0);
531299653d4eSeschrock 	}
531399653d4eSeschrock 
531499653d4eSeschrock 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5315fa94a07fSbrendan 	if (sav->sav_count == 0) {
5316fa94a07fSbrendan 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
531799653d4eSeschrock 	} else {
5318fa94a07fSbrendan 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5319fa94a07fSbrendan 		for (i = 0; i < sav->sav_count; i++)
5320fa94a07fSbrendan 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
53213f9d6ad7SLin Ling 			    B_FALSE, VDEV_CONFIG_L2CACHE);
5322fa94a07fSbrendan 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5323fa94a07fSbrendan 		    sav->sav_count) == 0);
5324fa94a07fSbrendan 		for (i = 0; i < sav->sav_count; i++)
5325fa94a07fSbrendan 			nvlist_free(list[i]);
5326fa94a07fSbrendan 		kmem_free(list, sav->sav_count * sizeof (void *));
532799653d4eSeschrock 	}
532899653d4eSeschrock 
5329fa94a07fSbrendan 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
533006eeb2adSek 	nvlist_free(nvroot);
533199653d4eSeschrock 
5332fa94a07fSbrendan 	sav->sav_sync = B_FALSE;
533399653d4eSeschrock }
533499653d4eSeschrock 
533599653d4eSeschrock static void
533699653d4eSeschrock spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
533799653d4eSeschrock {
533899653d4eSeschrock 	nvlist_t *config;
533999653d4eSeschrock 
5340e14bb325SJeff Bonwick 	if (list_is_empty(&spa->spa_config_dirty_list))
534199653d4eSeschrock 		return;
534299653d4eSeschrock 
5343e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5344e14bb325SJeff Bonwick 
5345e14bb325SJeff Bonwick 	config = spa_config_generate(spa, spa->spa_root_vdev,
5346e14bb325SJeff Bonwick 	    dmu_tx_get_txg(tx), B_FALSE);
5347e14bb325SJeff Bonwick 
5348e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_STATE, FTAG);
534999653d4eSeschrock 
535099653d4eSeschrock 	if (spa->spa_config_syncing)
535199653d4eSeschrock 		nvlist_free(spa->spa_config_syncing);
535299653d4eSeschrock 	spa->spa_config_syncing = config;
535399653d4eSeschrock 
535499653d4eSeschrock 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
535599653d4eSeschrock }
535699653d4eSeschrock 
/*
 * Set zpool properties.
 *
 * Sync-task callback: arg1 is the spa_t, arg2 an nvlist of property
 * settings.  Each property is applied either to in-core spa_t state,
 * to the vdev labels (via vdev_config_dirty()), or to the pool
 * properties ZAP object in the MOS, depending on the property.
 */
static void
spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;

	mutex_enter(&spa->spa_props_lock);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import). spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			VERIFY(nvpair_value_string(elem, &strval) == 0);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  It's unnecessary
			 * to do this for pool creation since the vdev's
			 * configuration has already been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				/* First property ever set: create the ZAP. */
				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			/* Mirror selected properties into in-core state. */
			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				if (tx->tx_txg != TXG_INITIAL)
					spa_async_request(spa,
					    SPA_ASYNC_AUTOEXPAND);
				break;
			case ZPOOL_PROP_DEDUPDITTO:
				spa->spa_dedup_ditto = intval;
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_log_internal(LOG_POOL_PROPSET,
			    spa, tx, "%s %lld %s",
			    nvpair_name(elem), intval, spa_name(spa));
		}
	}

	mutex_exit(&spa->spa_props_lock);
}
5499b1b8ab34Slling 
/*
 * Perform one-time upgrade on-disk changes.  spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;

	/* Upgrades run only once, on the first sync pass of the txg. */
	ASSERT(spa->spa_sync_pass == 1);

	/* Crossing SPA_VERSION_ORIGIN: create the origin dataset. */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	/* Crossing SPA_VERSION_NEXT_CLONES: run the clones upgrade. */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	/* Crossing SPA_VERSION_DIR_CLONES: run the dir-clones upgrade. */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}
}
5535cde58dbcSMatthew Ahrens 
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 *
 * This is the heart of txg commit: it pushes all dirty state for txg to
 * disk, rewrites the vdev labels/uberblock, and finally publishes the
 * new config.  Called with no config locks held; takes SCL_CONFIG as
 * reader for the duration.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, or if someone is waiting
	 * for this txg to sync (eg, spa_vdev_remove()), push the
	 * deferred frees from the previous txg.  If not, leave them
	 * alone so that we don't generate work on an otherwise idle
	 * system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
	    ((dsl_scan_active(dp->dp_scan) ||
	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		VERIFY3U(bpobj_iterate(defer_bpo,
		    spa_free_sync_cb, zio, tx), ==, 0);
		VERIFY3U(zio_wait(zio), ==, 0);
	}

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		/*
		 * On early passes (<= SYNC_PASS_DEFERRED_FREE) frees are
		 * issued immediately; on later passes they are enqueued
		 * onto the deferred bpobj instead.
		 */
		if (pass <= SYNC_PASS_DEFERRED_FREE) {
			zio_t *zio = zio_root(spa, NULL, NULL, 0);
			bplist_iterate(free_bpl, spa_free_sync_cb,
			    zio, tx);
			VERIFY(zio_wait(zio) == 0);
		} else {
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    defer_bpo, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
			vdev_sync(vd, txg);

		if (pass == 1)
			spa_sync_upgrades(spa, tx);

	} while (dmu_objset_is_dirty(mos, txg));

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			/*
			 * Pick up to SPA_DVAS_PER_BP healthy, non-log
			 * top-level vdevs, starting at a random child.
			 */
			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		/* Label write failed: suspend I/O and retry once resumed. */
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
5760fa9e4066Sahrens 
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		/*
		 * Only sync pools that are active, writeable, and not
		 * currently suspended due to an I/O failure.
		 */
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		/*
		 * Hold a reference so the spa_t can't be removed while we
		 * drop the namespace lock for the (potentially long) wait.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
5783fa9e4066Sahrens 
5784fa9e4066Sahrens /*
5785fa9e4066Sahrens  * ==========================================================================
5786fa9e4066Sahrens  * Miscellaneous routines
5787fa9e4066Sahrens  * ==========================================================================
5788fa9e4066Sahrens  */
5789fa9e4066Sahrens 
/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.  The open
		 * reference keeps the spa_t alive across the unlocked
		 * window.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		/*
		 * Tear down any remaining in-core state before deleting
		 * the spa_t from the namespace.
		 */
		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}
5823ea8dc4b6Seschrock 
5824ea8dc4b6Seschrock vdev_t *
58256809eb4eSEric Schrock spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
5826ea8dc4b6Seschrock {
5827c5904d13Seschrock 	vdev_t *vd;
5828c5904d13Seschrock 	int i;
5829c5904d13Seschrock 
5830c5904d13Seschrock 	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
5831c5904d13Seschrock 		return (vd);
5832c5904d13Seschrock 
58336809eb4eSEric Schrock 	if (aux) {
5834c5904d13Seschrock 		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
5835c5904d13Seschrock 			vd = spa->spa_l2cache.sav_vdevs[i];
58366809eb4eSEric Schrock 			if (vd->vdev_guid == guid)
58376809eb4eSEric Schrock 				return (vd);
58386809eb4eSEric Schrock 		}
58396809eb4eSEric Schrock 
58406809eb4eSEric Schrock 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
58416809eb4eSEric Schrock 			vd = spa->spa_spares.sav_vdevs[i];
5842c5904d13Seschrock 			if (vd->vdev_guid == guid)
5843c5904d13Seschrock 				return (vd);
5844c5904d13Seschrock 		}
5845c5904d13Seschrock 	}
5846c5904d13Seschrock 
5847c5904d13Seschrock 	return (NULL);
5848ea8dc4b6Seschrock }
5849eaca9bbdSeschrock 
/*
 * Upgrade the pool's on-disk SPA version to 'version'.  The new version
 * is committed by dirtying the root vdev config under SCL_ALL and then
 * waiting for the change to sync to stable storage before returning.
 */
void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Wait for the dirtied config to sync so the version change is
	 * durable before we return to the caller.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);
}
587299653d4eSeschrock 
587399653d4eSeschrock boolean_t
587499653d4eSeschrock spa_has_spare(spa_t *spa, uint64_t guid)
587599653d4eSeschrock {
587699653d4eSeschrock 	int i;
587739c23413Seschrock 	uint64_t spareguid;
5878fa94a07fSbrendan 	spa_aux_vdev_t *sav = &spa->spa_spares;
587999653d4eSeschrock 
5880fa94a07fSbrendan 	for (i = 0; i < sav->sav_count; i++)
5881fa94a07fSbrendan 		if (sav->sav_vdevs[i]->vdev_guid == guid)
588299653d4eSeschrock 			return (B_TRUE);
588399653d4eSeschrock 
5884fa94a07fSbrendan 	for (i = 0; i < sav->sav_npending; i++) {
5885fa94a07fSbrendan 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
5886fa94a07fSbrendan 		    &spareguid) == 0 && spareguid == guid)
588739c23413Seschrock 			return (B_TRUE);
588839c23413Seschrock 	}
588939c23413Seschrock 
589099653d4eSeschrock 	return (B_FALSE);
5891eaca9bbdSeschrock }
5892b1b8ab34Slling 
589389a89ebfSlling /*
589489a89ebfSlling  * Check if a pool has an active shared spare device.
589589a89ebfSlling  * Note: reference count of an active spare is 2, as a spare and as a replace
589689a89ebfSlling  */
589789a89ebfSlling static boolean_t
589889a89ebfSlling spa_has_active_shared_spare(spa_t *spa)
589989a89ebfSlling {
590089a89ebfSlling 	int i, refcnt;
590189a89ebfSlling 	uint64_t pool;
590289a89ebfSlling 	spa_aux_vdev_t *sav = &spa->spa_spares;
590389a89ebfSlling 
590489a89ebfSlling 	for (i = 0; i < sav->sav_count; i++) {
590589a89ebfSlling 		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
590689a89ebfSlling 		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
590789a89ebfSlling 		    refcnt > 2)
590889a89ebfSlling 			return (B_TRUE);
590989a89ebfSlling 	}
591089a89ebfSlling 
591189a89ebfSlling 	return (B_FALSE);
591289a89ebfSlling }
591389a89ebfSlling 
/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t		*ev;
	sysevent_attr_list_t	*attr = NULL;
	sysevent_value_t	value;
	sysevent_id_t		eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	/* The pool name and guid are always part of the payload. */
	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	/* Vdev guid and path are attached only when a vdev was supplied. */
	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	/*
	 * On successful attach, ownership of the attribute list passes to
	 * the event; clear 'attr' so the cleanup path below doesn't free
	 * it a second time (sysevent_free() will release it with the event).
	 */
	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
5971