/* spa.c, revision 1b912ec7100c10e7243bf0879af0fe580e08c73d */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>

#ifdef	_KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 *	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};
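
/*
 * Example of reading the table above: the READ row, { ZTI_FIX(8), ZTI_NULL,
 * ZTI_BATCH, ZTI_NULL }, asks for a "zio_read_issue" taskq with 8 fixed
 * threads, no issue_high taskq, a cpu-intensive batch "zio_read_intr" taskq,
 * and no intr_high taskq.  See spa_taskq_create() below for how each mode
 * maps onto the underlying taskq_create() variants.
 */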

static dsl_syncfunc_t spa_sync_version;
static dsl_syncfunc_t spa_sync_props;
static dsl_checkfunc_t spa_change_guid_check;
static dsl_syncfunc_t spa_change_guid_sync;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
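
/*
 * For example, spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, 42,
 * ZPROP_SRC_NONE) leaves nvl with roughly this layout:
 *
 *	"capacity" -> {
 *		ZPROP_SOURCE -> ZPROP_SRC_NONE
 *		ZPROP_VALUE -> 42
 *	}
 */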

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size;
	uint64_t alloc;
	uint64_t space;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		space = 0;
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			space += tvd->vdev_max_asize - tvd->vdev_asize;
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
		    src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		dsl_dir_t *freedir = pool->dp_free_dir;

		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools created before this version, freedir
		 * will be NULL.
		 */
		if (freedir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    freedir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If there is no pool property object, there are no more properties to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
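
/*
 * Sketch of a typical spa_prop_get() consumer (error handling elided);
 * the caller owns and must free the returned nvlist:
 *
 *	nvlist_t *nvp;
 *
 *	if (spa_prop_get(spa, &nvp) == 0) {
 *		for (nvpair_t *elem = nvlist_next_nvpair(nvp, NULL);
 *		    elem != NULL; elem = nvlist_next_nvpair(nvp, elem))
 *			... examine each property ...
 *		nvlist_free(nvp);
 *	}
 */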

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = EINVAL;
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = EINVAL;
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}

			if (intval != 0) {
				error = EINVAL;
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = EINVAL;
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = ENOTSUP;
				} else if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = EINVAL;
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = ENOTSUP;
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task_do(spa_get_dsl(spa), NULL,
			    spa_sync_version, spa, &ver, 6);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 6));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	uint64_t *newguid = arg2;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (ENXIO);

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	uint64_t *newguid = arg2;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%lld new=%lld",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
	    spa_change_guid_sync, spa, &guid, 5);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);

	return (error);
}
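
/*
 * spa_change_guid() above illustrates the dsl_sync_task_do() pattern used
 * throughout this file: the check function (spa_change_guid_check) runs
 * first and may veto the operation, and only if it succeeds does the sync
 * function (spa_change_guid_sync) apply the change in syncing context.
 */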

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}
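
/*
 * For example, the WRITE issue taskq from the table above (batch mode),
 * with sysdc enabled and a dedicated pool process, ends up as roughly:
 *
 *	taskq_create_sysdc("zio_write_issue", zio_taskq_batch_pct, 50,
 *	    INT_MAX, spa->spa_proc, zio_taskq_basedc,
 *	    TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
 */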

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
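
/*
 * The nvlist handed to spa_config_parse() mirrors the vdev tree.  For a
 * single two-way mirror, the ZPOOL_CONFIG_VDEV_TREE nvlist looks roughly
 * like:
 *
 *	type="root"
 *	    children[0]: type="mirror"
 *		children[0]: type="disk", path="/dev/dsk/..."
 *		children[1]: type="disk", path="/dev/dsk/..."
 *
 * Each element of a ZPOOL_CONFIG_CHILDREN array is handed to the recursive
 * call above with its array index as the vdev id.
 */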

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
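
/*
 * Layout assumed by load_nvlist(): the object's bonus buffer holds the
 * packed length as a uint64_t, and the object's data holds the packed
 * nvlist itself; the writer side (spa_sync_nvlist(), later in this file)
 * stores nvlists in this format.
 */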
1471
1472/*
1473 * Checks to see if the given vdev could not be opened, in which case we post a
1474 * sysevent to notify the autoreplace code that the device has been removed.
1475 */
1476static void
1477spa_check_removed(vdev_t *vd)
1478{
1479	for (int c = 0; c < vd->vdev_children; c++)
1480		spa_check_removed(vd->vdev_child[c]);
1481
1482	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
1483		zfs_post_autoreplace(vd->vdev_spa, vd);
1484		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1485	}
1486}
1487
1488/*
1489 * Validate the current config against the MOS config
1490 */
1491static boolean_t
1492spa_config_valid(spa_t *spa, nvlist_t *config)
1493{
1494	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1495	nvlist_t *nv;
1496
1497	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1498
1499	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1500	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1501
1502	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1503
1504	/*
1505	 * If we're doing a normal import, then build up any additional
1506	 * diagnostic information about missing devices in this config.
1507	 * We'll pass this up to the user for further processing.
1508	 */
1509	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1510		nvlist_t **child, *nv;
1511		uint64_t idx = 0;
1512
1513		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1514		    KM_SLEEP);
1515		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1516
1517		for (int c = 0; c < rvd->vdev_children; c++) {
1518			vdev_t *tvd = rvd->vdev_child[c];
1519			vdev_t *mtvd  = mrvd->vdev_child[c];
1520
1521			if (tvd->vdev_ops == &vdev_missing_ops &&
1522			    mtvd->vdev_ops != &vdev_missing_ops &&
1523			    mtvd->vdev_islog)
1524				child[idx++] = vdev_config_generate(spa, mtvd,
1525				    B_FALSE, 0);
1526		}
1527
1528		if (idx) {
1529			VERIFY(nvlist_add_nvlist_array(nv,
1530			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1531			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1532			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1533
1534			for (int i = 0; i < idx; i++)
1535				nvlist_free(child[i]);
1536		}
1537		nvlist_free(nv);
1538		kmem_free(child, rvd->vdev_children * sizeof (char **));
1539	}
1540
1541	/*
1542	 * Compare the root vdev tree with the information we have
1543	 * from the MOS config (mrvd). Check each top-level vdev
1544	 * with the corresponding MOS config top-level (mtvd).
1545	 */
1546	for (int c = 0; c < rvd->vdev_children; c++) {
1547		vdev_t *tvd = rvd->vdev_child[c];
1548		vdev_t *mtvd  = mrvd->vdev_child[c];
1549
1550		/*
1551		 * Resolve any "missing" vdevs in the current configuration.
1552		 * If we find that the MOS config has more accurate information
1553		 * about the top-level vdev then use that vdev instead.
1554		 */
1555		if (tvd->vdev_ops == &vdev_missing_ops &&
1556		    mtvd->vdev_ops != &vdev_missing_ops) {
1557
1558			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1559				continue;
1560
1561			/*
1562			 * Device specific actions.
1563			 */
1564			if (mtvd->vdev_islog) {
1565				spa_set_log_state(spa, SPA_LOG_CLEAR);
1566			} else {
1567				/*
1568				 * XXX - once we have 'readonly' pool
1569				 * support we should be able to handle
1570				 * missing data devices by transitioning
1571				 * the pool to readonly.
1572				 */
1573				continue;
1574			}
1575
1576			/*
1577			 * Swap the missing vdev with the data we were
1578			 * able to obtain from the MOS config.
1579			 */
1580			vdev_remove_child(rvd, tvd);
1581			vdev_remove_child(mrvd, mtvd);
1582
1583			vdev_add_child(rvd, mtvd);
1584			vdev_add_child(mrvd, tvd);
1585
1586			spa_config_exit(spa, SCL_ALL, FTAG);
1587			vdev_load(mtvd);
1588			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1589
1590			vdev_reopen(rvd);
1591		} else if (mtvd->vdev_islog) {
1592			/*
1593			 * Load the slog device's state from the MOS config
1594			 * since it's possible that the label does not
1595			 * contain the most up-to-date information.
1596			 */
1597			vdev_load_log_state(tvd, mtvd);
1598			vdev_reopen(tvd);
1599		}
1600	}
1601	vdev_free(mrvd);
1602	spa_config_exit(spa, SCL_ALL, FTAG);
1603
1604	/*
1605	 * Ensure we were able to validate the config.
1606	 */
1607	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1608}
1609
1610/*
1611 * Check for missing log devices
1612 */
1613static int
1614spa_check_logs(spa_t *spa)
1615{
1616	switch (spa->spa_log_state) {
1617	case SPA_LOG_MISSING:
1618		/* need to recheck in case slog has been restored */
1619	case SPA_LOG_UNKNOWN:
1620		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
1621		    DS_FIND_CHILDREN)) {
1622			spa_set_log_state(spa, SPA_LOG_MISSING);
1623			return (1);
1624		}
1625		break;
1626	}
1627	return (0);
1628}
1629
1630static boolean_t
1631spa_passivate_log(spa_t *spa)
1632{
1633	vdev_t *rvd = spa->spa_root_vdev;
1634	boolean_t slog_found = B_FALSE;
1635
1636	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1637
1638	if (!spa_has_slogs(spa))
1639		return (B_FALSE);
1640
1641	for (int c = 0; c < rvd->vdev_children; c++) {
1642		vdev_t *tvd = rvd->vdev_child[c];
1643		metaslab_group_t *mg = tvd->vdev_mg;
1644
1645		if (tvd->vdev_islog) {
1646			metaslab_group_passivate(mg);
1647			slog_found = B_TRUE;
1648		}
1649	}
1650
1651	return (slog_found);
1652}
1653
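/*
 * Reactivate the metaslab groups of all log top-level vdevs, undoing
 * spa_passivate_log().
 */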
1654static void
1655spa_activate_log(spa_t *spa)
1656{
1657	vdev_t *rvd = spa->spa_root_vdev;
1658
1659	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1660
1661	for (int c = 0; c < rvd->vdev_children; c++) {
1662		vdev_t *tvd = rvd->vdev_child[c];
1663		metaslab_group_t *mg = tvd->vdev_mg;
1664
1665		if (tvd->vdev_islog)
1666			metaslab_group_activate(mg);
1667	}
1668}
1669
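/*
 * Offline the intent logs of all datasets in the pool.
 */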
1670int
1671spa_offline_log(spa_t *spa)
1672{
1673	int error = 0;
1674
1675	if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1676	    NULL, DS_FIND_CHILDREN)) == 0) {
1677
1678		/*
1679		 * We successfully offlined the log device; sync out the
1680		 * current txg so that the "stubby" block can be removed
1681		 * by zil_sync().
1682		 */
1683		txg_wait_synced(spa->spa_dsl_pool, 0);
1684	}
1685	return (error);
1686}
1687
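/*
 * Check each auxiliary vdev (spares and level 2 cache devices) to see
 * whether it has been removed.
 */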
1688static void
1689spa_aux_check_removed(spa_aux_vdev_t *sav)
1690{
1691	for (int i = 0; i < sav->sav_count; i++)
1692		spa_check_removed(sav->sav_vdevs[i]);
1693}
1694
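/*
 * Record the highest block birth txg observed while claiming log
 * blocks; spa_claim_max_txg tells us how far the claim advanced.
 */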
1695void
1696spa_claim_notify(zio_t *zio)
1697{
1698	spa_t *spa = zio->io_spa;
1699
1700	if (zio->io_error)
1701		return;
1702
1703	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
1704	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1705		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1706	mutex_exit(&spa->spa_props_lock);
1707}
1708
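/*
 * Error counts accumulated by the spa_load_verify() traversal below.
 */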
1709typedef struct spa_load_error {
1710	uint64_t	sle_meta_count;
1711	uint64_t	sle_data_count;
1712} spa_load_error_t;
1713
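/*
 * Completion callback for the verification reads issued from
 * spa_load_verify_cb(): classify any I/O error as a metadata or data
 * error and free the temporary buffer.
 */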
1714static void
1715spa_load_verify_done(zio_t *zio)
1716{
1717	blkptr_t *bp = zio->io_bp;
1718	spa_load_error_t *sle = zio->io_private;
1719	dmu_object_type_t type = BP_GET_TYPE(bp);
1720	int error = zio->io_error;
1721
1722	if (error) {
1723		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1724		    type != DMU_OT_INTENT_LOG)
1725			atomic_add_64(&sle->sle_meta_count, 1);
1726		else
1727			atomic_add_64(&sle->sle_data_count, 1);
1728	}
1729	zio_data_buf_free(zio->io_data, zio->io_size);
1730}
1731
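/*
 * Traversal callback: issue an asynchronous, speculative scrub read
 * for every block pointer we visit so that its checksum is verified.
 */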
1732/*ARGSUSED*/
1733static int
1734spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1735    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1736{
1737	if (bp != NULL) {
1738		zio_t *rio = arg;
1739		size_t size = BP_GET_PSIZE(bp);
1740		void *data = zio_data_buf_alloc(size);
1741
1742		zio_nowait(zio_read(rio, spa, bp, data, size,
1743		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1744		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1745		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1746	}
1747	return (0);
1748}
1749
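/*
 * Traverse the pool from spa_verify_min_txg onward, verifying the
 * checksum of every reachable block.  The rewind policy bounds how
 * many metadata and data errors may be tolerated; if verification
 * succeeds, rewind information is recorded in spa_load_info.
 */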
1750static int
1751spa_load_verify(spa_t *spa)
1752{
1753	zio_t *rio;
1754	spa_load_error_t sle = { 0 };
1755	zpool_rewind_policy_t policy;
1756	boolean_t verify_ok = B_FALSE;
1757	int error;
1758
1759	zpool_get_rewind_policy(spa->spa_config, &policy);
1760
1761	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1762		return (0);
1763
1764	rio = zio_root(spa, NULL, &sle,
1765	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1766
1767	error = traverse_pool(spa, spa->spa_verify_min_txg,
1768	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1769
1770	(void) zio_wait(rio);
1771
1772	spa->spa_load_meta_errors = sle.sle_meta_count;
1773	spa->spa_load_data_errors = sle.sle_data_count;
1774
1775	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1776	    sle.sle_data_count <= policy.zrp_maxdata) {
1777		int64_t loss = 0;
1778
1779		verify_ok = B_TRUE;
1780		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1781		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1782
1783		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1784		VERIFY(nvlist_add_uint64(spa->spa_load_info,
1785		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1786		VERIFY(nvlist_add_int64(spa->spa_load_info,
1787		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1788		VERIFY(nvlist_add_uint64(spa->spa_load_info,
1789		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1790	} else {
1791		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1792	}
1793
1794	if (error) {
1795		if (error != ENXIO && error != EIO)
1796			error = EIO;
1797		return (error);
1798	}
1799
1800	return (verify_ok ? 0 : EIO);
1801}
1802
1803/*
1804 * Find a value in the pool props object.
1805 */
1806static void
1807spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1808{
1809	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1810	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1811}
1812
1813/*
1814 * Find a value in the pool directory object.
1815 */
1816static int
1817spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1818{
1819	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1820	    name, sizeof (uint64_t), 1, val));
1821}
1822
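/*
 * Convenience wrapper: mark the given vdev as unable to open for the
 * reason 'aux' and hand back the supplied error code.
 */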
1823static int
1824spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1825{
1826	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1827	return (err);
1828}
1829
1830/*
1831 * Fix up config after a partly-completed split.  This is done with the
1832 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
1833 * pool have that entry in their config, but only the splitting one contains
1834 * a list of all the guids of the vdevs that are being split off.
1835 *
1836 * This function determines what to do with that list: either rejoin
1837 * all the disks to the pool, or complete the splitting process.  To attempt
1838 * the rejoin, each disk that is offlined is marked online again, and
1839 * we do a reopen() call.  If the vdev label for every disk that was
1840 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL),
1841 * then we call vdev_split() on each disk and complete the split.
1842 *
1843 * Otherwise we leave the config alone, with all the vdevs in place in
1844 * the original pool.
1845 */
1846static void
1847spa_try_repair(spa_t *spa, nvlist_t *config)
1848{
1849	uint_t extracted;
1850	uint64_t *glist;
1851	uint_t i, gcount;
1852	nvlist_t *nvl;
1853	vdev_t **vd;
1854	boolean_t attempt_reopen;
1855
1856	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1857		return;
1858
1859	/* check that the config is complete */
1860	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1861	    &glist, &gcount) != 0)
1862		return;
1863
1864	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1865
1866	/* attempt to online all the vdevs & validate */
1867	attempt_reopen = B_TRUE;
1868	for (i = 0; i < gcount; i++) {
1869		if (glist[i] == 0)	/* vdev is hole */
1870			continue;
1871
1872		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1873		if (vd[i] == NULL) {
1874			/*
1875			 * Don't bother attempting to reopen the disks;
1876			 * just do the split.
1877			 */
1878			attempt_reopen = B_FALSE;
1879		} else {
1880			/* attempt to re-online it */
1881			vd[i]->vdev_offline = B_FALSE;
1882		}
1883	}
1884
1885	if (attempt_reopen) {
1886		vdev_reopen(spa->spa_root_vdev);
1887
1888		/* check each device to see what state it's in */
1889		for (extracted = 0, i = 0; i < gcount; i++) {
1890			if (vd[i] != NULL &&
1891			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1892				break;
1893			++extracted;
1894		}
1895	}
1896
1897	/*
1898	 * If every disk has been moved to the new pool, or if we never
1899	 * even attempted to look at them, then we split them off for
1900	 * good.
1901	 */
1902	if (!attempt_reopen || gcount == extracted) {
1903		for (i = 0; i < gcount; i++)
1904			if (vd[i] != NULL)
1905				vdev_split(vd[i]);
1906		vdev_reopen(spa->spa_root_vdev);
1907	}
1908
1909	kmem_free(vd, gcount * sizeof (vdev_t *));
1910}
1911
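/*
 * Load the pool described by spa_config: sanity-check the config,
 * record load metadata (comment, version, txg), and hand off to
 * spa_load_impl() for the real work.  On failure we post an ereport
 * unless the error was EBADF.
 */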
1912static int
1913spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1914    boolean_t mosconfig)
1915{
1916	nvlist_t *config = spa->spa_config;
1917	char *ereport = FM_EREPORT_ZFS_POOL;
1918	char *comment;
1919	int error;
1920	uint64_t pool_guid;
1921	nvlist_t *nvl;
1922
1923	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
1924		return (EINVAL);
1925
1926	ASSERT(spa->spa_comment == NULL);
1927	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
1928		spa->spa_comment = spa_strdup(comment);
1929
1930	/*
1931	 * Versioning wasn't explicitly added to the label until later, so if
1932	 * it's not present, treat it as the initial version.
1933	 */
1934	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1935	    &spa->spa_ubsync.ub_version) != 0)
1936		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1937
1938	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1939	    &spa->spa_config_txg);
1940
1941	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1942	    spa_guid_exists(pool_guid, 0)) {
1943		error = EEXIST;
1944	} else {
1945		spa->spa_config_guid = pool_guid;
1946
1947		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1948		    &nvl) == 0) {
1949			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1950			    KM_SLEEP) == 0);
1951		}
1952
1953		nvlist_free(spa->spa_load_info);
1954		spa->spa_load_info = fnvlist_alloc();
1955
1956		gethrestime(&spa->spa_loaded_ts);
1957		error = spa_load_impl(spa, pool_guid, config, state, type,
1958		    mosconfig, &ereport);
1959	}
1960
1961	spa->spa_minref = refcount_count(&spa->spa_refcount);
1962	if (error) {
1963		if (error != EEXIST) {
1964			spa->spa_loaded_ts.tv_sec = 0;
1965			spa->spa_loaded_ts.tv_nsec = 0;
1966		}
1967		if (error != EBADF) {
1968			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1969		}
1970	}
1971	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1972	spa->spa_ena = 0;
1973
1974	return (error);
1975}
1976
1977/*
1978 * Load an existing storage pool, using the pool's builtin spa_config as a
1979 * source of configuration information.
1980 */
1981static int
1982spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1983    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1984    char **ereport)
1985{
1986	int error = 0;
1987	nvlist_t *nvroot = NULL;
1988	nvlist_t *label;
1989	vdev_t *rvd;
1990	uberblock_t *ub = &spa->spa_uberblock;
1991	uint64_t children, config_cache_txg = spa->spa_config_txg;
1992	int orig_mode = spa->spa_mode;
1993	int parse;
1994	uint64_t obj;
1995	boolean_t missing_feat_write = B_FALSE;
1996
1997	/*
1998	 * If this is an untrusted config, access the pool in read-only mode.
1999	 * This prevents things like resilvering recently removed devices.
2000	 */
2001	if (!mosconfig)
2002		spa->spa_mode = FREAD;
2003
2004	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2005
2006	spa->spa_load_state = state;
2007
2008	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2009		return (EINVAL);
2010
2011	parse = (type == SPA_IMPORT_EXISTING ?
2012	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2013
2014	/*
2015	 * Create "The Godfather" zio to hold all async IOs
2016	 */
2017	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2018	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2019
2020	/*
2021	 * Parse the configuration into a vdev tree.  We explicitly set the
2022	 * value that will be returned by spa_version() since parsing the
2023	 * configuration requires knowing the version number.
2024	 */
2025	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2026	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2027	spa_config_exit(spa, SCL_ALL, FTAG);
2028
2029	if (error != 0)
2030		return (error);
2031
2032	ASSERT(spa->spa_root_vdev == rvd);
2033
2034	if (type != SPA_IMPORT_ASSEMBLE) {
2035		ASSERT(spa_guid(spa) == pool_guid);
2036	}
2037
2038	/*
2039	 * Try to open all vdevs, loading each label in the process.
2040	 */
2041	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2042	error = vdev_open(rvd);
2043	spa_config_exit(spa, SCL_ALL, FTAG);
2044	if (error != 0)
2045		return (error);
2046
2047	/*
2048	 * We need to validate the vdev labels against the configuration that
2049	 * we have in hand, which is dependent on the setting of mosconfig. If
2050	 * mosconfig is true then we're validating the vdev labels based on
2051	 * that config.  Otherwise, we're validating against the cached config
2052	 * (zpool.cache) that was read when we loaded the zfs module, and then
2053	 * later we will recursively call spa_load() and validate against
2054	 * the vdev config.
2055	 *
2056	 * If we're assembling a new pool that's been split off from an
2057	 * existing pool, the labels haven't yet been updated so we skip
2058	 * validation for now.
2059	 */
2060	if (type != SPA_IMPORT_ASSEMBLE) {
2061		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2062		error = vdev_validate(rvd, mosconfig);
2063		spa_config_exit(spa, SCL_ALL, FTAG);
2064
2065		if (error != 0)
2066			return (error);
2067
2068		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2069			return (ENXIO);
2070	}
2071