spa.c revision c67d9675bbc8392fe45f3a7dfbda1ad4daa1eb07
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, and exporting a pool, as
 * well as syncing a pool.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
	    4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

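	/*
	 * Create one issue and one interrupt taskq per zio type.  The
	 * taskq_create() arguments are, in order, the thread count, thread
	 * priority, and the minimum/maximum number of preallocated task
	 * entries; TASKQ_PREPOPULATE fills the cache up front so that
	 * dispatching is unlikely to block for memory.
	 */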
	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	taskq_destroy(spa->spa_vdev_retry_taskq);
	spa->spa_vdev_retry_taskq = NULL;

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}
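
/*
 * For illustration (a sketch, not the output of any tool): the nvlist
 * consumed by spa_config_parse() describes a tree of vdevs.  A two-way
 * mirror looks roughly like this:
 *
 *	type='root'
 *	    children[0]:
 *		type='mirror'
 *		    children[0]: type='disk', path='/dev/dsk/...'
 *		    children[1]: type='disk', path='/dev/dsk/...'
 */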

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
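	/*
	 * (Grabbing the config lock as writer and immediately dropping it
	 * acts as a barrier: the writer lock cannot be acquired until every
	 * current reader, such as in-flight prefetch I/O, has dropped the
	 * lock.)
	 */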
	spa_config_enter(spa, RW_WRITER);
	spa_config_exit(spa);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev) {
		vdev_free(spa->spa_root_vdev);
		spa->spa_root_vdev = NULL;
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.  The 'readonly' flag will prevent us
 * from writing any updated state to disk, and can be used when testing a
 * pool for import.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t pool_guid;
	zio_t *zio;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if (import && spa_guid_exists(pool_guid, 0))
		return (EEXIST);

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa);

	if (rvd == NULL)
		return (EINVAL);

	spa->spa_root_vdev = rvd;
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0)
		return (ENXIO);

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		dprintf("ub_txg is zero\n");
		return (ENXIO);
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		rvd->vdev_state = VDEV_STATE_CANT_OPEN;
		rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
		dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
		    rvd->vdev_guid_sum, ub->ub_guid_sum);
		return (ENXIO);
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) == 0);

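	/*
	 * If this config did not come from the pool's own MOS, read the
	 * authoritative copy from the MOS config object, then unload and
	 * reload using that copy.  The recursion below is bounded because
	 * the second pass runs with mosconfig set to B_TRUE.
	 */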
	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		db = dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object);
		dmu_buf_read(db);
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read_canfail(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error)
			return (ENXIO);

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, readonly, import, B_TRUE));
	}

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);

	/*
	 * Load the vdev state for all top level vdevs.
	 */
	if ((error = vdev_load(rvd, import)) != 0)
		return (error);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
		return (ENXIO);

	/*
	 * Claim log blocks that haven't been committed yet, and update all
	 * top-level vdevs to sync any config changes found in vdev_load().
	 * This must all happen in a single txg.
	 */
	if ((spa_mode & FWRITE) && !readonly) {
		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		vdev_config_dirty(rvd);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}

	return (0);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config,
		    B_FALSE, B_FALSE, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		} else if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}
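
/*
 * Example usage (a sketch, not taken from an actual caller): consumers
 * open a pool by name with a tag identifying the reference, and must
 * balance every successful spa_open() with a spa_close() on the same tag:
 *
 *	spa_t *spa;
 *
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		... use spa ...
 *		spa_close(spa, FTAG);
 *	}
 */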

int
spa_get_stats(const char *name, nvlist_t **config)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
	spa_t *spa;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}
	spa = spa_add(pool);

	/*
	 * Activate the spa_t structure allocated above.
	 */
	spa_activate(spa);

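	/*
	 * Start the uberblock one txg before TXG_INITIAL, so that the first
	 * txg this pool ever syncs is TXG_INITIAL itself.
	 */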
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_ubsync = spa->spa_uberblock;

	error = spa_vdev_add(spa, nvroot);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (altroot != NULL) {
		spa->spa_root = spa_strdup(altroot);
		atomic_add_32(&spa_active_count, 1);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
	 * so that we don't try to open the pool if the config is damaged.
	 */
	error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot != NULL) {
		atomic_add_32(&spa_active_count, 1);
		spa->spa_root = spa_strdup(altroot);
	}

	/*
	 * Initialize the config based on the in-core state.
	 */
	config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);

	spa_config_set(spa, config);

	/*
	 * Sync the configuration cache.
	 */
	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME);

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	/*
	 * Initialize the spa_t structure.
	 */
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
	 * so we don't try to open the pool if the config is damaged.
	 */
	(void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		if (!spa_refcount_zero(spa)) {
			spa_scrub_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * Update the pool state.
		 */
		spa->spa_state = new_state;

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		if (spa->spa_root != NULL)
			atomic_add_32(&spa_active_count, -1);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		vdev_config_dirty(spa->spa_root_vdev);
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	spa_remove(spa);
	spa_config_sync();
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (rvd == NULL)			/* spa_create() */
		spa->spa_root_vdev = rvd = vd;

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each top-level vdev from the temporary root
	 * to the spa's root and initialize its metaslabs.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *tvd = vd->vdev_child[c];
		if (vd != rvd) {
			vdev_remove_child(vd, tvd);
			tvd->vdev_id = rvd->vdev_children;
			vdev_add_child(rvd, tvd);
		}
		vdev_init(tvd, txg);
		vdev_config_dirty(tvd);
	}

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	return (spa_vdev_exit(spa, vd, txg, 0));
}
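
/*
 * A note on the locking pattern above (and in the attach/detach routines
 * below), inferred from the call sites here rather than stated by the
 * implementation: spa_vdev_enter() takes the config lock as writer and
 * returns the txg in which the change will commit; spa_vdev_exit() frees
 * the temporary vdev tree passed to it (if any), drops the lock, and
 * waits for that txg to sync, so a successful return means the change
 * has reached disk.
 */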

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_path(rvd, path);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

	dprintf("attached %s, replacing=%d\n", path, replacing);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_path(rvd, path);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (guid != 0 && vd->vdev_guid != guid)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
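	/*
	 * The loop above breaks as soon as it finds one live sibling with
	 * an empty DTL.  If it instead ran to completion
	 * (c == pvd->vdev_children), no such sibling exists and detaching
	 * vd could destroy the only valid copy of some data.
	 */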
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd, NULL);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 */
	vdev_metaslab_init(tvd, txg);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	vd->vdev_detached = B_TRUE;
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);

	dprintf("detached %s\n", path);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * If there are any replacing vdevs that have finished replacing, detach them.
 * We can't hold the config lock across detaches, so we lock the config,