spa.c revision a9926bf0534670a4dfac3d017036e404b8e903e5
1fa9e406ahrens/*
2fa9e406ahrens * CDDL HEADER START
3fa9e406ahrens *
4fa9e406ahrens * The contents of this file are subject to the terms of the
5ea8dc4beschrock * Common Development and Distribution License (the "License").
6ea8dc4beschrock * You may not use this file except in compliance with the License.
7fa9e406ahrens *
8fa9e406ahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e406ahrens * or http://www.opensolaris.org/os/licensing.
10fa9e406ahrens * See the License for the specific language governing permissions
11fa9e406ahrens * and limitations under the License.
12fa9e406ahrens *
13fa9e406ahrens * When distributing Covered Code, include this CDDL HEADER in each
14fa9e406ahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e406ahrens * If applicable, add the following below this CDDL HEADER, with the
16fa9e406ahrens * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e406ahrens * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e406ahrens *
19fa9e406ahrens * CDDL HEADER END
20fa9e406ahrens */
21fa9e406ahrens/*
22c67d967eschrock * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23fa9e406ahrens * Use is subject to license terms.
24fa9e406ahrens */
25fa9e406ahrens
26fa9e406ahrens#pragma ident	"%Z%%M%	%I%	%E% SMI"
27fa9e406ahrens
28fa9e406ahrens/*
29fa9e406ahrens * This file contains all the routines used when modifying on-disk SPA state.
30fa9e406ahrens * This includes opening, importing, destroying, exporting a pool, and syncing a
31fa9e406ahrens * pool.
32fa9e406ahrens */
33fa9e406ahrens
34fa9e406ahrens#include <sys/zfs_context.h>
35ea8dc4beschrock#include <sys/fm/fs/zfs.h>
36fa9e406ahrens#include <sys/spa_impl.h>
37fa9e406ahrens#include <sys/zio.h>
38fa9e406ahrens#include <sys/zio_checksum.h>
39fa9e406ahrens#include <sys/zio_compress.h>
40fa9e406ahrens#include <sys/dmu.h>
41fa9e406ahrens#include <sys/dmu_tx.h>
42fa9e406ahrens#include <sys/zap.h>
43fa9e406ahrens#include <sys/zil.h>
44fa9e406ahrens#include <sys/vdev_impl.h>
45fa9e406ahrens#include <sys/metaslab.h>
46fa9e406ahrens#include <sys/uberblock_impl.h>
47fa9e406ahrens#include <sys/txg.h>
48fa9e406ahrens#include <sys/avl.h>
49fa9e406ahrens#include <sys/dmu_traverse.h>
50fa9e406ahrens#include <sys/unique.h>
51fa9e406ahrens#include <sys/dsl_pool.h>
52fa9e406ahrens#include <sys/dsl_dir.h>
53fa9e406ahrens#include <sys/dsl_prop.h>
54fa9e406ahrens#include <sys/fs/zfs.h>
55fa9e406ahrens#include <sys/callb.h>
56fa9e406ahrens
57fa9e406ahrens/*
58fa9e406ahrens * ==========================================================================
59fa9e406ahrens * SPA state manipulation (open/create/destroy/import/export)
60fa9e406ahrens * ==========================================================================
61fa9e406ahrens */
62fa9e406ahrens
63ea8dc4beschrockstatic int
64ea8dc4beschrockspa_error_entry_compare(const void *a, const void *b)
65ea8dc4beschrock{
66ea8dc4beschrock	spa_error_entry_t *sa = (spa_error_entry_t *)a;
67ea8dc4beschrock	spa_error_entry_t *sb = (spa_error_entry_t *)b;
68ea8dc4beschrock	int ret;
69ea8dc4beschrock
70ea8dc4beschrock	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
71ea8dc4beschrock	    sizeof (zbookmark_t));
72ea8dc4beschrock
73ea8dc4beschrock	if (ret < 0)
74ea8dc4beschrock		return (-1);
75ea8dc4beschrock	else if (ret > 0)
76ea8dc4beschrock		return (1);
77ea8dc4beschrock	else
78ea8dc4beschrock		return (0);
79ea8dc4beschrock}
80ea8dc4beschrock
81ea8dc4beschrock/*
82ea8dc4beschrock * Utility function which retrieves copies of the current logs and
83ea8dc4beschrock * re-initializes them in the process.
84ea8dc4beschrock */
85ea8dc4beschrockvoid
86ea8dc4beschrockspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
87ea8dc4beschrock{
88ea8dc4beschrock	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
89ea8dc4beschrock
90ea8dc4beschrock	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
91ea8dc4beschrock	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
92ea8dc4beschrock
93ea8dc4beschrock	avl_create(&spa->spa_errlist_scrub,
94ea8dc4beschrock	    spa_error_entry_compare, sizeof (spa_error_entry_t),
95ea8dc4beschrock	    offsetof(spa_error_entry_t, se_avl));
96ea8dc4beschrock	avl_create(&spa->spa_errlist_last,
97ea8dc4beschrock	    spa_error_entry_compare, sizeof (spa_error_entry_t),
98ea8dc4beschrock	    offsetof(spa_error_entry_t, se_avl));
99ea8dc4beschrock}
100ea8dc4beschrock
101fa9e406ahrens/*
102fa9e406ahrens * Activate an uninitialized pool.
103fa9e406ahrens */
104fa9e406ahrensstatic void
105fa9e406ahrensspa_activate(spa_t *spa)
106fa9e406ahrens{
107fa9e406ahrens	int t;
108fa9e406ahrens
109fa9e406ahrens	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
110fa9e406ahrens
111fa9e406ahrens	spa->spa_state = POOL_STATE_ACTIVE;
112fa9e406ahrens
113fa9e406ahrens	spa->spa_normal_class = metaslab_class_create();
114fa9e406ahrens
115fa9e406ahrens	for (t = 0; t < ZIO_TYPES; t++) {
116fa9e406ahrens		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
117fa9e406ahrens		    8, maxclsyspri, 50, INT_MAX,
118fa9e406ahrens		    TASKQ_PREPOPULATE);
119fa9e406ahrens		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
120fa9e406ahrens		    8, maxclsyspri, 50, INT_MAX,
121fa9e406ahrens		    TASKQ_PREPOPULATE);
122fa9e406ahrens	}
123fa9e406ahrens
124fa9e406ahrens	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
125fa9e406ahrens
126fa9e406ahrens	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
127fa9e406ahrens	    offsetof(vdev_t, vdev_dirty_node));
128fa9e406ahrens
129fa9e406ahrens	txg_list_create(&spa->spa_vdev_txg_list,
130fa9e406ahrens	    offsetof(struct vdev, vdev_txg_node));
131ea8dc4beschrock
132ea8dc4beschrock	avl_create(&spa->spa_errlist_scrub,
133ea8dc4beschrock	    spa_error_entry_compare, sizeof (spa_error_entry_t),
134ea8dc4beschrock	    offsetof(spa_error_entry_t, se_avl));
135ea8dc4beschrock	avl_create(&spa->spa_errlist_last,
136ea8dc4beschrock	    spa_error_entry_compare, sizeof (spa_error_entry_t),
137ea8dc4beschrock	    offsetof(spa_error_entry_t, se_avl));
138fa9e406ahrens}
139fa9e406ahrens
140fa9e406ahrens/*
141fa9e406ahrens * Opposite of spa_activate().
142fa9e406ahrens */
143fa9e406ahrensstatic void
144fa9e406ahrensspa_deactivate(spa_t *spa)
145fa9e406ahrens{
146fa9e406ahrens	int t;
147fa9e406ahrens
148fa9e406ahrens	ASSERT(spa->spa_sync_on == B_FALSE);
149fa9e406ahrens	ASSERT(spa->spa_dsl_pool == NULL);
150fa9e406ahrens	ASSERT(spa->spa_root_vdev == NULL);
151fa9e406ahrens
152fa9e406ahrens	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
153fa9e406ahrens
154fa9e406ahrens	txg_list_destroy(&spa->spa_vdev_txg_list);
155fa9e406ahrens
156fa9e406ahrens	list_destroy(&spa->spa_dirty_list);
157fa9e406ahrens
158fa9e406ahrens	rw_destroy(&spa->spa_traverse_lock);
159fa9e406ahrens
160fa9e406ahrens	for (t = 0; t < ZIO_TYPES; t++) {
161fa9e406ahrens		taskq_destroy(spa->spa_zio_issue_taskq[t]);
162fa9e406ahrens		taskq_destroy(spa->spa_zio_intr_taskq[t]);
163fa9e406ahrens		spa->spa_zio_issue_taskq[t] = NULL;
164fa9e406ahrens		spa->spa_zio_intr_taskq[t] = NULL;
165fa9e406ahrens	}
166fa9e406ahrens
167fa9e406ahrens	metaslab_class_destroy(spa->spa_normal_class);
168fa9e406ahrens	spa->spa_normal_class = NULL;
169fa9e406ahrens
170ea8dc4beschrock	/*
171ea8dc4beschrock	 * If this was part of an import or the open otherwise failed, we may
172ea8dc4beschrock	 * still have errors left in the queues.  Empty them just in case.
173ea8dc4beschrock	 */
174ea8dc4beschrock	spa_errlog_drain(spa);
175ea8dc4beschrock
176ea8dc4beschrock	avl_destroy(&spa->spa_errlist_scrub);
177ea8dc4beschrock	avl_destroy(&spa->spa_errlist_last);
178ea8dc4beschrock
179fa9e406ahrens	spa->spa_state = POOL_STATE_UNINITIALIZED;
180fa9e406ahrens}
181fa9e406ahrens
182fa9e406ahrens/*
183fa9e406ahrens * Verify a pool configuration, and construct the vdev tree appropriately.  This
184fa9e406ahrens * will create all the necessary vdevs in the appropriate layout, with each vdev
185fa9e406ahrens * in the CLOSED state.  This will prep the pool before open/creation/import.
186fa9e406ahrens * All vdev validation is done by the vdev_alloc() routine.
187fa9e406ahrens */
188fa9e406ahrensstatic vdev_t *
189fa9e406ahrensspa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
190fa9e406ahrens{
191fa9e406ahrens	nvlist_t **child;
192fa9e406ahrens	uint_t c, children;
193fa9e406ahrens	vdev_t *vd;
194fa9e406ahrens
195fa9e406ahrens	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
196fa9e406ahrens		return (NULL);
197fa9e406ahrens
198fa9e406ahrens	if (vd->vdev_ops->vdev_op_leaf)
199fa9e406ahrens		return (vd);
200fa9e406ahrens
201fa9e406ahrens	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
202fa9e406ahrens	    &child, &children) != 0) {
203fa9e406ahrens		vdev_free(vd);
204fa9e406ahrens		return (NULL);
205fa9e406ahrens	}
206fa9e406ahrens
207fa9e406ahrens	for (c = 0; c < children; c++) {
208fa9e406ahrens		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
209fa9e406ahrens			vdev_free(vd);
210fa9e406ahrens			return (NULL);
211fa9e406ahrens		}
212fa9e406ahrens	}
213fa9e406ahrens
214fa9e406ahrens	return (vd);
215fa9e406ahrens}
216fa9e406ahrens
217fa9e406ahrens/*
218fa9e406ahrens * Opposite of spa_load().
219fa9e406ahrens */
220fa9e406ahrensstatic void
221fa9e406ahrensspa_unload(spa_t *spa)
222fa9e406ahrens{
223fa9e406ahrens	/*
224ea8dc4beschrock	 * Stop async tasks.
225ea8dc4beschrock	 */
226ea8dc4beschrock	spa_async_suspend(spa);
227ea8dc4beschrock
228ea8dc4beschrock	/*
229fa9e406ahrens	 * Stop syncing.
230fa9e406ahrens	 */
231fa9e406ahrens	if (spa->spa_sync_on) {
232fa9e406ahrens		txg_sync_stop(spa->spa_dsl_pool);
233fa9e406ahrens		spa->spa_sync_on = B_FALSE;
234fa9e406ahrens	}
235fa9e406ahrens
236fa9e406ahrens	/*
237fa9e406ahrens	 * Wait for any outstanding prefetch I/O to complete.
238fa9e406ahrens	 */
239ea8dc4beschrock	spa_config_enter(spa, RW_WRITER, FTAG);
240ea8dc4beschrock	spa_config_exit(spa, FTAG);
241fa9e406ahrens
242fa9e406ahrens	/*
243fa9e406ahrens	 * Close the dsl pool.
244fa9e406ahrens	 */
245fa9e406ahrens	if (spa->spa_dsl_pool) {
246fa9e406ahrens		dsl_pool_close(spa->spa_dsl_pool);
247fa9e406ahrens		spa->spa_dsl_pool = NULL;
248fa9e406ahrens	}
249fa9e406ahrens
250fa9e406ahrens	/*
251fa9e406ahrens	 * Close all vdevs.
252fa9e406ahrens	 */
2530e34b6abonwick	if (spa->spa_root_vdev)
254fa9e406ahrens		vdev_free(spa->spa_root_vdev);
2550e34b6abonwick	ASSERT(spa->spa_root_vdev == NULL);
256ea8dc4beschrock
257ea8dc4beschrock	spa->spa_async_suspended = 0;
258fa9e406ahrens}
259fa9e406ahrens
260fa9e406ahrens/*
261fa9e406ahrens * Load an existing storage pool, using the pool's builtin spa_config as a
262ea8dc4beschrock * source of configuration information.
263fa9e406ahrens */
264fa9e406ahrensstatic int
265ea8dc4beschrockspa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
266fa9e406ahrens{
267fa9e406ahrens	int error = 0;
268fa9e406ahrens	nvlist_t *nvroot = NULL;
269fa9e406ahrens	vdev_t *rvd;
270fa9e406ahrens	uberblock_t *ub = &spa->spa_uberblock;
2710373e76bonwick	uint64_t config_cache_txg = spa->spa_config_txg;
272fa9e406ahrens	uint64_t pool_guid;
273fa9e406ahrens	zio_t *zio;
274fa9e406ahrens
275ea8dc4beschrock	spa->spa_load_state = state;
2760373e76bonwick
277fa9e406ahrens	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
278a9926bfbonwick	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
279ea8dc4beschrock		error = EINVAL;
280ea8dc4beschrock		goto out;
281ea8dc4beschrock	}
282fa9e406ahrens
283a9926bfbonwick	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
284a9926bfbonwick	    &spa->spa_config_txg);
285a9926bfbonwick
2860373e76bonwick	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
287ea8dc4beschrock	    spa_guid_exists(pool_guid, 0)) {
288ea8dc4beschrock		error = EEXIST;
289ea8dc4beschrock		goto out;
290ea8dc4beschrock	}
291fa9e406ahrens
292fa9e406ahrens	/*
293fa9e406ahrens	 * Parse the configuration into a vdev tree.
294fa9e406ahrens	 */
295ea8dc4beschrock	spa_config_enter(spa, RW_WRITER, FTAG);
296fa9e406ahrens	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
297ea8dc4beschrock	spa_config_exit(spa, FTAG);
298fa9e406ahrens
299ea8dc4beschrock	if (rvd == NULL) {
300ea8dc4beschrock		error = EINVAL;
301ea8dc4beschrock		goto out;
302ea8dc4beschrock	}
303fa9e406ahrens
3040e34b6abonwick	ASSERT(spa->spa_root_vdev == rvd);
305fa9e406ahrens	ASSERT(spa_guid(spa) == pool_guid);
306fa9e406ahrens
307fa9e406ahrens	/*
308fa9e406ahrens	 * Try to open all vdevs, loading each label in the process.
309fa9e406ahrens	 */
310ea8dc4beschrock	if (vdev_open(rvd) != 0) {
311ea8dc4beschrock		error = ENXIO;
312ea8dc4beschrock		goto out;
313ea8dc4beschrock	}
314fa9e406ahrens
315fa9e406ahrens	/*
316fa9e406ahrens	 * Find the best uberblock.
317fa9e406ahrens	 */
318fa9e406ahrens	bzero(ub, sizeof (uberblock_t));
319fa9e406ahrens
320fa9e406ahrens	zio = zio_root(spa, NULL, NULL,
321fa9e406ahrens	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
322fa9e406ahrens	vdev_uberblock_load(zio, rvd, ub);
323fa9e406ahrens	error = zio_wait(zio);
324fa9e406ahrens
325fa9e406ahrens	/*
326fa9e406ahrens	 * If we weren't able to find a single valid uberblock, return failure.
327fa9e406ahrens	 */
328fa9e406ahrens	if (ub->ub_txg == 0) {
329ea8dc4beschrock		error = ENXIO;
330ea8dc4beschrock		goto out;
331ea8dc4beschrock	}
332ea8dc4beschrock
333ea8dc4beschrock	/*
334ea8dc4beschrock	 * If the pool is newer than the code, we can't open it.
335ea8dc4beschrock	 */
336ea8dc4beschrock	if (ub->ub_version > UBERBLOCK_VERSION) {
337ea8dc4beschrock		error = ENOTSUP;
338ea8dc4beschrock		goto out;
339fa9e406ahrens	}
340fa9e406ahrens
341fa9e406ahrens	/*
342fa9e406ahrens	 * If the vdev guid sum doesn't match the uberblock, we have an
343fa9e406ahrens	 * incomplete configuration.
344fa9e406ahrens	 */
345ecc2d60bonwick	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
346ea8dc4beschrock		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
347ea8dc4beschrock		    VDEV_AUX_BAD_GUID_SUM);
348ea8dc4beschrock		error = ENXIO;
349ea8dc4beschrock		goto out;
350fa9e406ahrens	}
351fa9e406ahrens
352fa9e406ahrens	/*
353fa9e406ahrens	 * Initialize internal SPA structures.
354fa9e406ahrens	 */
355fa9e406ahrens	spa->spa_state = POOL_STATE_ACTIVE;
356fa9e406ahrens	spa->spa_ubsync = spa->spa_uberblock;
357fa9e406ahrens	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
358ea8dc4beschrock	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
359ea8dc4beschrock	if (error) {
360ea8dc4beschrock		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
361ea8dc4beschrock		    VDEV_AUX_CORRUPT_DATA);
362ea8dc4beschrock		goto out;
363ea8dc4beschrock	}
364fa9e406ahrens	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
365fa9e406ahrens
366ea8dc4beschrock	if (zap_lookup(spa->spa_meta_objset,
367fa9e406ahrens	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
368ea8dc4beschrock	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
369ea8dc4beschrock		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
370ea8dc4beschrock		    VDEV_AUX_CORRUPT_DATA);
371ea8dc4beschrock		error = EIO;
372ea8dc4beschrock		goto out;
373ea8dc4beschrock	}
374fa9e406ahrens
375fa9e406ahrens	if (!mosconfig) {
376fa9e406ahrens		dmu_buf_t *db;
377fa9e406ahrens		char *packed = NULL;
378fa9e406ahrens		size_t nvsize = 0;
379fa9e406ahrens		nvlist_t *newconfig = NULL;
380fa9e406ahrens
381ea8dc4beschrock		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
382ea8dc4beschrock		    spa->spa_config_object, FTAG, &db));
383fa9e406ahrens		nvsize = *(uint64_t *)db->db_data;
384ea8dc4beschrock		dmu_buf_rele(db, FTAG);
385fa9e406ahrens
386fa9e406ahrens		packed = kmem_alloc(nvsize, KM_SLEEP);
387ea8dc4beschrock		error = dmu_read(spa->spa_meta_objset,
388fa9e406ahrens		    spa->spa_config_object, 0, nvsize, packed);
389fa9e406ahrens		if (error == 0)
390fa9e406ahrens			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
391fa9e406ahrens		kmem_free(packed, nvsize);
392fa9e406ahrens
393ea8dc4beschrock		if (error) {
394ea8dc4beschrock			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
395ea8dc4beschrock			    VDEV_AUX_CORRUPT_DATA);
396ea8dc4beschrock			error = EIO;
397ea8dc4beschrock			goto out;
398ea8dc4beschrock		}
399fa9e406ahrens
400fa9e406ahrens		spa_config_set(spa, newconfig);
401fa9e406ahrens
402fa9e406ahrens		spa_unload(spa);
403fa9e406ahrens		spa_deactivate(spa);
404fa9e406ahrens		spa_activate(spa);
405fa9e406ahrens
406ea8dc4beschrock		return (spa_load(spa, newconfig, state, B_TRUE));
407fa9e406ahrens	}
408fa9e406ahrens
409ea8dc4beschrock	if (zap_lookup(spa->spa_meta_objset,
410fa9e406ahrens	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
411ea8dc4beschrock	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
412ea8dc4beschrock		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
413ea8dc4beschrock		    VDEV_AUX_CORRUPT_DATA);
414ea8dc4beschrock		error = EIO;
415ea8dc4beschrock		goto out;
416ea8dc4beschrock	}
417fa9e406ahrens
418fa9e406ahrens	/*
419ea8dc4beschrock	 * Load the persistent error log.  If we have an older pool, this will
420ea8dc4beschrock	 * not be present.
421fa9e406ahrens	 */
422ea8dc4beschrock	error = zap_lookup(spa->spa_meta_objset,
423