spa_misc.c revision 88b7b0f29b20b808b9e06071885b1d6a3ddb6328
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/zfs_context.h>
27#include <sys/spa_impl.h>
28#include <sys/zio.h>
29#include <sys/zio_checksum.h>
30#include <sys/zio_compress.h>
31#include <sys/dmu.h>
32#include <sys/dmu_tx.h>
33#include <sys/zap.h>
34#include <sys/zil.h>
35#include <sys/vdev_impl.h>
36#include <sys/metaslab.h>
37#include <sys/uberblock_impl.h>
38#include <sys/txg.h>
39#include <sys/avl.h>
40#include <sys/unique.h>
41#include <sys/dsl_pool.h>
42#include <sys/dsl_dir.h>
43#include <sys/dsl_prop.h>
44#include <sys/fs/zfs.h>
45#include <sys/metaslab_impl.h>
46#include <sys/sunddi.h>
47#include <sys/arc.h>
48#include "zfs_prop.h"
49
50/*
51 * SPA locking
52 *
53 * There are three basic locks for managing spa_t structures:
54 *
55 * spa_namespace_lock (global mutex)
56 *
57 *	This lock must be acquired to do any of the following:
58 *
59 *		- Lookup a spa_t by name
60 *		- Add or remove a spa_t from the namespace
61 *		- Increase spa_refcount from zero
62 *		- Check if spa_refcount is zero
63 *		- Rename a spa_t
64 *		- add/remove/attach/detach devices
65 *		- Held for the duration of create/destroy/import/export
66 *
67 *	It does not need to handle recursion.  A create or destroy may
68 *	reference objects (files or zvols) in other pools, but by
69 *	definition they must have an existing reference, and will never need
70 * to look up a spa_t by name.
71 *
72 * spa_refcount (per-spa refcount_t protected by mutex)
73 *
74 *	This reference count keeps track of any active users of the spa_t.  The
75 *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
76 *	the refcount is never really 'zero' - opening a pool implicitly keeps
77 *	some references in the DMU.  Internally we check against spa_minref, but
78 *	present the image of a zero/non-zero value to consumers.
79 *
80 * spa_config_lock[] (per-spa array of rwlocks)
81 *
82 *	This protects the spa_t from config changes, and must be held in
83 *	the following circumstances:
84 *
85 *		- RW_READER to perform I/O to the spa
86 *		- RW_WRITER to change the vdev config
87 *
88 * The locking order is fairly straightforward:
89 *
90 *		spa_namespace_lock	->	spa_refcount
91 *
92 *	The namespace lock must be acquired to increase the refcount from 0
93 *	or to check if it is zero.
94 *
95 *		spa_refcount		->	spa_config_lock[]
96 *
97 *	There must be at least one valid reference on the spa_t to acquire
98 *	the config lock.
99 *
100 *		spa_namespace_lock	->	spa_config_lock[]
101 *
102 *	The namespace lock must always be taken before the config lock.
103 *
104 *
105 * The spa_namespace_lock can be acquired directly and is globally visible.
106 *
107 * The namespace is manipulated using the following functions, all of which
108 * require the spa_namespace_lock to be held.
109 *
110 *	spa_lookup()		Lookup a spa_t by name.
111 *
112 *	spa_add()		Create a new spa_t in the namespace.
113 *
114 *	spa_remove()		Remove a spa_t from the namespace.  This also
115 *				frees up any memory associated with the spa_t.
116 *
117 *	spa_next()		Returns the next spa_t in the system, or the
118 *				first if NULL is passed.
119 *
120 *	spa_evict_all()		Shutdown and remove all spa_t structures in
121 *				the system.
122 *
123 *	spa_guid_exists()	Determine whether a pool/device guid exists.
124 *
125 * The spa_refcount is manipulated using the following functions:
126 *
127 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
128 *				called with spa_namespace_lock held if the
129 *				refcount is currently zero.
130 *
131 *	spa_close()		Remove a reference from the spa_t.  This will
132 *				not free the spa_t or remove it from the
133 *				namespace.  No locking is required.
134 *
135 *	spa_refcount_zero()	Returns true if the refcount is currently
136 *				zero.  Must be called with spa_namespace_lock
137 *				held.
138 *
139 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
140 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
141 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
142 *
143 * To read the configuration, it suffices to hold one of these locks as reader.
144 * To modify the configuration, you must hold all locks as writer.  To modify
145 * vdev state without altering the vdev tree's topology (e.g. online/offline),
146 * you must hold SCL_STATE and SCL_ZIO as writer.
147 *
148 * We use these distinct config locks to avoid recursive lock entry.
149 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
150 * block allocations (SCL_ALLOC), which may require reading space maps
151 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
152 *
153 * The spa config locks cannot be normal rwlocks because we need the
154 * ability to hand off ownership.  For example, SCL_ZIO is acquired
155 * by the issuing thread and later released by an interrupt thread.
156 * They do, however, obey the usual write-wanted semantics to prevent
157 * writer (i.e. system administrator) starvation.
158 *
159 * The lock acquisition rules are as follows:
160 *
161 * SCL_CONFIG
162 *	Protects changes to the vdev tree topology, such as vdev
163 *	add/remove/attach/detach.  Protects the dirty config list
164 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
165 *
166 * SCL_STATE
167 *	Protects changes to pool state and vdev state, such as vdev
168 *	online/offline/fault/degrade/clear.  Protects the dirty state list
169 *	(spa_state_dirty_list) and global pool state (spa_state).
170 *
171 * SCL_ALLOC
172 *	Protects changes to metaslab groups and classes.
173 *	Held as reader by metaslab_alloc() and metaslab_claim().
174 *
175 * SCL_ZIO
176 *	Held by bp-level zios (those which have no io_vd upon entry)
177 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
178 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
179 *
180 * SCL_FREE
181 *	Protects changes to metaslab groups and classes.
182 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
183 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
184 *	blocks in zio_done() while another i/o that holds either
185 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
186 *
187 * SCL_VDEV
188 *	Held as reader to prevent changes to the vdev tree during trivial
189 *	inquiries such as bp_get_dasize().  SCL_VDEV is distinct from the
190 *	other locks, and lower than all of them, to ensure that it's safe
191 *	to acquire regardless of caller context.
192 *
193 * In addition, the following rules apply:
194 *
195 * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
196 *	The lock ordering is SCL_CONFIG > spa_props_lock.
197 *
198 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
199 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
200 *	or zio_write_phys() -- the caller must ensure that the config cannot
201 *	change in the interim, and that the vdev cannot be reopened.
202 *	SCL_STATE as reader suffices for both.
203 *
204 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
205 *
206 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
207 *				for writing.
208 *
209 *	spa_vdev_exit()		Release the config lock, wait for all I/O
210 *				to complete, sync the updated configs to the
211 *				cache, and release the namespace lock.
212 *
213 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
214 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
215 * locking is, always, based on spa_namespace_lock and spa_config_lock[].
216 *
217 * spa_rename() is also implemented within this file since it requires
218 * manipulation of the namespace.
219 */
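
/*
 * Illustrative sketch (hypothetical caller, shown only to make the rules
 * above concrete): a consumer that merely inspects the vdev tree takes a
 * single lock as reader and drops it when done; topology changes instead go
 * through the spa_vdev_enter()/spa_vdev_exit() wrappers defined below.
 *
 *	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 *	... walk spa->spa_root_vdev ...
 *	spa_config_exit(spa, SCL_VDEV, FTAG);
 */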
220
221static avl_tree_t spa_namespace_avl;
222kmutex_t spa_namespace_lock;
223static kcondvar_t spa_namespace_cv;
224static int spa_active_count;
225int spa_max_replication_override = SPA_DVAS_PER_BP;
226
227static kmutex_t spa_spare_lock;
228static avl_tree_t spa_spare_avl;
229static kmutex_t spa_l2cache_lock;
230static avl_tree_t spa_l2cache_avl;
231
232kmem_cache_t *spa_buffer_pool;
233int spa_mode;
234
235#ifdef ZFS_DEBUG
236/* Everything except dprintf is on by default in debug builds */
237int zfs_flags = ~ZFS_DEBUG_DPRINTF;
238#else
239int zfs_flags = 0;
240#endif
241
242/*
243 * zfs_recover can be set to nonzero to attempt to recover from
244 * otherwise-fatal errors, typically caused by on-disk corruption.  When
245 * set, calls to zfs_panic_recover() will turn into warning messages.
246 */
247int zfs_recover = 0;
248
249
250/*
251 * ==========================================================================
252 * SPA config locking
253 * ==========================================================================
254 */
255static void
256spa_config_lock_init(spa_t *spa)
257{
258	for (int i = 0; i < SCL_LOCKS; i++) {
259		spa_config_lock_t *scl = &spa->spa_config_lock[i];
260		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
261		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
262		refcount_create(&scl->scl_count);
263		scl->scl_writer = NULL;
264		scl->scl_write_wanted = 0;
265	}
266}
267
268static void
269spa_config_lock_destroy(spa_t *spa)
270{
271	for (int i = 0; i < SCL_LOCKS; i++) {
272		spa_config_lock_t *scl = &spa->spa_config_lock[i];
273		mutex_destroy(&scl->scl_lock);
274		cv_destroy(&scl->scl_cv);
275		refcount_destroy(&scl->scl_count);
276		ASSERT(scl->scl_writer == NULL);
277		ASSERT(scl->scl_write_wanted == 0);
278	}
279}
280
281int
282spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
283{
284	for (int i = 0; i < SCL_LOCKS; i++) {
285		spa_config_lock_t *scl = &spa->spa_config_lock[i];
286		if (!(locks & (1 << i)))
287			continue;
288		mutex_enter(&scl->scl_lock);
289		if (rw == RW_READER) {
290			if (scl->scl_writer || scl->scl_write_wanted) {
291				mutex_exit(&scl->scl_lock);
292				spa_config_exit(spa, locks ^ (1 << i), tag);
293				return (0);
294			}
295		} else {
296			ASSERT(scl->scl_writer != curthread);
297			if (!refcount_is_zero(&scl->scl_count)) {
298				mutex_exit(&scl->scl_lock);
299				spa_config_exit(spa, locks ^ (1 << i), tag);
300				return (0);
301			}
302			scl->scl_writer = curthread;
303		}
304		(void) refcount_add(&scl->scl_count, tag);
305		mutex_exit(&scl->scl_lock);
306	}
307	return (1);
308}
309
310void
311spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
312{
313	for (int i = 0; i < SCL_LOCKS; i++) {
314		spa_config_lock_t *scl = &spa->spa_config_lock[i];
315		if (!(locks & (1 << i)))
316			continue;
317		mutex_enter(&scl->scl_lock);
318		if (rw == RW_READER) {
319			while (scl->scl_writer || scl->scl_write_wanted) {
320				cv_wait(&scl->scl_cv, &scl->scl_lock);
321			}
322		} else {
323			ASSERT(scl->scl_writer != curthread);
324			while (!refcount_is_zero(&scl->scl_count)) {
325				scl->scl_write_wanted++;
326				cv_wait(&scl->scl_cv, &scl->scl_lock);
327				scl->scl_write_wanted--;
328			}
329			scl->scl_writer = curthread;
330		}
331		(void) refcount_add(&scl->scl_count, tag);
332		mutex_exit(&scl->scl_lock);
333	}
334}
335
336void
337spa_config_exit(spa_t *spa, int locks, void *tag)
338{
339	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
340		spa_config_lock_t *scl = &spa->spa_config_lock[i];
341		if (!(locks & (1 << i)))
342			continue;
343		mutex_enter(&scl->scl_lock);
344		ASSERT(!refcount_is_zero(&scl->scl_count));
345		if (refcount_remove(&scl->scl_count, tag) == 0) {
346			ASSERT(scl->scl_writer == NULL ||
347			    scl->scl_writer == curthread);
348			scl->scl_writer = NULL;	/* OK in either case */
349			cv_broadcast(&scl->scl_cv);
350		}
351		mutex_exit(&scl->scl_lock);
352	}
353}
354
355int
356spa_config_held(spa_t *spa, int locks, krw_t rw)
357{
358	int locks_held = 0;
359
360	for (int i = 0; i < SCL_LOCKS; i++) {
361		spa_config_lock_t *scl = &spa->spa_config_lock[i];
362		if (!(locks & (1 << i)))
363			continue;
364		if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
365		    (rw == RW_WRITER && scl->scl_writer == curthread))
366			locks_held |= 1 << i;
367	}
368
369	return (locks_held);
370}
371
372/*
373 * ==========================================================================
374 * SPA namespace functions
375 * ==========================================================================
376 */
377
378/*
379 * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
380 * Returns NULL if no matching spa_t is found.
381 */
382spa_t *
383spa_lookup(const char *name)
384{
385	static spa_t search;	/* spa_t is large; don't allocate on stack */
386	spa_t *spa;
387	avl_index_t where;
388	char c;
389	char *cp;
390
391	ASSERT(MUTEX_HELD(&spa_namespace_lock));
392
393	/*
394	 * If it's a full dataset name, figure out the pool name and
395	 * just use that.
396	 */
397	cp = strpbrk(name, "/@");
398	if (cp) {
399		c = *cp;
400		*cp = '\0';
401	}
402
403	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
404	spa = avl_find(&spa_namespace_avl, &search, &where);
405
406	if (cp)
407		*cp = c;
408
409	return (spa);
410}
411
412/*
413 * Create an uninitialized spa_t with the given name.  Requires
414 * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
415 * exist by calling spa_lookup() first.
416 */
417spa_t *
418spa_add(const char *name, const char *altroot)
419{
420	spa_t *spa;
421	spa_config_dirent_t *dp;
422
423	ASSERT(MUTEX_HELD(&spa_namespace_lock));
424
425	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
426
427	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
428	mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL);
429	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
430	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
431	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
432	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
433	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
434	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
435
436	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
437	cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL);
438	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
439	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
440
441	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
442	spa->spa_state = POOL_STATE_UNINITIALIZED;
443	spa->spa_freeze_txg = UINT64_MAX;
444	spa->spa_final_txg = UINT64_MAX;
445
446	refcount_create(&spa->spa_refcount);
447	spa_config_lock_init(spa);
448
449	avl_add(&spa_namespace_avl, spa);
450
451	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
452
453	/*
454	 * Set the alternate root, if there is one.
455	 */
456	if (altroot) {
457		spa->spa_root = spa_strdup(altroot);
458		spa_active_count++;
459	}
460
461	/*
462	 * Every pool starts with the default cachefile
463	 */
464	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
465	    offsetof(spa_config_dirent_t, scd_link));
466
467	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
468	dp->scd_path = spa_strdup(spa_config_path);
469	list_insert_head(&spa->spa_config_list, dp);
470
471	return (spa);
472}
473
474/*
475 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
476 * spa_namespace_lock.  This is called only after the spa_t has been closed and
477 * deactivated.
478 */
479void
480spa_remove(spa_t *spa)
481{
482	spa_config_dirent_t *dp;
483
484	ASSERT(MUTEX_HELD(&spa_namespace_lock));
485	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
486
487	avl_remove(&spa_namespace_avl, spa);
488	cv_broadcast(&spa_namespace_cv);
489
490	if (spa->spa_root) {
491		spa_strfree(spa->spa_root);
492		spa_active_count--;
493	}
494
495	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
496		list_remove(&spa->spa_config_list, dp);
497		if (dp->scd_path != NULL)
498			spa_strfree(dp->scd_path);
499		kmem_free(dp, sizeof (spa_config_dirent_t));
500	}
501
502	list_destroy(&spa->spa_config_list);
503
504	spa_config_set(spa, NULL);
505
506	refcount_destroy(&spa->spa_refcount);
507
508	spa_config_lock_destroy(spa);
509
510	cv_destroy(&spa->spa_async_cv);
511	cv_destroy(&spa->spa_async_root_cv);
512	cv_destroy(&spa->spa_scrub_io_cv);
513	cv_destroy(&spa->spa_suspend_cv);
514
515	mutex_destroy(&spa->spa_async_lock);
516	mutex_destroy(&spa->spa_async_root_lock);
517	mutex_destroy(&spa->spa_scrub_lock);
518	mutex_destroy(&spa->spa_errlog_lock);
519	mutex_destroy(&spa->spa_errlist_lock);
520	mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
521	mutex_destroy(&spa->spa_history_lock);
522	mutex_destroy(&spa->spa_props_lock);
523	mutex_destroy(&spa->spa_suspend_lock);
524
525	kmem_free(spa, sizeof (spa_t));
526}
527
528/*
529 * Given a pool, return the next pool in the namespace, or NULL if there is
530 * none.  If 'prev' is NULL, return the first pool.
531 */
532spa_t *
533spa_next(spa_t *prev)
534{
535	ASSERT(MUTEX_HELD(&spa_namespace_lock));
536
537	if (prev)
538		return (AVL_NEXT(&spa_namespace_avl, prev));
539	else
540		return (avl_first(&spa_namespace_avl));
541}
542
543/*
544 * ==========================================================================
545 * SPA refcount functions
546 * ==========================================================================
547 */
548
549/*
550 * Add a reference to the given spa_t.  Must have at least one reference, or
551 * have the namespace lock held.
552 */
553void
554spa_open_ref(spa_t *spa, void *tag)
555{
556	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
557	    MUTEX_HELD(&spa_namespace_lock));
558	(void) refcount_add(&spa->spa_refcount, tag);
559}
560
561/*
562 * Remove a reference to the given spa_t.  Must have at least one reference, or
563 * have the namespace lock held.
564 */
565void
566spa_close(spa_t *spa, void *tag)
567{
568	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
569	    MUTEX_HELD(&spa_namespace_lock));
570	(void) refcount_remove(&spa->spa_refcount, tag);
571}
572
573/*
574 * Check to see if the spa refcount is zero.  Must be called with
575 * spa_namespace_lock held.  We really compare against spa_minref, which is the
576 * number of references acquired when opening a pool.
577 */
578boolean_t
579spa_refcount_zero(spa_t *spa)
580{
581	ASSERT(MUTEX_HELD(&spa_namespace_lock));
582
583	return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
584}
585
586/*
587 * ==========================================================================
588 * SPA spare and l2cache tracking
589 * ==========================================================================
590 */
591
592/*
593 * Hot spares and cache devices are tracked using the same code below,
594 * for 'auxiliary' devices.
595 */
596
597typedef struct spa_aux {
598	uint64_t	aux_guid;
599	uint64_t	aux_pool;
600	avl_node_t	aux_avl;
601	int		aux_count;
602} spa_aux_t;
603
604static int
605spa_aux_compare(const void *a, const void *b)
606{
607	const spa_aux_t *sa = a;
608	const spa_aux_t *sb = b;
609
610	if (sa->aux_guid < sb->aux_guid)
611		return (-1);
612	else if (sa->aux_guid > sb->aux_guid)
613		return (1);
614	else
615		return (0);
616}
617
618void
619spa_aux_add(vdev_t *vd, avl_tree_t *avl)
620{
621	avl_index_t where;
622	spa_aux_t search;
623	spa_aux_t *aux;
624
625	search.aux_guid = vd->vdev_guid;
626	if ((aux = avl_find(avl, &search, &where)) != NULL) {
627		aux->aux_count++;
628	} else {
629		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
630		aux->aux_guid = vd->vdev_guid;
631		aux->aux_count = 1;
632		avl_insert(avl, aux, where);
633	}
634}
635
636void
637spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
638{
639	spa_aux_t search;
640	spa_aux_t *aux;
641	avl_index_t where;
642
643	search.aux_guid = vd->vdev_guid;
644	aux = avl_find(avl, &search, &where);
645
646	ASSERT(aux != NULL);
647
648	if (--aux->aux_count == 0) {
649		avl_remove(avl, aux);
650		kmem_free(aux, sizeof (spa_aux_t));
651	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
652		aux->aux_pool = 0ULL;
653	}
654}
655
656boolean_t
657spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
658{
659	spa_aux_t search, *found;
660
661	search.aux_guid = guid;
662	found = avl_find(avl, &search, NULL);
663
664	if (pool) {
665		if (found)
666			*pool = found->aux_pool;
667		else
668			*pool = 0ULL;
669	}
670
671	if (refcnt) {
672		if (found)
673			*refcnt = found->aux_count;
674		else
675			*refcnt = 0;
676	}
677
678	return (found != NULL);
679}
680
681void
682spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
683{
684	spa_aux_t search, *found;
685	avl_index_t where;
686
687	search.aux_guid = vd->vdev_guid;
688	found = avl_find(avl, &search, &where);
689	ASSERT(found != NULL);
690	ASSERT(found->aux_pool == 0ULL);
691
692	found->aux_pool = spa_guid(vd->vdev_spa);
693}
694
695/*
696 * Spares are tracked globally due to the following constraints:
697 *
698 * 	- A spare may be part of multiple pools.
699 * 	- A spare may be added to a pool even if it's actively in use within
700 *	  another pool.
701 * 	- A spare in use in any pool can only be the source of a replacement if
702 *	  the target is a spare in the same pool.
703 *
704 * We keep track of all spares on the system through the use of a reference
705 * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
706 * spare, we bump the reference count in the AVL tree.  In addition, we set
707 * the 'vdev_isspare' member to indicate that the device is a spare (active or
708 * inactive).  When a spare is made active (used to replace a device in the
709 * pool), we also keep track of which pool it's been made a part of.
710 *
711 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
712 * called under the spa_namespace lock as part of vdev reconfiguration.  The
713 * separate spare lock exists for the status query path, which does not need to
714 * be completely consistent with respect to other vdev configuration changes.
715 */
716
717static int
718spa_spare_compare(const void *a, const void *b)
719{
720	return (spa_aux_compare(a, b));
721}
722
723void
724spa_spare_add(vdev_t *vd)
725{
726	mutex_enter(&spa_spare_lock);
727	ASSERT(!vd->vdev_isspare);
728	spa_aux_add(vd, &spa_spare_avl);
729	vd->vdev_isspare = B_TRUE;
730	mutex_exit(&spa_spare_lock);
731}
732
733void
734spa_spare_remove(vdev_t *vd)
735{
736	mutex_enter(&spa_spare_lock);
737	ASSERT(vd->vdev_isspare);
738	spa_aux_remove(vd, &spa_spare_avl);
739	vd->vdev_isspare = B_FALSE;
740	mutex_exit(&spa_spare_lock);
741}
742
743boolean_t
744spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
745{
746	boolean_t found;
747
748	mutex_enter(&spa_spare_lock);
749	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
750	mutex_exit(&spa_spare_lock);
751
752	return (found);
753}
754
755void
756spa_spare_activate(vdev_t *vd)
757{
758	mutex_enter(&spa_spare_lock);
759	ASSERT(vd->vdev_isspare);
760	spa_aux_activate(vd, &spa_spare_avl);
761	mutex_exit(&spa_spare_lock);
762}
763
764/*
765 * Level 2 ARC devices are tracked globally for the same reasons as spares.
766 * Cache devices currently only support one pool per cache device, and so
767 * for these devices the aux reference count is currently unused beyond 1.
768 */
769
770static int
771spa_l2cache_compare(const void *a, const void *b)
772{
773	return (spa_aux_compare(a, b));
774}
775
776void
777spa_l2cache_add(vdev_t *vd)
778{
779	mutex_enter(&spa_l2cache_lock);
780	ASSERT(!vd->vdev_isl2cache);
781	spa_aux_add(vd, &spa_l2cache_avl);
782	vd->vdev_isl2cache = B_TRUE;
783	mutex_exit(&spa_l2cache_lock);
784}
785
786void
787spa_l2cache_remove(vdev_t *vd)
788{
789	mutex_enter(&spa_l2cache_lock);
790	ASSERT(vd->vdev_isl2cache);
791	spa_aux_remove(vd, &spa_l2cache_avl);
792	vd->vdev_isl2cache = B_FALSE;
793	mutex_exit(&spa_l2cache_lock);
794}
795
796boolean_t
797spa_l2cache_exists(uint64_t guid, uint64_t *pool)
798{
799	boolean_t found;
800
801	mutex_enter(&spa_l2cache_lock);
802	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
803	mutex_exit(&spa_l2cache_lock);
804
805	return (found);
806}
807
808void
809spa_l2cache_activate(vdev_t *vd)
810{
811	mutex_enter(&spa_l2cache_lock);
812	ASSERT(vd->vdev_isl2cache);
813	spa_aux_activate(vd, &spa_l2cache_avl);
814	mutex_exit(&spa_l2cache_lock);
815}
816
817void
818spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
819{
820	vdev_space_update(vd, space, alloc, B_FALSE);
821}
822
823/*
824 * ==========================================================================
825 * SPA vdev locking
826 * ==========================================================================
827 */
828
829/*
830 * Lock the given spa_t for the purpose of adding or removing a vdev.
831 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
832 * It returns the next transaction group for the spa_t.
833 */
834uint64_t
835spa_vdev_enter(spa_t *spa)
836{
837	mutex_enter(&spa_namespace_lock);
838
839	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
840
841	return (spa_last_synced_txg(spa) + 1);
842}
843
844/*
845 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
846 * locking of spa_vdev_enter(), we also want to make sure the transactions have
847 * synced to disk, and then update the global configuration cache with the new
848 * information.
849 */
850int
851spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
852{
853	int config_changed = B_FALSE;
854
855	ASSERT(txg > spa_last_synced_txg(spa));
856
857	spa->spa_pending_vdev = NULL;
858
859	/*
860	 * Reassess the DTLs.
861	 */
862	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
863
864	/*
865	 * If the config changed, notify the scrub thread that it must restart.
866	 */
867	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
868		dsl_pool_scrub_restart(spa->spa_dsl_pool);
869		config_changed = B_TRUE;
870	}
871
872	spa_config_exit(spa, SCL_ALL, spa);
873
874	/*
875	 * Note: this txg_wait_synced() is important because it ensures
876	 * that there won't be more than one config change per txg.
877	 * This allows us to use the txg as the generation number.
878	 */
879	if (error == 0)
880		txg_wait_synced(spa->spa_dsl_pool, txg);
881
882	if (vd != NULL) {
883		ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
884		vdev_free(vd);
885	}
886
887	/*
888	 * If the config changed, update the config cache.
889	 */
890	if (config_changed)
891		spa_config_sync(spa, B_FALSE, B_TRUE);
892
893	mutex_exit(&spa_namespace_lock);
894
895	return (error);
896}
897
898/*
899 * Lock the given spa_t for the purpose of changing vdev state.
900 */
901void
902spa_vdev_state_enter(spa_t *spa)
903{
904	spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
905}
906
907int
908spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
909{
910	if (vd != NULL)
911		vdev_state_dirty(vd->vdev_top);
912
913	spa_config_exit(spa, SCL_STATE_ALL, spa);
914
915	return (error);
916}
917
918/*
919 * ==========================================================================
920 * Miscellaneous functions
921 * ==========================================================================
922 */
923
924/*
925 * Rename a spa_t.
926 */
927int
928spa_rename(const char *name, const char *newname)
929{
930	spa_t *spa;
931	int err;
932
933	/*
934	 * Lookup the spa_t and grab the config lock for writing.  We need to
935	 * actually open the pool so that we can sync out the necessary labels.
936	 * It's OK to call spa_open() with the namespace lock held because we
937	 * allow recursive calls for other reasons.
938	 */
939	mutex_enter(&spa_namespace_lock);
940	if ((err = spa_open(name, &spa, FTAG)) != 0) {
941		mutex_exit(&spa_namespace_lock);
942		return (err);
943	}
944
945	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
946
947	avl_remove(&spa_namespace_avl, spa);
948	(void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
949	avl_add(&spa_namespace_avl, spa);
950
951	/*
952	 * Sync all labels to disk with the new names by marking the root vdev
953	 * dirty and waiting for it to sync.  It will pick up the new pool name
954	 * during the sync.
955	 */
956	vdev_config_dirty(spa->spa_root_vdev);
957
958	spa_config_exit(spa, SCL_ALL, FTAG);
959
960	txg_wait_synced(spa->spa_dsl_pool, 0);
961
962	/*
963	 * Sync the updated config cache.
964	 */
965	spa_config_sync(spa, B_FALSE, B_TRUE);
966
967	spa_close(spa, FTAG);
968
969	mutex_exit(&spa_namespace_lock);
970
971	return (0);
972}
973
974
975/*
976 * Determine whether a pool with given pool_guid exists.  If device_guid is
977 * non-zero, determine whether the pool exists *and* contains a device with the
978 * specified device_guid.
979 */
980boolean_t
981spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
982{
983	spa_t *spa;
984	avl_tree_t *t = &spa_namespace_avl;
985
986	ASSERT(MUTEX_HELD(&spa_namespace_lock));
987
988	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
989		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
990			continue;
991		if (spa->spa_root_vdev == NULL)
992			continue;
993		if (spa_guid(spa) == pool_guid) {
994			if (device_guid == 0)
995				break;
996
997			if (vdev_lookup_by_guid(spa->spa_root_vdev,
998			    device_guid) != NULL)
999				break;
1000
1001			/*
1002			 * Check any devices we may be in the process of adding.
1003			 */
1004			if (spa->spa_pending_vdev) {
1005				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1006				    device_guid) != NULL)
1007					break;
1008			}
1009		}
1010	}
1011
1012	return (spa != NULL);
1013}
1014
1015char *
1016spa_strdup(const char *s)
1017{
1018	size_t len;
1019	char *new;
1020
1021	len = strlen(s);
1022	new = kmem_alloc(len + 1, KM_SLEEP);
1023	bcopy(s, new, len);
1024	new[len] = '\0';
1025
1026	return (new);
1027}
1028
1029void
1030spa_strfree(char *s)
1031{
1032	kmem_free(s, strlen(s) + 1);
1033}
1034
1035uint64_t
1036spa_get_random(uint64_t range)
1037{
1038	uint64_t r;
1039
1040	ASSERT(range != 0);
1041
1042	(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
1043
1044	return (r % range);
1045}
1046
1047void
1048sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
1049{
1050	int d;
1051
1052	if (bp == NULL) {
1053		(void) snprintf(buf, len, "<NULL>");
1054		return;
1055	}
1056
1057	if (BP_IS_HOLE(bp)) {
1058		(void) snprintf(buf, len, "<hole>");
1059		return;
1060	}
1061
1062	(void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
1063	    (u_longlong_t)BP_GET_LEVEL(bp),
1064	    dmu_ot[BP_GET_TYPE(bp)].ot_name,
1065	    (u_longlong_t)BP_GET_LSIZE(bp),
1066	    (u_longlong_t)BP_GET_PSIZE(bp));
1067
1068	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
1069		const dva_t *dva = &bp->blk_dva[d];
1070		(void) snprintf(buf + strlen(buf), len - strlen(buf),
1071		    "DVA[%d]=<%llu:%llx:%llx> ", d,
1072		    (u_longlong_t)DVA_GET_VDEV(dva),
1073		    (u_longlong_t)DVA_GET_OFFSET(dva),
1074		    (u_longlong_t)DVA_GET_ASIZE(dva));
1075	}
1076
1077	(void) snprintf(buf + strlen(buf), len - strlen(buf),
1078	    "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
1079	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
1080	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
1081	    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
1082	    BP_IS_GANG(bp) ? "gang" : "contiguous",
1083	    (u_longlong_t)bp->blk_birth,
1084	    (u_longlong_t)bp->blk_fill,
1085	    (u_longlong_t)bp->blk_cksum.zc_word[0],
1086	    (u_longlong_t)bp->blk_cksum.zc_word[1],
1087	    (u_longlong_t)bp->blk_cksum.zc_word[2],
1088	    (u_longlong_t)bp->blk_cksum.zc_word[3]);
1089}
1090
1091void
1092spa_freeze(spa_t *spa)
1093{
1094	uint64_t freeze_txg = 0;
1095
1096	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1097	if (spa->spa_freeze_txg == UINT64_MAX) {
1098		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1099		spa->spa_freeze_txg = freeze_txg;
1100	}
1101	spa_config_exit(spa, SCL_ALL, FTAG);
1102	if (freeze_txg != 0)
1103		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1104}
1105
1106void
1107zfs_panic_recover(const char *fmt, ...)
1108{
1109	va_list adx;
1110
1111	va_start(adx, fmt);
1112	vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1113	va_end(adx);
1114}
1115
1116/*
1117 * ==========================================================================
1118 * Accessor functions
1119 * ==========================================================================
1120 */
1121
1122boolean_t
1123spa_shutting_down(spa_t *spa)
1124{
1125	return (spa->spa_async_suspended);
1126}
1127
1128dsl_pool_t *
1129spa_get_dsl(spa_t *spa)
1130{
1131	return (spa->spa_dsl_pool);
1132}
1133
1134blkptr_t *
1135spa_get_rootblkptr(spa_t *spa)
1136{
1137	return (&spa->spa_ubsync.ub_rootbp);
1138}
1139
1140void
1141spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1142{
1143	spa->spa_uberblock.ub_rootbp = *bp;
1144}
1145
1146void
1147spa_altroot(spa_t *spa, char *buf, size_t buflen)
1148{
1149	if (spa->spa_root == NULL)
1150		buf[0] = '\0';
1151	else
1152		(void) strncpy(buf, spa->spa_root, buflen);
1153}
1154
1155int
1156spa_sync_pass(spa_t *spa)
1157{
1158	return (spa->spa_sync_pass);
1159}
1160
1161char *
1162spa_name(spa_t *spa)
1163{
1164	return (spa->spa_name);
1165}
1166
1167uint64_t
1168spa_guid(spa_t *spa)
1169{
1170	/*
1171	 * If we fail to parse the config during spa_load(), we can go through
1172	 * the error path (which posts an ereport) and end up here with no root
1173	 * vdev.  We stash the original pool guid in 'spa_load_guid' to handle
1174	 * this case.
1175	 */
1176	if (spa->spa_root_vdev != NULL)
1177		return (spa->spa_root_vdev->vdev_guid);
1178	else
1179		return (spa->spa_load_guid);
1180}
1181
1182uint64_t
1183spa_last_synced_txg(spa_t *spa)
1184{
1185	return (spa->spa_ubsync.ub_txg);
1186}
1187
1188uint64_t
1189spa_first_txg(spa_t *spa)
1190{
1191	return (spa->spa_first_txg);
1192}
1193
1194pool_state_t
1195spa_state(spa_t *spa)
1196{
1197	return (spa->spa_state);
1198}
1199
1200uint64_t
1201spa_freeze_txg(spa_t *spa)
1202{
1203	return (spa->spa_freeze_txg);
1204}
1205
1206/*
1207 * Return how much space is allocated in the pool (i.e. sum of all asize)
1208 */
1209uint64_t
1210spa_get_alloc(spa_t *spa)
1211{
1212	return (spa->spa_root_vdev->vdev_stat.vs_alloc);
1213}
1214
1215/*
1216 * Return how much (raid-z inflated) space there is in the pool.
1217 */
1218uint64_t
1219spa_get_space(spa_t *spa)
1220{
1221	return (spa->spa_root_vdev->vdev_stat.vs_space);
1222}
1223
1224/*
1225 * Return the amount of raid-z-deflated space in the pool.
1226 */
1227uint64_t
1228spa_get_dspace(spa_t *spa)
1229{
1230	if (spa->spa_deflate)
1231		return (spa->spa_root_vdev->vdev_stat.vs_dspace);
1232	else
1233		return (spa->spa_root_vdev->vdev_stat.vs_space);
1234}
1235
1236/* ARGSUSED */
1237uint64_t
1238spa_get_asize(spa_t *spa, uint64_t lsize)
1239{
1240	/*
1241	 * For now, the worst case is 512-byte RAID-Z blocks, in which
1242	 * case the space requirement is exactly 2x; so just assume that.
1243	 * Add to this the fact that we can have up to 3 DVAs per bp, and
1244	 * we have to multiply by a total of 6x.
1245	 */
1246	return (lsize * 6);
1247}
1248
1249/*
1250 * Return the failure mode that has been set for this pool. The default
1251 * behavior will be to block all I/Os when a complete failure occurs.
1252 */
1253uint8_t
1254spa_get_failmode(spa_t *spa)
1255{
1256	return (spa->spa_failmode);
1257}
1258
1259boolean_t
1260spa_suspended(spa_t *spa)
1261{
1262	return (spa->spa_suspended);
1263}
1264
1265uint64_t
1266spa_version(spa_t *spa)
1267{
1268	return (spa->spa_ubsync.ub_version);
1269}
1270
1271int
1272spa_max_replication(spa_t *spa)
1273{
1274	/*
1275	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1276	 * handle BPs with more than one DVA allocated.  Set our max
1277	 * replication level accordingly.
1278	 */
1279	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1280		return (1);
1281	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1282}
1283
1284uint64_t
1285bp_get_dasize(spa_t *spa, const blkptr_t *bp)
1286{
1287	int sz = 0, i;
1288
1289	if (!spa->spa_deflate)
1290		return (BP_GET_ASIZE(bp));
1291
1292	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1293	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1294		vdev_t *vd =
1295		    vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
1296		if (vd)
1297			sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
1298			    SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
1299	}
1300	spa_config_exit(spa, SCL_VDEV, FTAG);
1301	return (sz);
1302}
1303
1304/*
1305 * ==========================================================================
1306 * Initialization and Termination
1307 * ==========================================================================
1308 */
1309
1310static int
1311spa_name_compare(const void *a1, const void *a2)
1312{
1313	const spa_t *s1 = a1;
1314	const spa_t *s2 = a2;
1315	int s;
1316
1317	s = strcmp(s1->spa_name, s2->spa_name);
1318	if (s > 0)
1319		return (1);
1320	if (s < 0)
1321		return (-1);
1322	return (0);
1323}
1324
1325int
1326spa_busy(void)
1327{
1328	return (spa_active_count);
1329}
1330
1331void
1332spa_boot_init()
1333{
1334	spa_config_load();
1335}
1336
1337void
1338spa_init(int mode)
1339{
1340	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
1341	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
1342	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
1343	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
1344
1345	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
1346	    offsetof(spa_t, spa_avl));
1347
1348	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
1349	    offsetof(spa_aux_t, aux_avl));
1350
1351	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
1352	    offsetof(spa_aux_t, aux_avl));
1353
1354	spa_mode = mode;
1355
1356	refcount_init();
1357	unique_init();
1358	zio_init();
1359	dmu_init();
1360	zil_init();
1361	vdev_cache_stat_init();
1362	zfs_prop_init();
1363	zpool_prop_init();
1364	spa_config_load();
1365	l2arc_start();
1366}
1367
1368void
1369spa_fini(void)
1370{
1371	l2arc_stop();
1372
1373	spa_evict_all();
1374
1375	vdev_cache_stat_fini();
1376	zil_fini();
1377	dmu_fini();
1378	zio_fini();
1379	unique_fini();
1380	refcount_fini();
1381
1382	avl_destroy(&spa_namespace_avl);
1383	avl_destroy(&spa_spare_avl);
1384	avl_destroy(&spa_l2cache_avl);
1385
1386	cv_destroy(&spa_namespace_cv);
1387	mutex_destroy(&spa_namespace_lock);
1388	mutex_destroy(&spa_spare_lock);
1389	mutex_destroy(&spa_l2cache_lock);
1390}
1391
1392/*
1393 * Return whether this pool has slogs. No locking needed.
1394 * It's not a problem if the wrong answer is returned as it's only for
1395 * performance and not correctness.
1396 */
1397boolean_t
1398spa_has_slogs(spa_t *spa)
1399{
1400	return (spa->spa_log_class->mc_rotor != NULL);
1401}
1402
1403/*
1404 * Return whether this pool is the root pool.
1405 */
1406boolean_t
1407spa_is_root(spa_t *spa)
1408{
1409	return (spa->spa_is_root);
1410}
1411