1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 * Copyright 2013 Saso Kiselkov. All rights reserved.
27 * Copyright (c) 2014 Integros [integros.com]
28 * Copyright (c) 2017 Datto Inc.
29 * Copyright (c) 2017, Intel Corporation.
30 */
31
32#include <sys/zfs_context.h>
33#include <sys/spa_impl.h>
34#include <sys/spa_boot.h>
35#include <sys/zio.h>
36#include <sys/zio_checksum.h>
37#include <sys/zio_compress.h>
38#include <sys/dmu.h>
39#include <sys/dmu_tx.h>
40#include <sys/zap.h>
41#include <sys/zil.h>
42#include <sys/vdev_impl.h>
43#include <sys/vdev_initialize.h>
44#include <sys/vdev_trim.h>
45#include <sys/metaslab.h>
46#include <sys/uberblock_impl.h>
47#include <sys/txg.h>
48#include <sys/avl.h>
49#include <sys/unique.h>
50#include <sys/dsl_pool.h>
51#include <sys/dsl_dir.h>
52#include <sys/dsl_prop.h>
53#include <sys/dsl_scan.h>
54#include <sys/fs/zfs.h>
55#include <sys/metaslab_impl.h>
56#include <sys/arc.h>
57#include <sys/ddt.h>
58#include "zfs_prop.h"
59#include <sys/zfeature.h>
60
61/*
62 * SPA locking
63 *
64 * There are four basic locks for managing spa_t structures:
65 *
66 * spa_namespace_lock (global mutex)
67 *
68 *	This lock must be acquired to do any of the following:
69 *
70 *		- Lookup a spa_t by name
71 *		- Add or remove a spa_t from the namespace
 *		- Increase spa_refcount from zero
73 *		- Check if spa_refcount is zero
74 *		- Rename a spa_t
75 *		- add/remove/attach/detach devices
76 *		- Held for the duration of create/destroy/import/export
77 *
78 *	It does not need to handle recursion.  A create or destroy may
79 *	reference objects (files or zvols) in other pools, but by
80 *	definition they must have an existing reference, and will never need
81 *	to lookup a spa_t by name.
82 *
83 * spa_refcount (per-spa zfs_refcount_t protected by mutex)
84 *
 *	This reference count keeps track of any active users of the spa_t.  The
86 *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
87 *	the refcount is never really 'zero' - opening a pool implicitly keeps
88 *	some references in the DMU.  Internally we check against spa_minref, but
89 *	present the image of a zero/non-zero value to consumers.
90 *
91 * spa_config_lock[] (per-spa array of rwlocks)
92 *
93 *	This protects the spa_t from config changes, and must be held in
94 *	the following circumstances:
95 *
96 *		- RW_READER to perform I/O to the spa
97 *		- RW_WRITER to change the vdev config
98 *
99 * The locking order is fairly straightforward:
100 *
101 *		spa_namespace_lock	->	spa_refcount
102 *
103 *	The namespace lock must be acquired to increase the refcount from 0
104 *	or to check if it is zero.
105 *
106 *		spa_refcount		->	spa_config_lock[]
107 *
108 *	There must be at least one valid reference on the spa_t to acquire
109 *	the config lock.
110 *
111 *		spa_namespace_lock	->	spa_config_lock[]
112 *
113 *	The namespace lock must always be taken before the config lock.
114 *
115 *
116 * The spa_namespace_lock can be acquired directly and is globally visible.
117 *
118 * The namespace is manipulated using the following functions, all of which
119 * require the spa_namespace_lock to be held.
120 *
121 *	spa_lookup()		Lookup a spa_t by name.
122 *
123 *	spa_add()		Create a new spa_t in the namespace.
124 *
125 *	spa_remove()		Remove a spa_t from the namespace.  This also
126 *				frees up any memory associated with the spa_t.
127 *
128 *	spa_next()		Returns the next spa_t in the system, or the
129 *				first if NULL is passed.
130 *
131 *	spa_evict_all()		Shutdown and remove all spa_t structures in
132 *				the system.
133 *
134 *	spa_guid_exists()	Determine whether a pool/device guid exists.
135 *
136 * The spa_refcount is manipulated using the following functions:
137 *
138 *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
139 *				called with spa_namespace_lock held if the
140 *				refcount is currently zero.
141 *
142 *	spa_close()		Remove a reference from the spa_t.  This will
143 *				not free the spa_t or remove it from the
144 *				namespace.  No locking is required.
145 *
146 *	spa_refcount_zero()	Returns true if the refcount is currently
147 *				zero.  Must be called with spa_namespace_lock
148 *				held.
149 *
150 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
151 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
152 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
153 *
154 * To read the configuration, it suffices to hold one of these locks as reader.
155 * To modify the configuration, you must hold all locks as writer.  To modify
156 * vdev state without altering the vdev tree's topology (e.g. online/offline),
157 * you must hold SCL_STATE and SCL_ZIO as writer.
158 *
159 * We use these distinct config locks to avoid recursive lock entry.
160 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
161 * block allocations (SCL_ALLOC), which may require reading space maps
162 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
163 *
164 * The spa config locks cannot be normal rwlocks because we need the
165 * ability to hand off ownership.  For example, SCL_ZIO is acquired
166 * by the issuing thread and later released by an interrupt thread.
167 * They do, however, obey the usual write-wanted semantics to prevent
168 * writer (i.e. system administrator) starvation.
169 *
170 * The lock acquisition rules are as follows:
171 *
172 * SCL_CONFIG
173 *	Protects changes to the vdev tree topology, such as vdev
174 *	add/remove/attach/detach.  Protects the dirty config list
175 *	(spa_config_dirty_list) and the set of spares and l2arc devices.
176 *
177 * SCL_STATE
178 *	Protects changes to pool state and vdev state, such as vdev
179 *	online/offline/fault/degrade/clear.  Protects the dirty state list
180 *	(spa_state_dirty_list) and global pool state (spa_state).
181 *
182 * SCL_ALLOC
183 *	Protects changes to metaslab groups and classes.
184 *	Held as reader by metaslab_alloc() and metaslab_claim().
185 *
186 * SCL_ZIO
187 *	Held by bp-level zios (those which have no io_vd upon entry)
188 *	to prevent changes to the vdev tree.  The bp-level zio implicitly
189 *	protects all of its vdev child zios, which do not hold SCL_ZIO.
190 *
191 * SCL_FREE
192 *	Protects changes to metaslab groups and classes.
193 *	Held as reader by metaslab_free().  SCL_FREE is distinct from
194 *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
195 *	blocks in zio_done() while another i/o that holds either
196 *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
197 *
198 * SCL_VDEV
199 *	Held as reader to prevent changes to the vdev tree during trivial
200 *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
201 *	other locks, and lower than all of them, to ensure that it's safe
202 *	to acquire regardless of caller context.
203 *
204 * In addition, the following rules apply:
205 *
206 * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
207 *	The lock ordering is SCL_CONFIG > spa_props_lock.
208 *
209 * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
210 *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 *	or zio_write_phys() -- the caller must ensure that the config cannot
 *	change in the interim, and that the vdev cannot be reopened.
213 *	SCL_STATE as reader suffices for both.
214 *
215 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
216 *
217 *	spa_vdev_enter()	Acquire the namespace lock and the config lock
218 *				for writing.
219 *
220 *	spa_vdev_exit()		Release the config lock, wait for all I/O
221 *				to complete, sync the updated configs to the
222 *				cache, and release the namespace lock.
223 *
224 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
225 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
 * locking is always based on spa_namespace_lock and spa_config_lock[].
227 */
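/*
 * For illustration only -- a minimal sketch (not actual pool code) of the
 * reader-side pattern described above, assuming the caller already holds a
 * reference on the spa_t and that my_inspect_vdevs() is a hypothetical
 * helper:
 *
 *	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 *	my_inspect_vdevs(spa->spa_root_vdev);
 *	spa_config_exit(spa, SCL_VDEV, FTAG);
 *
 * bp_get_dsize() below uses exactly this SCL_VDEV-as-reader bracket for a
 * trivial inquiry, while heavier operations take wider lock sets, up to
 * SCL_ALL as writer via spa_vdev_enter()/spa_vdev_exit().
 */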
228
229static avl_tree_t spa_namespace_avl;
230kmutex_t spa_namespace_lock;
231static kcondvar_t spa_namespace_cv;
232static int spa_active_count;
233int spa_max_replication_override = SPA_DVAS_PER_BP;
234
235static kmutex_t spa_spare_lock;
236static avl_tree_t spa_spare_avl;
237static kmutex_t spa_l2cache_lock;
238static avl_tree_t spa_l2cache_avl;
239
240kmem_cache_t *spa_buffer_pool;
241int spa_mode_global;
242
243#ifdef ZFS_DEBUG
244/*
245 * Everything except dprintf, spa, and indirect_remap is on by default
246 * in debug builds.
247 */
248int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP);
249#else
250int zfs_flags = 0;
251#endif
252
253/*
254 * zfs_recover can be set to nonzero to attempt to recover from
255 * otherwise-fatal errors, typically caused by on-disk corruption.  When
256 * set, calls to zfs_panic_recover() will turn into warning messages.
257 * This should only be used as a last resort, as it typically results
258 * in leaked space, or worse.
259 */
260boolean_t zfs_recover = B_FALSE;
261
262/*
263 * If destroy encounters an EIO while reading metadata (e.g. indirect
264 * blocks), space referenced by the missing metadata can not be freed.
265 * Normally this causes the background destroy to become "stalled", as
266 * it is unable to make forward progress.  While in this stalled state,
267 * all remaining space to free from the error-encountering filesystem is
268 * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
269 * permanently leak the space from indirect blocks that can not be read,
270 * and continue to free everything else that it can.
271 *
272 * The default, "stalling" behavior is useful if the storage partially
273 * fails (i.e. some but not all i/os fail), and then later recovers.  In
274 * this case, we will be able to continue pool operations while it is
275 * partially failed, and when it recovers, we can continue to free the
276 * space, with no leaks.  However, note that this case is actually
277 * fairly rare.
278 *
279 * Typically pools either (a) fail completely (but perhaps temporarily,
280 * e.g. a top-level vdev going offline), or (b) have localized,
281 * permanent errors (e.g. disk returns the wrong data due to bit flip or
282 * firmware bug).  In case (a), this setting does not matter because the
283 * pool will be suspended and the sync thread will not be able to make
284 * forward progress regardless.  In case (b), because the error is
285 * permanent, the best we can do is leak the minimum amount of space,
286 * which is what setting this flag will do.  Therefore, it is reasonable
287 * for this flag to normally be set, but we chose the more conservative
288 * approach of not setting it, so that there is no possibility of
289 * leaking space in the "partial temporary" failure case.
290 */
291boolean_t zfs_free_leak_on_eio = B_FALSE;
292
293/*
 * Expiration time in milliseconds. This value has two meanings. First, it
 * determines when the spa_deadman() logic should fire: by default,
 * spa_deadman() fires if spa_sync() has not completed in 1000 seconds.
 * Second, it determines whether an I/O is considered "hung". Any I/O that
 * has not completed within zfs_deadman_synctime_ms is considered "hung",
 * resulting in a system panic.
300 */
301uint64_t zfs_deadman_synctime_ms = 1000000ULL;
302
303/*
304 * Check time in milliseconds. This defines the frequency at which we check
305 * for hung I/O.
306 */
307uint64_t zfs_deadman_checktime_ms = 5000ULL;
308
309/*
310 * Override the zfs deadman behavior via /etc/system. By default the
311 * deadman is enabled except on VMware and sparc deployments.
312 */
313int zfs_deadman_enabled = -1;
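
/*
 * A hedged example of how these deadman tunables are typically overridden
 * (the values are illustrative, not recommendations): /etc/system entries
 * such as
 *
 *	set zfs:zfs_deadman_enabled = 1
 *	set zfs:zfs_deadman_synctime_ms = 600000
 *
 * would force the deadman on and treat spa_sync() as hung after 600 seconds.
 */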
314
315/*
316 * The worst case is single-sector max-parity RAID-Z blocks, in which
317 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
318 * times the size; so just assume that.  Add to this the fact that
319 * we can have up to 3 DVAs per bp, and one more factor of 2 because
320 * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
321 * the worst case is:
322 *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
323 */
324int spa_asize_inflation = 24;
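
/*
 * As a worked example of the bound above: spa_get_worst_case_asize() simply
 * multiplies the logical size by spa_asize_inflation, so a 128KB logical
 * write may reserve up to 128KB * 24 = 3MB of allocatable space.
 */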
325
326/*
327 * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
328 * the pool to be consumed.  This ensures that we don't run the pool
329 * completely out of space, due to unaccounted changes (e.g. to the MOS).
330 * It also limits the worst-case time to allocate space.  If we have
331 * less than this amount of free space, most ZPL operations (e.g. write,
332 * create) will return ENOSPC.
333 *
334 * Certain operations (e.g. file removal, most administrative actions) can
335 * use half the slop space.  They will only return ENOSPC if less than half
336 * the slop space is free.  Typically, once the pool has less than the slop
337 * space free, the user will use these operations to free up space in the pool.
338 * These are the operations that call dsl_pool_adjustedsize() with the netfree
339 * argument set to TRUE.
340 *
341 * Operations that are almost guaranteed to free up space in the absence of
342 * a pool checkpoint can use up to three quarters of the slop space
 * (e.g. zfs destroy).
344 *
345 * A very restricted set of operations are always permitted, regardless of
346 * the amount of free space.  These are the operations that call
347 * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
348 * increase in the amount of space used, it is possible to run the pool
349 * completely out of space, causing it to be permanently read-only.
350 *
351 * Note that on very small pools, the slop space will be larger than
352 * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
353 * but we never allow it to be more than half the pool size.
354 *
355 * See also the comments in zfs_space_check_t.
356 */
357int spa_slop_shift = 5;
358uint64_t spa_min_slop = 128 * 1024 * 1024;
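
/*
 * Worked examples of the calculation in spa_get_slop_space() below, using
 * the defaults above: a 10TB pool reserves
 * MAX(10TB >> 5, MIN(5TB, 128MB)) = 320GB of slop, while a 1GB pool reserves
 * MAX(32MB, MIN(512MB, 128MB)) = 128MB.  Small pools are thus dominated by
 * the spa_min_slop floor and large pools by the 1/32 fraction.
 */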
359
360int spa_allocators = 4;
361
362/*PRINTFLIKE2*/
363void
364spa_load_failed(spa_t *spa, const char *fmt, ...)
365{
366	va_list adx;
367	char buf[256];
368
369	va_start(adx, fmt);
370	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
371	va_end(adx);
372
373	zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
374	    spa->spa_trust_config ? "trusted" : "untrusted", buf);
375}
376
377/*PRINTFLIKE2*/
378void
379spa_load_note(spa_t *spa, const char *fmt, ...)
380{
381	va_list adx;
382	char buf[256];
383
384	va_start(adx, fmt);
385	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
386	va_end(adx);
387
388	zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
389	    spa->spa_trust_config ? "trusted" : "untrusted", buf);
390}
391
392/*
393 * By default dedup and user data indirects land in the special class
394 */
395int zfs_ddt_data_is_special = B_TRUE;
396int zfs_user_indirect_is_special = B_TRUE;
397
398/*
 * The percentage of the special class's final space that is reserved for
 * metadata only.  Once 100 - zfs_special_class_metadata_reserve_pct percent
 * of the class has been allocated, only metadata is allowed into the class.
402 */
403int zfs_special_class_metadata_reserve_pct = 25;
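
/*
 * For example, with the default of 25 and a 1TB special class,
 * spa_preferred_class() stops admitting eligible small file blocks once
 * roughly 750GB (75% of the class) has been allocated, keeping the
 * remaining 25% available for metadata.
 */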
404
405/*
406 * ==========================================================================
407 * SPA config locking
408 * ==========================================================================
409 */
410static void
411spa_config_lock_init(spa_t *spa)
412{
413	for (int i = 0; i < SCL_LOCKS; i++) {
414		spa_config_lock_t *scl = &spa->spa_config_lock[i];
415		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
416		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
417		zfs_refcount_create_untracked(&scl->scl_count);
418		scl->scl_writer = NULL;
419		scl->scl_write_wanted = 0;
420	}
421}
422
423static void
424spa_config_lock_destroy(spa_t *spa)
425{
426	for (int i = 0; i < SCL_LOCKS; i++) {
427		spa_config_lock_t *scl = &spa->spa_config_lock[i];
428		mutex_destroy(&scl->scl_lock);
429		cv_destroy(&scl->scl_cv);
430		zfs_refcount_destroy(&scl->scl_count);
431		ASSERT(scl->scl_writer == NULL);
432		ASSERT(scl->scl_write_wanted == 0);
433	}
434}
435
436int
437spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
438{
439	for (int i = 0; i < SCL_LOCKS; i++) {
440		spa_config_lock_t *scl = &spa->spa_config_lock[i];
441		if (!(locks & (1 << i)))
442			continue;
443		mutex_enter(&scl->scl_lock);
444		if (rw == RW_READER) {
445			if (scl->scl_writer || scl->scl_write_wanted) {
446				mutex_exit(&scl->scl_lock);
447				spa_config_exit(spa, locks & ((1 << i) - 1),
448				    tag);
449				return (0);
450			}
451		} else {
452			ASSERT(scl->scl_writer != curthread);
453			if (!zfs_refcount_is_zero(&scl->scl_count)) {
454				mutex_exit(&scl->scl_lock);
455				spa_config_exit(spa, locks & ((1 << i) - 1),
456				    tag);
457				return (0);
458			}
459			scl->scl_writer = curthread;
460		}
461		(void) zfs_refcount_add(&scl->scl_count, tag);
462		mutex_exit(&scl->scl_lock);
463	}
464	return (1);
465}
466
467void
468spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
469{
470	int wlocks_held = 0;
471
472	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
473
474	for (int i = 0; i < SCL_LOCKS; i++) {
475		spa_config_lock_t *scl = &spa->spa_config_lock[i];
476		if (scl->scl_writer == curthread)
477			wlocks_held |= (1 << i);
478		if (!(locks & (1 << i)))
479			continue;
480		mutex_enter(&scl->scl_lock);
481		if (rw == RW_READER) {
482			while (scl->scl_writer || scl->scl_write_wanted) {
483				cv_wait(&scl->scl_cv, &scl->scl_lock);
484			}
485		} else {
486			ASSERT(scl->scl_writer != curthread);
487			while (!zfs_refcount_is_zero(&scl->scl_count)) {
488				scl->scl_write_wanted++;
489				cv_wait(&scl->scl_cv, &scl->scl_lock);
490				scl->scl_write_wanted--;
491			}
492			scl->scl_writer = curthread;
493		}
494		(void) zfs_refcount_add(&scl->scl_count, tag);
495		mutex_exit(&scl->scl_lock);
496	}
497	ASSERT3U(wlocks_held, <=, locks);
498}
499
500void
501spa_config_exit(spa_t *spa, int locks, void *tag)
502{
503	for (int i = SCL_LOCKS - 1; i >= 0; i--) {
504		spa_config_lock_t *scl = &spa->spa_config_lock[i];
505		if (!(locks & (1 << i)))
506			continue;
507		mutex_enter(&scl->scl_lock);
508		ASSERT(!zfs_refcount_is_zero(&scl->scl_count));
509		if (zfs_refcount_remove(&scl->scl_count, tag) == 0) {
510			ASSERT(scl->scl_writer == NULL ||
511			    scl->scl_writer == curthread);
512			scl->scl_writer = NULL;	/* OK in either case */
513			cv_broadcast(&scl->scl_cv);
514		}
515		mutex_exit(&scl->scl_lock);
516	}
517}
518
519int
520spa_config_held(spa_t *spa, int locks, krw_t rw)
521{
522	int locks_held = 0;
523
524	for (int i = 0; i < SCL_LOCKS; i++) {
525		spa_config_lock_t *scl = &spa->spa_config_lock[i];
526		if (!(locks & (1 << i)))
527			continue;
528		if ((rw == RW_READER &&
529		    !zfs_refcount_is_zero(&scl->scl_count)) ||
530		    (rw == RW_WRITER && scl->scl_writer == curthread))
531			locks_held |= 1 << i;
532	}
533
534	return (locks_held);
535}
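
/*
 * Note that spa_config_held() only reports whether the given locks are held
 * in the requested mode; within this file it is used for verification, e.g.
 * dva_get_dsize_sync() below asserts
 * spa_config_held(spa, SCL_ALL, RW_READER) != 0 before looking up the DVA's
 * top-level vdev.
 */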
536
537/*
538 * ==========================================================================
539 * SPA namespace functions
540 * ==========================================================================
541 */
542
543/*
544 * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
545 * Returns NULL if no matching spa_t is found.
546 */
547spa_t *
548spa_lookup(const char *name)
549{
550	static spa_t search;	/* spa_t is large; don't allocate on stack */
551	spa_t *spa;
552	avl_index_t where;
553	char *cp;
554
555	ASSERT(MUTEX_HELD(&spa_namespace_lock));
556
557	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
558
559	/*
560	 * If it's a full dataset name, figure out the pool name and
561	 * just use that.
562	 */
563	cp = strpbrk(search.spa_name, "/@#");
564	if (cp != NULL)
565		*cp = '\0';
566
567	spa = avl_find(&spa_namespace_avl, &search, &where);
568
569	return (spa);
570}
571
572/*
573 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
574 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
575 * looking for potentially hung I/Os.
576 */
577void
578spa_deadman(void *arg)
579{
580	spa_t *spa = arg;
581
582	/*
583	 * Disable the deadman timer if the pool is suspended.
584	 */
585	if (spa_suspended(spa)) {
586		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
587		return;
588	}
589
590	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
591	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
592	    ++spa->spa_deadman_calls);
593	if (zfs_deadman_enabled)
594		vdev_deadman(spa->spa_root_vdev);
595}
596
597/*
598 * Create an uninitialized spa_t with the given name.  Requires
599 * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
600 * exist by calling spa_lookup() first.
601 */
602spa_t *
603spa_add(const char *name, nvlist_t *config, const char *altroot)
604{
605	spa_t *spa;
606	spa_config_dirent_t *dp;
607	cyc_handler_t hdlr;
608	cyc_time_t when;
609
610	ASSERT(MUTEX_HELD(&spa_namespace_lock));
611
612	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
613
614	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
615	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
616	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
617	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
618	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
619	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
620	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
621	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
622	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
623	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
624	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
625	mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
626
627	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
628	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
629	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
630	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
631	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
632
633	for (int t = 0; t < TXG_SIZE; t++)
634		bplist_create(&spa->spa_free_bplist[t]);
635
636	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
637	spa->spa_state = POOL_STATE_UNINITIALIZED;
638	spa->spa_freeze_txg = UINT64_MAX;
639	spa->spa_final_txg = UINT64_MAX;
640	spa->spa_load_max_txg = UINT64_MAX;
641	spa->spa_proc = &p0;
642	spa->spa_proc_state = SPA_PROC_NONE;
643	spa->spa_trust_config = B_TRUE;
644
645	hdlr.cyh_func = spa_deadman;
646	hdlr.cyh_arg = spa;
647	hdlr.cyh_level = CY_LOW_LEVEL;
648
649	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
650
651	/*
652	 * This determines how often we need to check for hung I/Os after
653	 * the cyclic has already fired. Since checking for hung I/Os is
654	 * an expensive operation we don't want to check too frequently.
655	 * Instead wait for 5 seconds before checking again.
656	 */
657	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
658	when.cyt_when = CY_INFINITY;
659	mutex_enter(&cpu_lock);
660	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
661	mutex_exit(&cpu_lock);
662
663	zfs_refcount_create(&spa->spa_refcount);
664	spa_config_lock_init(spa);
665
666	avl_add(&spa_namespace_avl, spa);
667
668	/*
669	 * Set the alternate root, if there is one.
670	 */
671	if (altroot) {
672		spa->spa_root = spa_strdup(altroot);
673		spa_active_count++;
674	}
675
676	spa->spa_alloc_count = spa_allocators;
677	spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
678	    sizeof (kmutex_t), KM_SLEEP);
679	spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
680	    sizeof (avl_tree_t), KM_SLEEP);
681	for (int i = 0; i < spa->spa_alloc_count; i++) {
682		mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
683		avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
684		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
685	}
686
687	/*
688	 * Every pool starts with the default cachefile
689	 */
690	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
691	    offsetof(spa_config_dirent_t, scd_link));
692
693	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
694	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
695	list_insert_head(&spa->spa_config_list, dp);
696
697	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
698	    KM_SLEEP) == 0);
699
700	if (config != NULL) {
701		nvlist_t *features;
702
703		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
704		    &features) == 0) {
705			VERIFY(nvlist_dup(features, &spa->spa_label_features,
706			    0) == 0);
707		}
708
709		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
710	}
711
712	if (spa->spa_label_features == NULL) {
713		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
714		    KM_SLEEP) == 0);
715	}
716
717	spa->spa_iokstat = kstat_create("zfs", 0, name,
718	    "disk", KSTAT_TYPE_IO, 1, 0);
719	if (spa->spa_iokstat) {
720		spa->spa_iokstat->ks_lock = &spa->spa_iokstat_lock;
721		kstat_install(spa->spa_iokstat);
722	}
723
724	spa->spa_min_ashift = INT_MAX;
725	spa->spa_max_ashift = 0;
726
727	/*
728	 * As a pool is being created, treat all features as disabled by
729	 * setting SPA_FEATURE_DISABLED for all entries in the feature
730	 * refcount cache.
731	 */
732	for (int i = 0; i < SPA_FEATURES; i++) {
733		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
734	}
735
736	list_create(&spa->spa_leaf_list, sizeof (vdev_t),
737	    offsetof(vdev_t, vdev_leaf_node));
738
739	return (spa);
740}
741
742/*
743 * Removes a spa_t from the namespace, freeing up any memory used.  Requires
744 * spa_namespace_lock.  This is called only after the spa_t has been closed and
745 * deactivated.
746 */
747void
748spa_remove(spa_t *spa)
749{
750	spa_config_dirent_t *dp;
751
752	ASSERT(MUTEX_HELD(&spa_namespace_lock));
753	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
754	ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0);
755
756	nvlist_free(spa->spa_config_splitting);
757
758	avl_remove(&spa_namespace_avl, spa);
759	cv_broadcast(&spa_namespace_cv);
760
761	if (spa->spa_root) {
762		spa_strfree(spa->spa_root);
763		spa_active_count--;
764	}
765
766	while ((dp = list_head(&spa->spa_config_list)) != NULL) {
767		list_remove(&spa->spa_config_list, dp);
768		if (dp->scd_path != NULL)
769			spa_strfree(dp->scd_path);
770		kmem_free(dp, sizeof (spa_config_dirent_t));
771	}
772
773	for (int i = 0; i < spa->spa_alloc_count; i++) {
774		avl_destroy(&spa->spa_alloc_trees[i]);
775		mutex_destroy(&spa->spa_alloc_locks[i]);
776	}
777	kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
778	    sizeof (kmutex_t));
779	kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
780	    sizeof (avl_tree_t));
781
782	list_destroy(&spa->spa_config_list);
783	list_destroy(&spa->spa_leaf_list);
784
785	nvlist_free(spa->spa_label_features);
786	nvlist_free(spa->spa_load_info);
787	spa_config_set(spa, NULL);
788
789	mutex_enter(&cpu_lock);
790	if (spa->spa_deadman_cycid != CYCLIC_NONE)
791		cyclic_remove(spa->spa_deadman_cycid);
792	mutex_exit(&cpu_lock);
793	spa->spa_deadman_cycid = CYCLIC_NONE;
794
795	zfs_refcount_destroy(&spa->spa_refcount);
796
797	spa_config_lock_destroy(spa);
798
799	kstat_delete(spa->spa_iokstat);
800	spa->spa_iokstat = NULL;
801
802	for (int t = 0; t < TXG_SIZE; t++)
803		bplist_destroy(&spa->spa_free_bplist[t]);
804
805	zio_checksum_templates_free(spa);
806
807	cv_destroy(&spa->spa_async_cv);
808	cv_destroy(&spa->spa_evicting_os_cv);
809	cv_destroy(&spa->spa_proc_cv);
810	cv_destroy(&spa->spa_scrub_io_cv);
811	cv_destroy(&spa->spa_suspend_cv);
812
813	mutex_destroy(&spa->spa_async_lock);
814	mutex_destroy(&spa->spa_errlist_lock);
815	mutex_destroy(&spa->spa_errlog_lock);
816	mutex_destroy(&spa->spa_evicting_os_lock);
817	mutex_destroy(&spa->spa_history_lock);
818	mutex_destroy(&spa->spa_proc_lock);
819	mutex_destroy(&spa->spa_props_lock);
820	mutex_destroy(&spa->spa_cksum_tmpls_lock);
821	mutex_destroy(&spa->spa_scrub_lock);
822	mutex_destroy(&spa->spa_suspend_lock);
823	mutex_destroy(&spa->spa_vdev_top_lock);
824	mutex_destroy(&spa->spa_iokstat_lock);
825
826	kmem_free(spa, sizeof (spa_t));
827}
828
829/*
830 * Given a pool, return the next pool in the namespace, or NULL if there is
831 * none.  If 'prev' is NULL, return the first pool.
832 */
833spa_t *
834spa_next(spa_t *prev)
835{
836	ASSERT(MUTEX_HELD(&spa_namespace_lock));
837
838	if (prev)
839		return (AVL_NEXT(&spa_namespace_avl, prev));
840	else
841		return (avl_first(&spa_namespace_avl));
842}
843
844/*
845 * ==========================================================================
846 * SPA refcount functions
847 * ==========================================================================
848 */
849
850/*
851 * Add a reference to the given spa_t.  Must have at least one reference, or
852 * have the namespace lock held.
853 */
854void
855spa_open_ref(spa_t *spa, void *tag)
856{
857	ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
858	    MUTEX_HELD(&spa_namespace_lock));
859	(void) zfs_refcount_add(&spa->spa_refcount, tag);
860}
861
862/*
863 * Remove a reference to the given spa_t.  Must have at least one reference, or
864 * have the namespace lock held.
865 */
866void
867spa_close(spa_t *spa, void *tag)
868{
869	ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
870	    MUTEX_HELD(&spa_namespace_lock));
871	(void) zfs_refcount_remove(&spa->spa_refcount, tag);
872}
873
874/*
875 * Remove a reference to the given spa_t held by a dsl dir that is
876 * being asynchronously released.  Async releases occur from a taskq
877 * performing eviction of dsl datasets and dirs.  The namespace lock
878 * isn't held and the hold by the object being evicted may contribute to
879 * spa_minref (e.g. dataset or directory released during pool export),
880 * so the asserts in spa_close() do not apply.
881 */
882void
883spa_async_close(spa_t *spa, void *tag)
884{
885	(void) zfs_refcount_remove(&spa->spa_refcount, tag);
886}
887
888/*
889 * Check to see if the spa refcount is zero.  Must be called with
890 * spa_namespace_lock held.  We really compare against spa_minref, which is the
891 * number of references acquired when opening a pool
892 */
893boolean_t
894spa_refcount_zero(spa_t *spa)
895{
896	ASSERT(MUTEX_HELD(&spa_namespace_lock));
897
898	return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
899}
900
901/*
902 * ==========================================================================
903 * SPA spare and l2cache tracking
904 * ==========================================================================
905 */
906
907/*
908 * Hot spares and cache devices are tracked using the same code below,
909 * for 'auxiliary' devices.
910 */
911
912typedef struct spa_aux {
913	uint64_t	aux_guid;
914	uint64_t	aux_pool;
915	avl_node_t	aux_avl;
916	int		aux_count;
917} spa_aux_t;
918
919static inline int
920spa_aux_compare(const void *a, const void *b)
921{
922	const spa_aux_t *sa = (const spa_aux_t *)a;
923	const spa_aux_t *sb = (const spa_aux_t *)b;
924
925	return (AVL_CMP(sa->aux_guid, sb->aux_guid));
926}
927
928void
929spa_aux_add(vdev_t *vd, avl_tree_t *avl)
930{
931	avl_index_t where;
932	spa_aux_t search;
933	spa_aux_t *aux;
934
935	search.aux_guid = vd->vdev_guid;
936	if ((aux = avl_find(avl, &search, &where)) != NULL) {
937		aux->aux_count++;
938	} else {
939		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
940		aux->aux_guid = vd->vdev_guid;
941		aux->aux_count = 1;
942		avl_insert(avl, aux, where);
943	}
944}
945
946void
947spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
948{
949	spa_aux_t search;
950	spa_aux_t *aux;
951	avl_index_t where;
952
953	search.aux_guid = vd->vdev_guid;
954	aux = avl_find(avl, &search, &where);
955
956	ASSERT(aux != NULL);
957
958	if (--aux->aux_count == 0) {
959		avl_remove(avl, aux);
960		kmem_free(aux, sizeof (spa_aux_t));
961	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
962		aux->aux_pool = 0ULL;
963	}
964}
965
966boolean_t
967spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
968{
969	spa_aux_t search, *found;
970
971	search.aux_guid = guid;
972	found = avl_find(avl, &search, NULL);
973
974	if (pool) {
975		if (found)
976			*pool = found->aux_pool;
977		else
978			*pool = 0ULL;
979	}
980
981	if (refcnt) {
982		if (found)
983			*refcnt = found->aux_count;
984		else
985			*refcnt = 0;
986	}
987
988	return (found != NULL);
989}
990
991void
992spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
993{
994	spa_aux_t search, *found;
995	avl_index_t where;
996
997	search.aux_guid = vd->vdev_guid;
998	found = avl_find(avl, &search, &where);
999	ASSERT(found != NULL);
1000	ASSERT(found->aux_pool == 0ULL);
1001
1002	found->aux_pool = spa_guid(vd->vdev_spa);
1003}
1004
1005/*
1006 * Spares are tracked globally due to the following constraints:
1007 *
1008 *	- A spare may be part of multiple pools.
1009 *	- A spare may be added to a pool even if it's actively in use within
1010 *	  another pool.
1011 *	- A spare in use in any pool can only be the source of a replacement if
1012 *	  the target is a spare in the same pool.
1013 *
1014 * We keep track of all spares on the system through the use of a reference
1015 * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
 * spare, we bump the reference count in the AVL tree.  In addition, we set
1017 * the 'vdev_isspare' member to indicate that the device is a spare (active or
1018 * inactive).  When a spare is made active (used to replace a device in the
 * pool), we also keep track of which pool it has been made a part of.
1020 *
1021 * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
1022 * called under the spa_namespace lock as part of vdev reconfiguration.  The
1023 * separate spare lock exists for the status query path, which does not need to
1024 * be completely consistent with respect to other vdev configuration changes.
1025 */
1026
1027static int
1028spa_spare_compare(const void *a, const void *b)
1029{
1030	return (spa_aux_compare(a, b));
1031}
1032
1033void
1034spa_spare_add(vdev_t *vd)
1035{
1036	mutex_enter(&spa_spare_lock);
1037	ASSERT(!vd->vdev_isspare);
1038	spa_aux_add(vd, &spa_spare_avl);
1039	vd->vdev_isspare = B_TRUE;
1040	mutex_exit(&spa_spare_lock);
1041}
1042
1043void
1044spa_spare_remove(vdev_t *vd)
1045{
1046	mutex_enter(&spa_spare_lock);
1047	ASSERT(vd->vdev_isspare);
1048	spa_aux_remove(vd, &spa_spare_avl);
1049	vd->vdev_isspare = B_FALSE;
1050	mutex_exit(&spa_spare_lock);
1051}
1052
1053boolean_t
1054spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
1055{
1056	boolean_t found;
1057
1058	mutex_enter(&spa_spare_lock);
1059	found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
1060	mutex_exit(&spa_spare_lock);
1061
1062	return (found);
1063}
1064
1065void
1066spa_spare_activate(vdev_t *vd)
1067{
1068	mutex_enter(&spa_spare_lock);
1069	ASSERT(vd->vdev_isspare);
1070	spa_aux_activate(vd, &spa_spare_avl);
1071	mutex_exit(&spa_spare_lock);
1072}
1073
1074/*
1075 * Level 2 ARC devices are tracked globally for the same reasons as spares.
 * Cache devices currently only support one pool per cache device, and so
 * for these devices the aux reference count is unused beyond 1.
1078 */
1079
1080static int
1081spa_l2cache_compare(const void *a, const void *b)
1082{
1083	return (spa_aux_compare(a, b));
1084}
1085
1086void
1087spa_l2cache_add(vdev_t *vd)
1088{
1089	mutex_enter(&spa_l2cache_lock);
1090	ASSERT(!vd->vdev_isl2cache);
1091	spa_aux_add(vd, &spa_l2cache_avl);
1092	vd->vdev_isl2cache = B_TRUE;
1093	mutex_exit(&spa_l2cache_lock);
1094}
1095
1096void
1097spa_l2cache_remove(vdev_t *vd)
1098{
1099	mutex_enter(&spa_l2cache_lock);
1100	ASSERT(vd->vdev_isl2cache);
1101	spa_aux_remove(vd, &spa_l2cache_avl);
1102	vd->vdev_isl2cache = B_FALSE;
1103	mutex_exit(&spa_l2cache_lock);
1104}
1105
1106boolean_t
1107spa_l2cache_exists(uint64_t guid, uint64_t *pool)
1108{
1109	boolean_t found;
1110
1111	mutex_enter(&spa_l2cache_lock);
1112	found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
1113	mutex_exit(&spa_l2cache_lock);
1114
1115	return (found);
1116}
1117
1118void
1119spa_l2cache_activate(vdev_t *vd)
1120{
1121	mutex_enter(&spa_l2cache_lock);
1122	ASSERT(vd->vdev_isl2cache);
1123	spa_aux_activate(vd, &spa_l2cache_avl);
1124	mutex_exit(&spa_l2cache_lock);
1125}
1126
1127/*
1128 * ==========================================================================
1129 * SPA vdev locking
1130 * ==========================================================================
1131 */
1132
1133/*
1134 * Lock the given spa_t for the purpose of adding or removing a vdev.
1135 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
1136 * It returns the next transaction group for the spa_t.
1137 */
1138uint64_t
1139spa_vdev_enter(spa_t *spa)
1140{
1141	mutex_enter(&spa->spa_vdev_top_lock);
1142	mutex_enter(&spa_namespace_lock);
1143
1144	vdev_autotrim_stop_all(spa);
1145
1146	return (spa_vdev_config_enter(spa));
1147}
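
/*
 * A sketch of the canonical caller pattern (e.g. spa_vdev_add() in spa.c):
 * a vdev configuration change brackets its work as
 *
 *	uint64_t txg = spa_vdev_enter(spa);
 *	... modify the vdev tree ...
 *	return (spa_vdev_exit(spa, vd, txg, error));
 *
 * so the updated config is synced out and the namespace lock is dropped
 * exactly once per operation.
 */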
1148
1149/*
1150 * Internal implementation for spa_vdev_enter().  Used when a vdev
1151 * operation requires multiple syncs (i.e. removing a device) while
1152 * keeping the spa_namespace_lock held.
1153 */
1154uint64_t
1155spa_vdev_config_enter(spa_t *spa)
1156{
1157	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1158
1159	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1160
1161	return (spa_last_synced_txg(spa) + 1);
1162}
1163
1164/*
1165 * Used in combination with spa_vdev_config_enter() to allow the syncing
1166 * of multiple transactions without releasing the spa_namespace_lock.
1167 */
1168void
1169spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
1170{
1171	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1172
1173	int config_changed = B_FALSE;
1174
1175	ASSERT(txg > spa_last_synced_txg(spa));
1176
1177	spa->spa_pending_vdev = NULL;
1178
1179	/*
1180	 * Reassess the DTLs.
1181	 */
1182	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
1183
1184	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
1185		config_changed = B_TRUE;
1186		spa->spa_config_generation++;
1187	}
1188
1189	/*
1190	 * Verify the metaslab classes.
1191	 */
1192	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
1193	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
1194	ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
1195	ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
1196
1197	spa_config_exit(spa, SCL_ALL, spa);
1198
1199	/*
1200	 * Panic the system if the specified tag requires it.  This
1201	 * is useful for ensuring that configurations are updated
1202	 * transactionally.
1203	 */
1204	if (zio_injection_enabled)
1205		zio_handle_panic_injection(spa, tag, 0);
1206
1207	/*
1208	 * Note: this txg_wait_synced() is important because it ensures
1209	 * that there won't be more than one config change per txg.
1210	 * This allows us to use the txg as the generation number.
1211	 */
1212	if (error == 0)
1213		txg_wait_synced(spa->spa_dsl_pool, txg);
1214
1215	if (vd != NULL) {
1216		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
1217		if (vd->vdev_ops->vdev_op_leaf) {
1218			mutex_enter(&vd->vdev_initialize_lock);
1219			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
1220			    NULL);
1221			mutex_exit(&vd->vdev_initialize_lock);
1222
1223			mutex_enter(&vd->vdev_trim_lock);
1224			vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
1225			mutex_exit(&vd->vdev_trim_lock);
1226		}
1227
1228		/*
1229		 * The vdev may be both a leaf and top-level device.
1230		 */
1231		vdev_autotrim_stop_wait(vd);
1232
1233		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1234		vdev_free(vd);
1235		spa_config_exit(spa, SCL_ALL, spa);
1236	}
1237
1238	/*
1239	 * If the config changed, update the config cache.
1240	 */
1241	if (config_changed)
1242		spa_write_cachefile(spa, B_FALSE, B_TRUE);
1243}
1244
1245/*
1246 * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
 * locking of spa_vdev_enter(), we also want to make sure the transactions have
1248 * synced to disk, and then update the global configuration cache with the new
1249 * information.
1250 */
1251int
1252spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
1253{
1254	vdev_autotrim_restart(spa);
1255
1256	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
1257	mutex_exit(&spa_namespace_lock);
1258	mutex_exit(&spa->spa_vdev_top_lock);
1259
1260	return (error);
1261}
1262
1263/*
1264 * Lock the given spa_t for the purpose of changing vdev state.
1265 */
1266void
1267spa_vdev_state_enter(spa_t *spa, int oplocks)
1268{
1269	int locks = SCL_STATE_ALL | oplocks;
1270
1271	/*
	 * Root pools may need to read from the underlying devfs filesystem
1273	 * when opening up a vdev.  Unfortunately if we're holding the
1274	 * SCL_ZIO lock it will result in a deadlock when we try to issue
1275	 * the read from the root filesystem.  Instead we "prefetch"
1276	 * the associated vnodes that we need prior to opening the
1277	 * underlying devices and cache them so that we can prevent
1278	 * any I/O when we are doing the actual open.
1279	 */
1280	if (spa_is_root(spa)) {
1281		int low = locks & ~(SCL_ZIO - 1);
1282		int high = locks & ~low;
1283
1284		spa_config_enter(spa, high, spa, RW_WRITER);
1285		vdev_hold(spa->spa_root_vdev);
1286		spa_config_enter(spa, low, spa, RW_WRITER);
1287	} else {
1288		spa_config_enter(spa, locks, spa, RW_WRITER);
1289	}
1290	spa->spa_vdev_locks = locks;
1291}
1292
1293int
1294spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
1295{
1296	boolean_t config_changed = B_FALSE;
1297
1298	if (vd != NULL || error == 0)
1299		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
1300		    0, 0, B_FALSE);
1301
1302	if (vd != NULL) {
1303		vdev_state_dirty(vd->vdev_top);
1304		config_changed = B_TRUE;
1305		spa->spa_config_generation++;
1306	}
1307
1308	if (spa_is_root(spa))
1309		vdev_rele(spa->spa_root_vdev);
1310
1311	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1312	spa_config_exit(spa, spa->spa_vdev_locks, spa);
1313
1314	/*
1315	 * If anything changed, wait for it to sync.  This ensures that,
1316	 * from the system administrator's perspective, zpool(1M) commands
1317	 * are synchronous.  This is important for things like zpool offline:
1318	 * when the command completes, you expect no further I/O from ZFS.
1319	 */
1320	if (vd != NULL)
1321		txg_wait_synced(spa->spa_dsl_pool, 0);
1322
1323	/*
1324	 * If the config changed, update the config cache.
1325	 */
1326	if (config_changed) {
1327		mutex_enter(&spa_namespace_lock);
1328		spa_write_cachefile(spa, B_FALSE, B_TRUE);
1329		mutex_exit(&spa_namespace_lock);
1330	}
1331
1332	return (error);
1333}
1334
1335/*
1336 * ==========================================================================
1337 * Miscellaneous functions
1338 * ==========================================================================
1339 */
1340
1341void
1342spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
1343{
1344	if (!nvlist_exists(spa->spa_label_features, feature)) {
1345		fnvlist_add_boolean(spa->spa_label_features, feature);
1346		/*
1347		 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
		 * dirty the vdev config because the SCL_CONFIG lock is not held.
1349		 * Thankfully, in this case we don't need to dirty the config
1350		 * because it will be written out anyway when we finish
1351		 * creating the pool.
1352		 */
1353		if (tx->tx_txg != TXG_INITIAL)
1354			vdev_config_dirty(spa->spa_root_vdev);
1355	}
1356}
1357
1358void
1359spa_deactivate_mos_feature(spa_t *spa, const char *feature)
1360{
1361	if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
1362		vdev_config_dirty(spa->spa_root_vdev);
1363}
1364
1365/*
1366 * Return the spa_t associated with given pool_guid, if it exists.  If
1367 * device_guid is non-zero, determine whether the pool exists *and* contains
1368 * a device with the specified device_guid.
1369 */
1370spa_t *
1371spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1372{
1373	spa_t *spa;
1374	avl_tree_t *t = &spa_namespace_avl;
1375
1376	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1377
1378	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1379		if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1380			continue;
1381		if (spa->spa_root_vdev == NULL)
1382			continue;
1383		if (spa_guid(spa) == pool_guid) {
1384			if (device_guid == 0)
1385				break;
1386
1387			if (vdev_lookup_by_guid(spa->spa_root_vdev,
1388			    device_guid) != NULL)
1389				break;
1390
1391			/*
1392			 * Check any devices we may be in the process of adding.
1393			 */
1394			if (spa->spa_pending_vdev) {
1395				if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1396				    device_guid) != NULL)
1397					break;
1398			}
1399		}
1400	}
1401
1402	return (spa);
1403}
1404
1405/*
1406 * Determine whether a pool with the given pool_guid exists.
1407 */
1408boolean_t
1409spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1410{
1411	return (spa_by_guid(pool_guid, device_guid) != NULL);
1412}
1413
1414char *
1415spa_strdup(const char *s)
1416{
1417	size_t len;
1418	char *new;
1419
1420	len = strlen(s);
1421	new = kmem_alloc(len + 1, KM_SLEEP);
1422	bcopy(s, new, len);
1423	new[len] = '\0';
1424
1425	return (new);
1426}
1427
1428void
1429spa_strfree(char *s)
1430{
1431	kmem_free(s, strlen(s) + 1);
1432}
1433
1434uint64_t
1435spa_get_random(uint64_t range)
1436{
1437	uint64_t r;
1438
1439	ASSERT(range != 0);
1440
1441	if (range == 1)
1442		return (0);
1443
1444	(void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
1445
1446	return (r % range);
1447}
1448
1449uint64_t
1450spa_generate_guid(spa_t *spa)
1451{
1452	uint64_t guid = spa_get_random(-1ULL);
1453
1454	if (spa != NULL) {
1455		while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
1456			guid = spa_get_random(-1ULL);
1457	} else {
1458		while (guid == 0 || spa_guid_exists(guid, 0))
1459			guid = spa_get_random(-1ULL);
1460	}
1461
1462	return (guid);
1463}
1464
1465void
1466snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
1467{
1468	char type[256];
1469	char *checksum = NULL;
1470	char *compress = NULL;
1471
1472	if (bp != NULL) {
1473		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
1474			dmu_object_byteswap_t bswap =
1475			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
1476			(void) snprintf(type, sizeof (type), "bswap %s %s",
1477			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
1478			    "metadata" : "data",
1479			    dmu_ot_byteswap[bswap].ob_name);
1480		} else {
1481			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
1482			    sizeof (type));
1483		}
1484		if (!BP_IS_EMBEDDED(bp)) {
1485			checksum =
1486			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
1487		}
1488		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
1489	}
1490
1491	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
1492	    compress);
1493}
1494
1495void
1496spa_freeze(spa_t *spa)
1497{
1498	uint64_t freeze_txg = 0;
1499
1500	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1501	if (spa->spa_freeze_txg == UINT64_MAX) {
1502		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1503		spa->spa_freeze_txg = freeze_txg;
1504	}
1505	spa_config_exit(spa, SCL_ALL, FTAG);
1506	if (freeze_txg != 0)
1507		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1508}
1509
1510void
1511zfs_panic_recover(const char *fmt, ...)
1512{
1513	va_list adx;
1514
1515	va_start(adx, fmt);
1516	vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1517	va_end(adx);
1518}
1519
1520/*
1521 * This is a stripped-down version of strtoull, suitable only for converting
1522 * lowercase hexadecimal numbers that don't overflow.
1523 */
1524uint64_t
1525zfs_strtonum(const char *str, char **nptr)
1526{
1527	uint64_t val = 0;
1528	char c;
1529	int digit;
1530
1531	while ((c = *str) != '\0') {
1532		if (c >= '0' && c <= '9')
1533			digit = c - '0';
1534		else if (c >= 'a' && c <= 'f')
1535			digit = 10 + c - 'a';
1536		else
1537			break;
1538
1539		val *= 16;
1540		val += digit;
1541
1542		str++;
1543	}
1544
1545	if (nptr)
1546		*nptr = (char *)str;
1547
1548	return (val);
1549}
1550
1551void
1552spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
1553{
1554	/*
1555	 * We bump the feature refcount for each special vdev added to the pool
1556	 */
1557	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
1558	spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
1559}
1560
1561/*
1562 * ==========================================================================
1563 * Accessor functions
1564 * ==========================================================================
1565 */
1566
1567boolean_t
1568spa_shutting_down(spa_t *spa)
1569{
1570	return (spa->spa_async_suspended);
1571}
1572
1573dsl_pool_t *
1574spa_get_dsl(spa_t *spa)
1575{
1576	return (spa->spa_dsl_pool);
1577}
1578
1579boolean_t
1580spa_is_initializing(spa_t *spa)
1581{
1582	return (spa->spa_is_initializing);
1583}
1584
1585boolean_t
1586spa_indirect_vdevs_loaded(spa_t *spa)
1587{
1588	return (spa->spa_indirect_vdevs_loaded);
1589}
1590
1591blkptr_t *
1592spa_get_rootblkptr(spa_t *spa)
1593{
1594	return (&spa->spa_ubsync.ub_rootbp);
1595}
1596
1597void
1598spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1599{
1600	spa->spa_uberblock.ub_rootbp = *bp;
1601}
1602
1603void
1604spa_altroot(spa_t *spa, char *buf, size_t buflen)
1605{
1606	if (spa->spa_root == NULL)
1607		buf[0] = '\0';
1608	else
1609		(void) strncpy(buf, spa->spa_root, buflen);
1610}
1611
1612int
1613spa_sync_pass(spa_t *spa)
1614{
1615	return (spa->spa_sync_pass);
1616}
1617
1618char *
1619spa_name(spa_t *spa)
1620{
1621	return (spa->spa_name);
1622}
1623
1624uint64_t
1625spa_guid(spa_t *spa)
1626{
1627	dsl_pool_t *dp = spa_get_dsl(spa);
1628	uint64_t guid;
1629
1630	/*
1631	 * If we fail to parse the config during spa_load(), we can go through
1632	 * the error path (which posts an ereport) and end up here with no root
1633	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
1634	 * this case.
1635	 */
1636	if (spa->spa_root_vdev == NULL)
1637		return (spa->spa_config_guid);
1638
1639	guid = spa->spa_last_synced_guid != 0 ?
1640	    spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
1641
1642	/*
1643	 * Return the most recently synced out guid unless we're
1644	 * in syncing context.
1645	 */
1646	if (dp && dsl_pool_sync_context(dp))
1647		return (spa->spa_root_vdev->vdev_guid);
1648	else
1649		return (guid);
1650}
1651
1652uint64_t
1653spa_load_guid(spa_t *spa)
1654{
1655	/*
1656	 * This is a GUID that exists solely as a reference for the
1657	 * purposes of the arc.  It is generated at load time, and
1658	 * is never written to persistent storage.
1659	 */
1660	return (spa->spa_load_guid);
1661}
1662
1663uint64_t
1664spa_last_synced_txg(spa_t *spa)
1665{
1666	return (spa->spa_ubsync.ub_txg);
1667}
1668
1669uint64_t
1670spa_first_txg(spa_t *spa)
1671{
1672	return (spa->spa_first_txg);
1673}
1674
1675uint64_t
1676spa_syncing_txg(spa_t *spa)
1677{
1678	return (spa->spa_syncing_txg);
1679}
1680
1681/*
1682 * Return the last txg where data can be dirtied. The final txgs
1683 * will be used to just clear out any deferred frees that remain.
1684 */
1685uint64_t
1686spa_final_dirty_txg(spa_t *spa)
1687{
1688	return (spa->spa_final_txg - TXG_DEFER_SIZE);
1689}
1690
1691pool_state_t
1692spa_state(spa_t *spa)
1693{
1694	return (spa->spa_state);
1695}
1696
1697spa_load_state_t
1698spa_load_state(spa_t *spa)
1699{
1700	return (spa->spa_load_state);
1701}
1702
1703uint64_t
1704spa_freeze_txg(spa_t *spa)
1705{
1706	return (spa->spa_freeze_txg);
1707}
1708
1709/* ARGSUSED */
1710uint64_t
1711spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
1712{
1713	return (lsize * spa_asize_inflation);
1714}
1715
1716/*
1717 * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.2%),
1718 * or at least 128MB, unless that would cause it to be more than half the
1719 * pool size.
1720 *
1721 * See the comment above spa_slop_shift for details.
1722 */
1723uint64_t
1724spa_get_slop_space(spa_t *spa)
1725{
1726	uint64_t space = spa_get_dspace(spa);
1727	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
1728}
1729
1730uint64_t
1731spa_get_dspace(spa_t *spa)
1732{
1733	return (spa->spa_dspace);
1734}
1735
1736uint64_t
1737spa_get_checkpoint_space(spa_t *spa)
1738{
1739	return (spa->spa_checkpoint_info.sci_dspace);
1740}
1741
1742void
1743spa_update_dspace(spa_t *spa)
1744{
1745	spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1746	    ddt_get_dedup_dspace(spa);
1747	if (spa->spa_vdev_removal != NULL) {
1748		/*
1749		 * We can't allocate from the removing device, so
1750		 * subtract its size.  This prevents the DMU/DSL from
1751		 * filling up the (now smaller) pool while we are in the
1752		 * middle of removing the device.
1753		 *
1754		 * Note that the DMU/DSL doesn't actually know or care
1755		 * how much space is allocated (it does its own tracking
1756		 * of how much space has been logically used).  So it
1757		 * doesn't matter that the data we are moving may be
1758		 * allocated twice (on the old device and the new
1759		 * device).
1760		 */
1761		spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1762		vdev_t *vd =
1763		    vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
1764		spa->spa_dspace -= spa_deflate(spa) ?
1765		    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
1766		spa_config_exit(spa, SCL_VDEV, FTAG);
1767	}
1768}
1769
1770/*
 * Return the failure mode that has been set for this pool. The default
1772 * behavior will be to block all I/Os when a complete failure occurs.
1773 */
1774uint8_t
1775spa_get_failmode(spa_t *spa)
1776{
1777	return (spa->spa_failmode);
1778}
1779
1780boolean_t
1781spa_suspended(spa_t *spa)
1782{
1783	return (spa->spa_suspended != ZIO_SUSPEND_NONE);
1784}
1785
1786uint64_t
1787spa_version(spa_t *spa)
1788{
1789	return (spa->spa_ubsync.ub_version);
1790}
1791
1792boolean_t
1793spa_deflate(spa_t *spa)
1794{
1795	return (spa->spa_deflate);
1796}
1797
1798metaslab_class_t *
1799spa_normal_class(spa_t *spa)
1800{
1801	return (spa->spa_normal_class);
1802}
1803
1804metaslab_class_t *
1805spa_log_class(spa_t *spa)
1806{
1807	return (spa->spa_log_class);
1808}
1809
1810metaslab_class_t *
1811spa_special_class(spa_t *spa)
1812{
1813	return (spa->spa_special_class);
1814}
1815
1816metaslab_class_t *
1817spa_dedup_class(spa_t *spa)
1818{
1819	return (spa->spa_dedup_class);
1820}
1821
1822/*
1823 * Locate an appropriate allocation class
1824 */
metaslab_class_t *
spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
    uint_t level, uint_t special_smallblk)
{
	if (DMU_OT_IS_ZIL(objtype)) {
		if (spa->spa_log_class->mc_groups != 0)
			return (spa_log_class(spa));
		else
			return (spa_normal_class(spa));
	}

	boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;

	if (DMU_OT_IS_DDT(objtype)) {
		if (spa->spa_dedup_class->mc_groups != 0)
			return (spa_dedup_class(spa));
		else if (has_special_class && zfs_ddt_data_is_special)
			return (spa_special_class(spa));
		else
			return (spa_normal_class(spa));
	}

	/* Indirect blocks for user data can land in special if allowed */
	if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
		if (has_special_class && zfs_user_indirect_is_special)
			return (spa_special_class(spa));
		else
			return (spa_normal_class(spa));
	}

	if (DMU_OT_IS_METADATA(objtype) || level > 0) {
		if (has_special_class)
			return (spa_special_class(spa));
		else
			return (spa_normal_class(spa));
	}

	/*
	 * Allow small file blocks in the special class in some cases (e.g.
	 * for the dRAID vdev feature), but always leave a reserve of
	 * zfs_special_class_metadata_reserve_pct exclusively for metadata.
	 */
	if (DMU_OT_IS_FILE(objtype) &&
	    has_special_class && size <= special_smallblk) {
		metaslab_class_t *special = spa_special_class(spa);
		uint64_t alloc = metaslab_class_get_alloc(special);
		uint64_t space = metaslab_class_get_space(special);
		uint64_t limit =
		    (space * (100 - zfs_special_class_metadata_reserve_pct))
		    / 100;

		if (alloc < limit)
			return (special);
	}

	return (spa_normal_class(spa));
}
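
/*
 * Sketch of the small-block reserve above, assuming the default
 * zfs_special_class_metadata_reserve_pct of 25: with a 1T special class,
 * small file blocks are directed there only while its allocations stay
 * below 768G; the remaining 25% is held back for metadata, which is not
 * subject to this limit.
 */
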
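/*
 * The spa_evicting_os_list tracks objsets whose eviction is still in
 * flight.  Registration and deregistration bracket that work so that
 * spa_evicting_os_wait() can block (e.g. while the pool is being
 * deactivated) until every such objset is gone.
 */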
void
spa_evicting_os_register(spa_t *spa, objset_t *os)
{
	mutex_enter(&spa->spa_evicting_os_lock);
	list_insert_head(&spa->spa_evicting_os_list, os);
	mutex_exit(&spa->spa_evicting_os_lock);
}

void
spa_evicting_os_deregister(spa_t *spa, objset_t *os)
{
	mutex_enter(&spa->spa_evicting_os_lock);
	list_remove(&spa->spa_evicting_os_list, os);
	cv_broadcast(&spa->spa_evicting_os_cv);
	mutex_exit(&spa->spa_evicting_os_lock);
}

void
spa_evicting_os_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_evicting_os_lock);
	while (!list_is_empty(&spa->spa_evicting_os_list))
		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
	mutex_exit(&spa->spa_evicting_os_lock);

	dmu_buf_user_evict_wait();
}

int
spa_max_replication(spa_t *spa)
{
	/*
	 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
	 * handle BPs with more than one DVA allocated.  Set our max
	 * replication level accordingly.
	 */
	if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
		return (1);
	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
}
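
/*
 * For example, SPA_DVAS_PER_BP is 3 and spa_max_replication_override
 * defaults to the same value, so a pool at or above
 * SPA_VERSION_DITTO_BLOCKS may store up to three copies of a block, while
 * older pools are limited to a single DVA.
 */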

int
spa_prev_software_version(spa_t *spa)
{
	return (spa->spa_prev_software_version);
}

uint64_t
spa_deadman_synctime(spa_t *spa)
{
	return (spa->spa_deadman_synctime);
}

spa_autotrim_t
spa_get_autotrim(spa_t *spa)
{
	return (spa->spa_autotrim);
}

uint64_t
dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
	uint64_t asize = DVA_GET_ASIZE(dva);
	uint64_t dsize = asize;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (asize != 0 && spa->spa_deflate) {
		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
	}

	return (dsize);
}
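
/*
 * Rough example: on a raidz vdev where a 128K psize block allocates 192K
 * of asize, vdev_deflate_ratio is about 131072 / 384 = 341, so a DVA with
 * asize 192K reports a dsize of (192K >> 9) * 341 = 130944 bytes, i.e.
 * approximately its 128K "deflated" size.
 */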

uint64_t
bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize = 0;

	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

	return (dsize);
}

uint64_t
bp_get_dsize(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize = 0;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);

	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (dsize);
}

uint64_t
spa_dirty_data(spa_t *spa)
{
	return (spa->spa_dsl_pool->dp_dirty_total);
}

/*
 * ==========================================================================
 * Initialization and Termination
 * ==========================================================================
 */

static int
spa_name_compare(const void *a1, const void *a2)
{
	const spa_t *s1 = a1;
	const spa_t *s2 = a2;
	int s;

	s = strcmp(s1->spa_name, s2->spa_name);

	return (AVL_ISIGN(s));
}

int
spa_busy(void)
{
	return (spa_active_count);
}

void
spa_boot_init()
{
	spa_config_load();
}

void
spa_init(int mode)
{
	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);

	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
	    offsetof(spa_t, spa_avl));

	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
	    offsetof(spa_aux_t, aux_avl));

	spa_mode_global = mode;

#ifdef _KERNEL
	spa_arch_init();
#else
	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
		arc_procfd = open("/proc/self/ctl", O_WRONLY);
		if (arc_procfd == -1) {
			perror("could not enable watchpoints: "
			    "opening /proc/self/ctl failed");
		} else {
			arc_watch = B_TRUE;
		}
	}
#endif

	zfs_refcount_init();
	unique_init();
	range_tree_init();
	metaslab_alloc_trace_init();
	zio_init();
	dmu_init();
	zil_init();
	vdev_cache_stat_init();
	vdev_mirror_stat_init();
	zfs_prop_init();
	zpool_prop_init();
	zpool_feature_init();
	spa_config_load();
	l2arc_start();
	scan_init();
}
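
/*
 * spa_init() and spa_fini() are expected to bracket all use of the SPA
 * subsystem.  In userland, for instance, consumers built on libzpool
 * (such as zdb and ztest) reach spa_init() through kernel_init() before
 * touching any pool and tear down through spa_fini() on exit.
 */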

void
spa_fini(void)
{
	l2arc_stop();

	spa_evict_all();

	vdev_cache_stat_fini();
	vdev_mirror_stat_fini();
	zil_fini();
	dmu_fini();
	zio_fini();
	metaslab_alloc_trace_fini();
	range_tree_fini();
	unique_fini();
	zfs_refcount_fini();
	scan_fini();

	avl_destroy(&spa_namespace_avl);
	avl_destroy(&spa_spare_avl);
	avl_destroy(&spa_l2cache_avl);

	cv_destroy(&spa_namespace_cv);
	mutex_destroy(&spa_namespace_lock);
	mutex_destroy(&spa_spare_lock);
	mutex_destroy(&spa_l2cache_lock);
}

/*
 * Return whether this pool has slogs. No locking needed.
 * It's not a problem if the wrong answer is returned as it's only for
 * performance and not correctness.
 */
boolean_t
spa_has_slogs(spa_t *spa)
{
	return (spa->spa_log_class->mc_rotor != NULL);
}

spa_log_state_t
spa_get_log_state(spa_t *spa)
{
	return (spa->spa_log_state);
}

void
spa_set_log_state(spa_t *spa, spa_log_state_t state)
{
	spa->spa_log_state = state;
}

boolean_t
spa_is_root(spa_t *spa)
{
	return (spa->spa_is_root);
}

boolean_t
spa_writeable(spa_t *spa)
{
	return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config);
}

/*
 * Returns true if there is a pending sync task in any of the current
 * syncing txg, the current quiescing txg, or the current open txg.
 */
boolean_t
spa_has_pending_synctask(spa_t *spa)
{
	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
	    !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
}

int
spa_mode(spa_t *spa)
{
	return (spa->spa_mode);
}

uint64_t
spa_bootfs(spa_t *spa)
{
	return (spa->spa_bootfs);
}

uint64_t
spa_delegation(spa_t *spa)
{
	return (spa->spa_delegation);
}

objset_t *
spa_meta_objset(spa_t *spa)
{
	return (spa->spa_meta_objset);
}

enum zio_checksum
spa_dedup_checksum(spa_t *spa)
{
	return (spa->spa_dedup_checksum);
}

/*
 * Reset pool scan stats per scan pass (or reboot).
 */
void
spa_scan_stat_init(spa_t *spa)
{
	/* data not stored on disk */
	spa->spa_scan_pass_start = gethrestime_sec();
	if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
		spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
	else
		spa->spa_scan_pass_scrub_pause = 0;
	spa->spa_scan_pass_scrub_spent_paused = 0;
	spa->spa_scan_pass_exam = 0;
	spa->spa_scan_pass_issued = 0;
	vdev_scan_stat_init(spa->spa_root_vdev);
}

/*
 * Get scan stats for zpool status reports
 */
int
spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
{
	dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;

	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
		return (SET_ERROR(ENOENT));
	bzero(ps, sizeof (pool_scan_stat_t));

	/* data stored on disk */
	ps->pss_func = scn->scn_phys.scn_func;
	ps->pss_state = scn->scn_phys.scn_state;
	ps->pss_start_time = scn->scn_phys.scn_start_time;
	ps->pss_end_time = scn->scn_phys.scn_end_time;
	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
	ps->pss_to_process = scn->scn_phys.scn_to_process;
	ps->pss_processed = scn->scn_phys.scn_processed;
	ps->pss_errors = scn->scn_phys.scn_errors;
	ps->pss_examined = scn->scn_phys.scn_examined;
	ps->pss_issued =
	    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;

	/* data not stored on disk */
	ps->pss_pass_start = spa->spa_scan_pass_start;
	ps->pss_pass_exam = spa->spa_scan_pass_exam;
	ps->pss_pass_issued = spa->spa_scan_pass_issued;
	ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
	ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;

	return (0);
}
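
/*
 * A consumer can derive a scan rate from these stats (roughly what
 * "zpool status" reports) by subtracting the time spent paused from the
 * elapsed pass time and dividing the bytes examined this pass by it:
 *
 *	elapsed = now - ps->pss_pass_start - ps->pss_pass_scrub_spent_paused;
 *	rate = ps->pss_pass_exam / MAX(elapsed, 1);
 */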

int
spa_maxblocksize(spa_t *spa)
{
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SPA_MAXBLOCKSIZE);
	else
		return (SPA_OLD_MAXBLOCKSIZE);
}

int
spa_maxdnodesize(spa_t *spa)
{
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
		return (DNODE_MAX_SIZE);
	else
		return (DNODE_MIN_SIZE);
}

boolean_t
spa_multihost(spa_t *spa)
{
	return (spa->spa_multihost ? B_TRUE : B_FALSE);
}

unsigned long
spa_get_hostid(void)
{
	unsigned long myhostid;

#ifdef	_KERNEL
	myhostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
	/*
	 * We're emulating the system's hostid in userland, so
	 * we can't use zone_get_hostid().
	 */
	(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif	/* _KERNEL */

	return (myhostid);
}

/*
 * Returns the txg in which the last device removal completed. No indirect
 * mappings have been added since this txg.
 */
uint64_t
spa_get_last_removal_txg(spa_t *spa)
{
	uint64_t vdevid;
	uint64_t ret = -1ULL;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	/*
	 * sr_prev_indirect_vdev is only modified while holding all the
	 * config locks, so it is sufficient to hold SCL_VDEV as reader when
	 * examining it.
	 */
	vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;

	while (vdevid != -1ULL) {
		vdev_t *vd = vdev_lookup_top(spa, vdevid);
		vdev_indirect_births_t *vib = vd->vdev_indirect_births;

		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

		/*
		 * If the removal did not remap any data, we don't care.
		 */
		if (vdev_indirect_births_count(vib) != 0) {
			ret = vdev_indirect_births_last_entry_txg(vib);
			break;
		}

		vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
	}
	spa_config_exit(spa, SCL_VDEV, FTAG);

	IMPLY(ret != -1ULL,
	    spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));

	return (ret);
}

boolean_t
spa_trust_config(spa_t *spa)
{
	return (spa->spa_trust_config);
}

uint64_t
spa_missing_tvds_allowed(spa_t *spa)
{
	return (spa->spa_missing_tvds_allowed);
}

void
spa_set_missing_tvds(spa_t *spa, uint64_t missing)
{
	spa->spa_missing_tvds = missing;
}

boolean_t
spa_top_vdevs_spacemap_addressable(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
			return (B_FALSE);
	}
	return (B_TRUE);
}

boolean_t
spa_has_checkpoint(spa_t *spa)
{
	return (spa->spa_checkpoint_txg != 0);
}

boolean_t
spa_importing_readonly_checkpoint(spa_t *spa)
{
	return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
	    spa->spa_mode == FREAD);
}

uint64_t
spa_min_claim_txg(spa_t *spa)
{
	uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;

	if (checkpoint_txg != 0)
		return (checkpoint_txg + 1);

	return (spa->spa_first_txg);
}

/*
 * If there is a checkpoint, async destroys may consume more space from
 * the pool instead of freeing it. In an attempt to save the pool from
 * getting suspended when it is about to run out of space, we stop
 * processing async destroys.
 */
boolean_t
spa_suspend_async_destroy(spa_t *spa)
{
	dsl_pool_t *dp = spa_get_dsl(spa);

	uint64_t unreserved = dsl_pool_unreserved_space(dp,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED);
	uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
	uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;

	if (spa_has_checkpoint(spa) && avail == 0)
		return (B_TRUE);

	return (B_FALSE);
}
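
/*
 * Worked example: if a checkpoint is active, unreserved space is 95T, and
 * the root dsl_dir already accounts for 95T of used space, then avail is 0
 * and async destroys are suspended until either the checkpoint is
 * discarded or space is freed by other means.
 */
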