xref: /illumos-gate/usr/src/uts/common/fs/zfs/metaslab.c (revision ecd18dec)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
228d18220dSMark J Musante  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23f78cdc34SPaul Dagnelie  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
249dc3941cSSašo Kiselkov  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25c3d26abcSMatthew Ahrens  * Copyright (c) 2014 Integros [integros.com]
26663207adSDon Brady  * Copyright (c) 2017, Intel Corporation.
27fa9e4066Sahrens  */
28fa9e4066Sahrens 
29fa9e4066Sahrens #include <sys/zfs_context.h>
30fa9e4066Sahrens #include <sys/dmu.h>
31fa9e4066Sahrens #include <sys/dmu_tx.h>
32fa9e4066Sahrens #include <sys/space_map.h>
33fa9e4066Sahrens #include <sys/metaslab_impl.h>
34fa9e4066Sahrens #include <sys/vdev_impl.h>
35fa9e4066Sahrens #include <sys/zio.h>
360713e232SGeorge Wilson #include <sys/spa_impl.h>
372e4c9986SGeorge Wilson #include <sys/zfeature.h>
385cabbc6bSPrashanth Sreenivasa #include <sys/vdev_indirect_mapping.h>
3986714001SSerapheim Dimitropoulos #include <sys/zap.h>
404d7988d6SPaul Dagnelie #include <sys/btree.h>
41fa9e4066Sahrens 
420f7643c7SGeorge Wilson #define	GANG_ALLOCATION(flags) \
430f7643c7SGeorge Wilson 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
4409c9d376SGeorge Wilson 
4513506d1eSmaybee uint64_t metaslab_aliquot = 512ULL << 10;
46243952c7SMatt Ahrens uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
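/*
 * Note that with these defaults each top-level vdev receives roughly 512K
 * of allocations before the allocation rotor advances, and since
 * metaslab_force_ganging sits just above the largest possible block size
 * (SPA_MAXBLOCKSIZE), gang blocks are never forced unless that tunable is
 * lowered (e.g. for testing).
 */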
4713506d1eSmaybee 
4886714001SSerapheim Dimitropoulos /*
49814dcd43SSerapheim Dimitropoulos  * In pools where the log space map feature is not enabled we touch
50814dcd43SSerapheim Dimitropoulos  * multiple metaslabs (and their respective space maps) with each
51814dcd43SSerapheim Dimitropoulos  * transaction group. Thus, we benefit from having a small space map
5286714001SSerapheim Dimitropoulos  * block size since it allows us to issue more I/O operations scattered
53814dcd43SSerapheim Dimitropoulos  * around the disk. So a sane default for the space map block size
54814dcd43SSerapheim Dimitropoulos  * is 8~16K.
5586714001SSerapheim Dimitropoulos  */
56814dcd43SSerapheim Dimitropoulos int zfs_metaslab_sm_blksz_no_log = (1 << 14);
57814dcd43SSerapheim Dimitropoulos 
58814dcd43SSerapheim Dimitropoulos /*
59814dcd43SSerapheim Dimitropoulos  * When the log space map feature is enabled, we accumulate a lot of
60814dcd43SSerapheim Dimitropoulos  * changes per metaslab that are flushed once in a while so we benefit
61814dcd43SSerapheim Dimitropoulos  * from a bigger block size like 128K for the metaslab space maps.
62814dcd43SSerapheim Dimitropoulos  */
63814dcd43SSerapheim Dimitropoulos int zfs_metaslab_sm_blksz_with_log = (1 << 17);
6486714001SSerapheim Dimitropoulos 
6516a4a807SGeorge Wilson /*
6616a4a807SGeorge Wilson  * The in-core space map representation is more compact than its on-disk form.
6716a4a807SGeorge Wilson  * The zfs_condense_pct determines how much more compact the in-core
688363e80aSGeorge Wilson  * space map representation must be before we compact it on-disk.
6916a4a807SGeorge Wilson  * Values should be greater than or equal to 100.
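 * With the default of 200, for instance, we only consider condensing once
 * the on-disk space map is roughly twice the size of its condensed form.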
7016a4a807SGeorge Wilson  */
7116a4a807SGeorge Wilson int zfs_condense_pct = 200;
7216a4a807SGeorge Wilson 
732a104a52SAlex Reece /*
742a104a52SAlex Reece  * Condensing a metaslab is not guaranteed to actually reduce the amount of
752a104a52SAlex Reece  * space used on disk. In particular, a space map uses data in increments of
76b1be2892SMatthew Ahrens  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
772a104a52SAlex Reece  * same number of blocks after condensing. Since the goal of condensing is to
782a104a52SAlex Reece  * reduce the number of IOPs required to read the space map, we only want to
792a104a52SAlex Reece  * condense when we can be sure we will reduce the number of blocks used by the
802a104a52SAlex Reece  * space map. Unfortunately, we cannot precisely compute whether or not this is
812a104a52SAlex Reece  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
822a104a52SAlex Reece  * we apply the following heuristic: do not condense a spacemap unless the
832a104a52SAlex Reece  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
842a104a52SAlex Reece  * blocks.
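 * For example, assuming a 16K space map block size (see
 * zfs_metaslab_sm_blksz_no_log above) and an ashift of 14 or less, the
 * uncondensed space map must consume more than 4 * 16K = 64K on disk
 * before it becomes eligible for condensing.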
852a104a52SAlex Reece  */
862a104a52SAlex Reece int zfs_metaslab_condense_block_threshold = 4;
872a104a52SAlex Reece 
8822e30981SGeorge Wilson /*
8922e30981SGeorge Wilson  * The zfs_mg_noalloc_threshold defines which metaslab groups should
9022e30981SGeorge Wilson  * be eligible for allocation. The value is defined as a percentage of
912e4c9986SGeorge Wilson  * free space. Metaslab groups that have more free space than
9222e30981SGeorge Wilson  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
9322e30981SGeorge Wilson  * a metaslab group's free space is less than or equal to the
9422e30981SGeorge Wilson  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
9522e30981SGeorge Wilson  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
9622e30981SGeorge Wilson  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
9722e30981SGeorge Wilson  * groups are allowed to accept allocations. Gang blocks are always
9822e30981SGeorge Wilson  * eligible to allocate on any metaslab group. The default value of 0 means
9922e30981SGeorge Wilson  * no metaslab group will be excluded based on this criterion.
10022e30981SGeorge Wilson  */
10122e30981SGeorge Wilson int zfs_mg_noalloc_threshold = 0;
10209c9d376SGeorge Wilson 
1032e4c9986SGeorge Wilson /*
1042e4c9986SGeorge Wilson  * Metaslab groups are considered eligible for allocations if their
105814dcd43SSerapheim Dimitropoulos  * fragmentation metric (measured as a percentage) is less than or
106814dcd43SSerapheim Dimitropoulos  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
107814dcd43SSerapheim Dimitropoulos  * exceeds this threshold then it will be skipped unless all metaslab
108814dcd43SSerapheim Dimitropoulos  * groups within the metaslab class have also crossed this threshold.
109814dcd43SSerapheim Dimitropoulos  *
110814dcd43SSerapheim Dimitropoulos  * This tunable was introduced to avoid edge cases where we continue
111814dcd43SSerapheim Dimitropoulos  * allocating from very fragmented disks in our pool while other, less
112814dcd43SSerapheim Dimitropoulos  * fragmented disks exist. On the other hand, if all disks in the
113814dcd43SSerapheim Dimitropoulos  * pool are uniformly approaching the threshold, the threshold can
114814dcd43SSerapheim Dimitropoulos  * be a speed bump in performance, where we keep switching the disks
115814dcd43SSerapheim Dimitropoulos  * that we allocate from (e.g. we allocate some segments from disk A
116814dcd43SSerapheim Dimitropoulos  * making it go over the threshold, while freeing segments from disk
117814dcd43SSerapheim Dimitropoulos  * B brings its fragmentation back below the threshold).
118814dcd43SSerapheim Dimitropoulos  *
119814dcd43SSerapheim Dimitropoulos  * Empirically, we've seen that our vdev selection for allocations is
120814dcd43SSerapheim Dimitropoulos  * good enough that fragmentation increases uniformly across all vdevs
121814dcd43SSerapheim Dimitropoulos  * the majority of the time. Thus we set the threshold percentage high
122814dcd43SSerapheim Dimitropoulos  * enough to avoid hitting the speed bump on pools that are being pushed
123814dcd43SSerapheim Dimitropoulos  * to the edge.
1242e4c9986SGeorge Wilson  */
125814dcd43SSerapheim Dimitropoulos int zfs_mg_fragmentation_threshold = 95;
1262e4c9986SGeorge Wilson 
1272e4c9986SGeorge Wilson /*
1282e4c9986SGeorge Wilson  * Allow metaslabs to keep their active state as long as their fragmentation
1292e4c9986SGeorge Wilson  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
1302e4c9986SGeorge Wilson  * active metaslab that exceeds this threshold will no longer keep its active
1312e4c9986SGeorge Wilson  * status allowing better metaslabs to be selected.
1322e4c9986SGeorge Wilson  */
1332e4c9986SGeorge Wilson int zfs_metaslab_fragmentation_threshold = 70;
1342e4c9986SGeorge Wilson 
135b24ab676SJeff Bonwick /*
1360713e232SGeorge Wilson  * When set, we load all metaslabs when the pool is first opened.
137b24ab676SJeff Bonwick  */
1380713e232SGeorge Wilson int metaslab_debug_load = 0;
1390713e232SGeorge Wilson 
1400713e232SGeorge Wilson /*
1410713e232SGeorge Wilson  * When set, metaslabs are prevented from being unloaded.
1420713e232SGeorge Wilson  */
1430713e232SGeorge Wilson int metaslab_debug_unload = 0;
144b24ab676SJeff Bonwick 
145d6e555bdSGeorge Wilson /*
146d6e555bdSGeorge Wilson  * Minimum size which forces the dynamic allocator to change
14780eb36f2SGeorge Wilson  * its allocation strategy.  Once the space map cannot satisfy
148d6e555bdSGeorge Wilson  * an allocation of this size, it switches to a more aggressive
149d6e555bdSGeorge Wilson  * strategy (i.e. search by size rather than by offset).
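 * The default, SPA_OLD_MAXBLOCKSIZE, is 128K, so the switch happens once
 * the space map cannot satisfy a 128K allocation.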
150d6e555bdSGeorge Wilson  */
151b5152584SMatthew Ahrens uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
152d6e555bdSGeorge Wilson 
153d6e555bdSGeorge Wilson /*
154d6e555bdSGeorge Wilson  * The minimum free space, in percent, which must be available
155d6e555bdSGeorge Wilson  * in a space map to continue allocations in a first-fit fashion.
1568363e80aSGeorge Wilson  * Once the space map's free space drops below this level we dynamically
157d6e555bdSGeorge Wilson  * switch to using best-fit allocations.
158d6e555bdSGeorge Wilson  */
15980eb36f2SGeorge Wilson int metaslab_df_free_pct = 4;
16080eb36f2SGeorge Wilson 
161814dcd43SSerapheim Dimitropoulos /*
162814dcd43SSerapheim Dimitropoulos  * Maximum distance to search forward from the last offset. Without this
163814dcd43SSerapheim Dimitropoulos  * limit, fragmented pools can see >100,000 iterations and
164814dcd43SSerapheim Dimitropoulos  * metaslab_block_picker() becomes the performance limiting factor on
165814dcd43SSerapheim Dimitropoulos  * high-performance storage.
166814dcd43SSerapheim Dimitropoulos  *
167814dcd43SSerapheim Dimitropoulos  * With the default setting of 16MB, we typically see less than 500
168814dcd43SSerapheim Dimitropoulos  * iterations, even with very fragmented, ashift=9 pools. The maximum number
169814dcd43SSerapheim Dimitropoulos  * of iterations possible is:
170814dcd43SSerapheim Dimitropoulos  *     metaslab_df_max_search / (2 * (1<<ashift))
171814dcd43SSerapheim Dimitropoulos  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
172814dcd43SSerapheim Dimitropoulos  * 2048 (with ashift=12).
173814dcd43SSerapheim Dimitropoulos  */
174814dcd43SSerapheim Dimitropoulos int metaslab_df_max_search = 16 * 1024 * 1024;
175814dcd43SSerapheim Dimitropoulos 
1764d7988d6SPaul Dagnelie /*
1774d7988d6SPaul Dagnelie  * Forces the metaslab_block_picker function to search for at least this many
1784d7988d6SPaul Dagnelie  * segments forwards until giving up on finding a segment that the allocation
1794d7988d6SPaul Dagnelie  * will fit into.
1804d7988d6SPaul Dagnelie  */
1814d7988d6SPaul Dagnelie uint32_t metaslab_min_search_count = 100;
1824d7988d6SPaul Dagnelie 
183814dcd43SSerapheim Dimitropoulos /*
184814dcd43SSerapheim Dimitropoulos  * If we are not searching forward (due to metaslab_df_max_search,
185814dcd43SSerapheim Dimitropoulos  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
186814dcd43SSerapheim Dimitropoulos  * controls what segment is used.  If it is set, we will use the largest free
187814dcd43SSerapheim Dimitropoulos  * segment.  If it is not set, we will use a segment of exactly the requested
188814dcd43SSerapheim Dimitropoulos  * size (or larger).
189814dcd43SSerapheim Dimitropoulos  */
190814dcd43SSerapheim Dimitropoulos int metaslab_df_use_largest_segment = B_FALSE;
191814dcd43SSerapheim Dimitropoulos 
19280eb36f2SGeorge Wilson /*
19380eb36f2SGeorge Wilson  * A metaslab is considered "free" if it contains a contiguous
19480eb36f2SGeorge Wilson  * segment which is greater than metaslab_min_alloc_size.
19580eb36f2SGeorge Wilson  */
19680eb36f2SGeorge Wilson uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
19780eb36f2SGeorge Wilson 
19880eb36f2SGeorge Wilson /*
1990713e232SGeorge Wilson  * Percentage of all cpus that can be used by the metaslab taskq.
20080eb36f2SGeorge Wilson  */
2010713e232SGeorge Wilson int metaslab_load_pct = 50;
20280eb36f2SGeorge Wilson 
20380eb36f2SGeorge Wilson /*
204af1d63abSPaul Dagnelie  * These tunables control how long a metaslab will remain loaded after the
205af1d63abSPaul Dagnelie  * last allocation from it.  A metaslab can't be unloaded until at least
206af1d63abSPaul Dagnelie  * metaslab_unload_delay TXGs and metaslab_unload_delay_ms milliseconds
207af1d63abSPaul Dagnelie  * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
208af1d63abSPaul Dagnelie  * unloaded sooner.  These settings are intended to be generous -- to keep
209af1d63abSPaul Dagnelie  * metaslabs loaded for a long time, reducing the rate of metaslab loading.
21080eb36f2SGeorge Wilson  */
211af1d63abSPaul Dagnelie int metaslab_unload_delay = 32;
212af1d63abSPaul Dagnelie int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
213d6e555bdSGeorge Wilson 
2140713e232SGeorge Wilson /*
2150713e232SGeorge Wilson  * Max number of metaslabs per group to preload.
2160713e232SGeorge Wilson  */
217af1d63abSPaul Dagnelie int metaslab_preload_limit = 10;
2180713e232SGeorge Wilson 
2190713e232SGeorge Wilson /*
2200713e232SGeorge Wilson  * Enable/disable preloading of metaslabs.
2210713e232SGeorge Wilson  */
2220713e232SGeorge Wilson boolean_t metaslab_preload_enabled = B_TRUE;
2230713e232SGeorge Wilson 
2240713e232SGeorge Wilson /*
2252e4c9986SGeorge Wilson  * Enable/disable fragmentation weighting on metaslabs.
2262e4c9986SGeorge Wilson  */
2272e4c9986SGeorge Wilson boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
2282e4c9986SGeorge Wilson 
2292e4c9986SGeorge Wilson /*
2302e4c9986SGeorge Wilson  * Enable/disable lba weighting (i.e. outer tracks are given preference).
2312e4c9986SGeorge Wilson  */
2322e4c9986SGeorge Wilson boolean_t metaslab_lba_weighting_enabled = B_TRUE;
2332e4c9986SGeorge Wilson 
2342e4c9986SGeorge Wilson /*
2352e4c9986SGeorge Wilson  * Enable/disable metaslab group biasing.
2360713e232SGeorge Wilson  */
2372e4c9986SGeorge Wilson boolean_t metaslab_bias_enabled = B_TRUE;
2380713e232SGeorge Wilson 
2395cabbc6bSPrashanth Sreenivasa /*
2405cabbc6bSPrashanth Sreenivasa  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
2415cabbc6bSPrashanth Sreenivasa  */
2425cabbc6bSPrashanth Sreenivasa boolean_t zfs_remap_blkptr_enable = B_TRUE;
2435cabbc6bSPrashanth Sreenivasa 
2448363e80aSGeorge Wilson /*
2458363e80aSGeorge Wilson  * Enable/disable segment-based metaslab selection.
2468363e80aSGeorge Wilson  */
2478363e80aSGeorge Wilson boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
2488363e80aSGeorge Wilson 
2498363e80aSGeorge Wilson /*
2508363e80aSGeorge Wilson  * When using segment-based metaslab selection, we will continue
2518363e80aSGeorge Wilson  * allocating from the active metaslab until we have exhausted
2528363e80aSGeorge Wilson  * zfs_metaslab_switch_threshold of its buckets.
2538363e80aSGeorge Wilson  */
2548363e80aSGeorge Wilson int zfs_metaslab_switch_threshold = 2;
2558363e80aSGeorge Wilson 
2568363e80aSGeorge Wilson /*
2578363e80aSGeorge Wilson  * Internal switch to enable/disable the metaslab allocation tracing
2588363e80aSGeorge Wilson  * facility.
2598363e80aSGeorge Wilson  */
2608363e80aSGeorge Wilson boolean_t metaslab_trace_enabled = B_TRUE;
2618363e80aSGeorge Wilson 
2628363e80aSGeorge Wilson /*
2638363e80aSGeorge Wilson  * Maximum entries that the metaslab allocation tracing facility will keep
2648363e80aSGeorge Wilson  * in a given list when running in non-debug mode. We limit the number
2658363e80aSGeorge Wilson  * of entries in non-debug mode to prevent us from using up too much memory.
2668363e80aSGeorge Wilson  * The limit should be sufficiently large that we don't expect any allocation
2678363e80aSGeorge Wilson  * to ever exceed this value. In debug mode, the system will panic if this
2688363e80aSGeorge Wilson  * limit is ever reached, allowing for further investigation.
2698363e80aSGeorge Wilson  */
2708363e80aSGeorge Wilson uint64_t metaslab_trace_max_entries = 5000;
2718363e80aSGeorge Wilson 
272084fd14fSBrian Behlendorf /*
273084fd14fSBrian Behlendorf  * Maximum number of metaslabs per group that can be disabled
274084fd14fSBrian Behlendorf  * simultaneously.
275084fd14fSBrian Behlendorf  */
276084fd14fSBrian Behlendorf int max_disabled_ms = 3;
277084fd14fSBrian Behlendorf 
2784d7988d6SPaul Dagnelie /*
2794d7988d6SPaul Dagnelie  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
2804d7988d6SPaul Dagnelie  * To avoid 64-bit overflow, don't set above UINT32_MAX.
2814d7988d6SPaul Dagnelie  */
2824d7988d6SPaul Dagnelie unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
2834d7988d6SPaul Dagnelie 
284af1d63abSPaul Dagnelie /*
285af1d63abSPaul Dagnelie  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
286af1d63abSPaul Dagnelie  * a metaslab would take it over this percentage, the oldest selected metaslab
287af1d63abSPaul Dagnelie  * is automatically unloaded.
288af1d63abSPaul Dagnelie  */
2894d7988d6SPaul Dagnelie int zfs_metaslab_mem_limit = 75;
290af1d63abSPaul Dagnelie 
291af1d63abSPaul Dagnelie /*
2924d7988d6SPaul Dagnelie  * Force the per-metaslab range trees to use 64-bit integers to store
2934d7988d6SPaul Dagnelie  * segments. Used for debugging purposes.
294af1d63abSPaul Dagnelie  */
2954d7988d6SPaul Dagnelie boolean_t zfs_metaslab_force_large_segs = B_FALSE;
2964d7988d6SPaul Dagnelie 
2974d7988d6SPaul Dagnelie /*
2984d7988d6SPaul Dagnelie  * By default we only store segments over a certain size in the size-sorted
2994d7988d6SPaul Dagnelie  * metaslab trees (ms_allocatable_by_size and
3004d7988d6SPaul Dagnelie  * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
3014d7988d6SPaul Dagnelie  * improves load and unload times at the cost of causing us to use slightly
3024d7988d6SPaul Dagnelie  * larger segments than we would otherwise in some cases.
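 * The default shift of 14 means only free segments of at least 16K are
 * tracked in these size-sorted trees.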
3034d7988d6SPaul Dagnelie  */
3044d7988d6SPaul Dagnelie uint32_t metaslab_by_size_min_shift = 14;
305af1d63abSPaul Dagnelie 
3068363e80aSGeorge Wilson static uint64_t metaslab_weight(metaslab_t *);
3078363e80aSGeorge Wilson static void metaslab_set_fragmentation(metaslab_t *);
30886714001SSerapheim Dimitropoulos static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
3095cabbc6bSPrashanth Sreenivasa static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
310f78cdc34SPaul Dagnelie static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
311f78cdc34SPaul Dagnelie static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
312814dcd43SSerapheim Dimitropoulos static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
313af1d63abSPaul Dagnelie static unsigned int metaslab_idx_func(multilist_t *, void *);
314af1d63abSPaul Dagnelie static void metaslab_evict(metaslab_t *, uint64_t);
3154d7988d6SPaul Dagnelie static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
3168363e80aSGeorge Wilson 
3178363e80aSGeorge Wilson kmem_cache_t *metaslab_alloc_trace_cache;
3180713e232SGeorge Wilson 
3194d7988d6SPaul Dagnelie typedef struct metaslab_stats {
3204d7988d6SPaul Dagnelie 	kstat_named_t metaslabstat_trace_over_limit;
3214d7988d6SPaul Dagnelie 	kstat_named_t metaslabstat_df_find_under_floor;
3224d7988d6SPaul Dagnelie 	kstat_named_t metaslabstat_reload_tree;
3234d7988d6SPaul Dagnelie } metaslab_stats_t;
3244d7988d6SPaul Dagnelie 
3254d7988d6SPaul Dagnelie static metaslab_stats_t metaslab_stats = {
3264d7988d6SPaul Dagnelie 	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
3274d7988d6SPaul Dagnelie 	{ "df_find_under_floor",	KSTAT_DATA_UINT64 },
3284d7988d6SPaul Dagnelie 	{ "reload_tree",		KSTAT_DATA_UINT64 },
3294d7988d6SPaul Dagnelie };
3304d7988d6SPaul Dagnelie 
3314d7988d6SPaul Dagnelie #define	METASLABSTAT_BUMP(stat) \
3324d7988d6SPaul Dagnelie 	atomic_inc_64(&metaslab_stats.stat.value.ui64);
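/*
 * For example, METASLABSTAT_BUMP(metaslabstat_reload_tree) expands to an
 * atomic_inc_64() of that counter's 64-bit value.
 */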
3334d7988d6SPaul Dagnelie 
3344d7988d6SPaul Dagnelie 
3354d7988d6SPaul Dagnelie kstat_t *metaslab_ksp;
3364d7988d6SPaul Dagnelie 
3374d7988d6SPaul Dagnelie void
3384d7988d6SPaul Dagnelie metaslab_stat_init(void)
3394d7988d6SPaul Dagnelie {
3404d7988d6SPaul Dagnelie 	ASSERT(metaslab_alloc_trace_cache == NULL);
3414d7988d6SPaul Dagnelie 	metaslab_alloc_trace_cache = kmem_cache_create(
3424d7988d6SPaul Dagnelie 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
3434d7988d6SPaul Dagnelie 	    0, NULL, NULL, NULL, NULL, NULL, 0);
3444d7988d6SPaul Dagnelie 	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
3454d7988d6SPaul Dagnelie 	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
3464d7988d6SPaul Dagnelie 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3474d7988d6SPaul Dagnelie 	if (metaslab_ksp != NULL) {
3484d7988d6SPaul Dagnelie 		metaslab_ksp->ks_data = &metaslab_stats;
3494d7988d6SPaul Dagnelie 		kstat_install(metaslab_ksp);
3504d7988d6SPaul Dagnelie 	}
3514d7988d6SPaul Dagnelie }
3524d7988d6SPaul Dagnelie 
3534d7988d6SPaul Dagnelie void
3544d7988d6SPaul Dagnelie metaslab_stat_fini(void)
3554d7988d6SPaul Dagnelie {
3564d7988d6SPaul Dagnelie 	if (metaslab_ksp != NULL) {
3574d7988d6SPaul Dagnelie 		kstat_delete(metaslab_ksp);
3584d7988d6SPaul Dagnelie 		metaslab_ksp = NULL;
3594d7988d6SPaul Dagnelie 	}
3604d7988d6SPaul Dagnelie 
3614d7988d6SPaul Dagnelie 	kmem_cache_destroy(metaslab_alloc_trace_cache);
3624d7988d6SPaul Dagnelie 	metaslab_alloc_trace_cache = NULL;
3634d7988d6SPaul Dagnelie }
3644d7988d6SPaul Dagnelie 
365fa9e4066Sahrens /*
366fa9e4066Sahrens  * ==========================================================================
367fa9e4066Sahrens  * Metaslab classes
368fa9e4066Sahrens  * ==========================================================================
369fa9e4066Sahrens  */
370fa9e4066Sahrens metaslab_class_t *
3710713e232SGeorge Wilson metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
372fa9e4066Sahrens {
373fa9e4066Sahrens 	metaslab_class_t *mc;
374fa9e4066Sahrens 
375fa9e4066Sahrens 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
376fa9e4066Sahrens 
37788ecc943SGeorge Wilson 	mc->mc_spa = spa;
378fa9e4066Sahrens 	mc->mc_rotor = NULL;
379d6e555bdSGeorge Wilson 	mc->mc_ops = ops;
3800f7643c7SGeorge Wilson 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
381af1d63abSPaul Dagnelie 	mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
382af1d63abSPaul Dagnelie 	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
383f78cdc34SPaul Dagnelie 	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
384e914ace2STim Schumacher 	    sizeof (zfs_refcount_t), KM_SLEEP);
385f78cdc34SPaul Dagnelie 	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
386f78cdc34SPaul Dagnelie 	    sizeof (uint64_t), KM_SLEEP);
387f78cdc34SPaul Dagnelie 	for (int i = 0; i < spa->spa_alloc_count; i++)
388e914ace2STim Schumacher 		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
389fa9e4066Sahrens 
390fa9e4066Sahrens 	return (mc);
391fa9e4066Sahrens }
392fa9e4066Sahrens 
393fa9e4066Sahrens void
394fa9e4066Sahrens metaslab_class_destroy(metaslab_class_t *mc)
395fa9e4066Sahrens {
396a1521560SJeff Bonwick 	ASSERT(mc->mc_rotor == NULL);
397a1521560SJeff Bonwick 	ASSERT(mc->mc_alloc == 0);
398a1521560SJeff Bonwick 	ASSERT(mc->mc_deferred == 0);
399a1521560SJeff Bonwick 	ASSERT(mc->mc_space == 0);
400a1521560SJeff Bonwick 	ASSERT(mc->mc_dspace == 0);
401fa9e4066Sahrens 
402f78cdc34SPaul Dagnelie 	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
403e914ace2STim Schumacher 		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
404f78cdc34SPaul Dagnelie 	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
405e914ace2STim Schumacher 	    sizeof (zfs_refcount_t));
406f78cdc34SPaul Dagnelie 	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
407f78cdc34SPaul Dagnelie 	    sizeof (uint64_t));
4080f7643c7SGeorge Wilson 	mutex_destroy(&mc->mc_lock);
409af1d63abSPaul Dagnelie 	multilist_destroy(mc->mc_metaslab_txg_list);
410fa9e4066Sahrens 	kmem_free(mc, sizeof (metaslab_class_t));
411fa9e4066Sahrens }
412fa9e4066Sahrens 
41388ecc943SGeorge Wilson int
41488ecc943SGeorge Wilson metaslab_class_validate(metaslab_class_t *mc)
41588ecc943SGeorge Wilson {
41688ecc943SGeorge Wilson 	metaslab_group_t *mg;
41788ecc943SGeorge Wilson 	vdev_t *vd;
41888ecc943SGeorge Wilson 
41988ecc943SGeorge Wilson 	/*
42088ecc943SGeorge Wilson 	 * Must hold one of the spa_config locks.
42188ecc943SGeorge Wilson 	 */
42288ecc943SGeorge Wilson 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
42388ecc943SGeorge Wilson 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
42488ecc943SGeorge Wilson 
42588ecc943SGeorge Wilson 	if ((mg = mc->mc_rotor) == NULL)
42688ecc943SGeorge Wilson 		return (0);
42788ecc943SGeorge Wilson 
42888ecc943SGeorge Wilson 	do {
42988ecc943SGeorge Wilson 		vd = mg->mg_vd;
43088ecc943SGeorge Wilson 		ASSERT(vd->vdev_mg != NULL);
43188ecc943SGeorge Wilson 		ASSERT3P(vd->vdev_top, ==, vd);
43288ecc943SGeorge Wilson 		ASSERT3P(mg->mg_class, ==, mc);
43388ecc943SGeorge Wilson 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
43488ecc943SGeorge Wilson 	} while ((mg = mg->mg_next) != mc->mc_rotor);
43588ecc943SGeorge Wilson 
43688ecc943SGeorge Wilson 	return (0);
43788ecc943SGeorge Wilson }
43888ecc943SGeorge Wilson 
439663207adSDon Brady static void
440b24ab676SJeff Bonwick metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
441b24ab676SJeff Bonwick     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
442b24ab676SJeff Bonwick {
443b24ab676SJeff Bonwick 	atomic_add_64(&mc->mc_alloc, alloc_delta);
444b24ab676SJeff Bonwick 	atomic_add_64(&mc->mc_deferred, defer_delta);
445b24ab676SJeff Bonwick 	atomic_add_64(&mc->mc_space, space_delta);
446b24ab676SJeff Bonwick 	atomic_add_64(&mc->mc_dspace, dspace_delta);
447b24ab676SJeff Bonwick }
448b24ab676SJeff Bonwick 
449b24ab676SJeff Bonwick uint64_t
450b24ab676SJeff Bonwick metaslab_class_get_alloc(metaslab_class_t *mc)
451b24ab676SJeff Bonwick {
452b24ab676SJeff Bonwick 	return (mc->mc_alloc);
453b24ab676SJeff Bonwick }
454b24ab676SJeff Bonwick 
455b24ab676SJeff Bonwick uint64_t
456b24ab676SJeff Bonwick metaslab_class_get_deferred(metaslab_class_t *mc)
457b24ab676SJeff Bonwick {
458b24ab676SJeff Bonwick 	return (mc->mc_deferred);
459b24ab676SJeff Bonwick }
460b24ab676SJeff Bonwick 
461b24ab676SJeff Bonwick uint64_t
462b24ab676SJeff Bonwick metaslab_class_get_space(metaslab_class_t *mc)
463b24ab676SJeff Bonwick {
464b24ab676SJeff Bonwick 	return (mc->mc_space);
465b24ab676SJeff Bonwick }
466b24ab676SJeff Bonwick 
467b24ab676SJeff Bonwick uint64_t
468b24ab676SJeff Bonwick metaslab_class_get_dspace(metaslab_class_t *mc)
469b24ab676SJeff Bonwick {
470b24ab676SJeff Bonwick 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
471b24ab676SJeff Bonwick }
472b24ab676SJeff Bonwick 
4732e4c9986SGeorge Wilson void
4742e4c9986SGeorge Wilson metaslab_class_histogram_verify(metaslab_class_t *mc)
4752e4c9986SGeorge Wilson {
476663207adSDon Brady 	spa_t *spa = mc->mc_spa;
477663207adSDon Brady 	vdev_t *rvd = spa->spa_root_vdev;
4782e4c9986SGeorge Wilson 	uint64_t *mc_hist;
4792e4c9986SGeorge Wilson 	int i;
4802e4c9986SGeorge Wilson 
4812e4c9986SGeorge Wilson 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
4822e4c9986SGeorge Wilson 		return;
4832e4c9986SGeorge Wilson 
4842e4c9986SGeorge Wilson 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
4852e4c9986SGeorge Wilson 	    KM_SLEEP);
4862e4c9986SGeorge Wilson 
4872e4c9986SGeorge Wilson 	for (int c = 0; c < rvd->vdev_children; c++) {
4882e4c9986SGeorge Wilson 		vdev_t *tvd = rvd->vdev_child[c];
4892e4c9986SGeorge Wilson 		metaslab_group_t *mg = tvd->vdev_mg;
4902e4c9986SGeorge Wilson 
4912e4c9986SGeorge Wilson 		/*
4922e4c9986SGeorge Wilson 		 * Skip any holes, uninitialized top-levels, or
4932e4c9986SGeorge Wilson 		 * vdevs that are not in this metaslab class.
4942e4c9986SGeorge Wilson 		 */
4955cabbc6bSPrashanth Sreenivasa 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
4962e4c9986SGeorge Wilson 		    mg->mg_class != mc) {
4972e4c9986SGeorge Wilson 			continue;
4982e4c9986SGeorge Wilson 		}
4992e4c9986SGeorge Wilson 
5002e4c9986SGeorge Wilson 		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
5012e4c9986SGeorge Wilson 			mc_hist[i] += mg->mg_histogram[i];
5022e4c9986SGeorge Wilson 	}
5032e4c9986SGeorge Wilson 
5042e4c9986SGeorge Wilson 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
5052e4c9986SGeorge Wilson 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
5062e4c9986SGeorge Wilson 
5072e4c9986SGeorge Wilson 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
5082e4c9986SGeorge Wilson }
5092e4c9986SGeorge Wilson 
5102e4c9986SGeorge Wilson /*
5112e4c9986SGeorge Wilson  * Calculate the metaslab class's fragmentation metric. The metric
5122e4c9986SGeorge Wilson  * is weighted based on the space contribution of each metaslab group.
5132e4c9986SGeorge Wilson  * The return value will be a number between 0 and 100 (inclusive), or
5142e4c9986SGeorge Wilson  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
5152e4c9986SGeorge Wilson  * zfs_frag_table for more information about the metric.
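 *
 * For example (hypothetical sizes), a class with a 1TB metaslab group at
 * 20% fragmentation and a 3TB group at 60% fragmentation reports
 * (20 * 1T + 60 * 3T) / 4T = 50%.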
5162e4c9986SGeorge Wilson  */
5172e4c9986SGeorge Wilson uint64_t
5182e4c9986SGeorge Wilson metaslab_class_fragmentation(metaslab_class_t *mc)
5192e4c9986SGeorge Wilson {
5202e4c9986SGeorge Wilson 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
5212e4c9986SGeorge Wilson 	uint64_t fragmentation = 0;
5222e4c9986SGeorge Wilson 
5232e4c9986SGeorge Wilson 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
5242e4c9986SGeorge Wilson 
5252e4c9986SGeorge Wilson 	for (int c = 0; c < rvd->vdev_children; c++) {
5262e4c9986SGeorge Wilson 		vdev_t *tvd = rvd->vdev_child[c];
5272e4c9986SGeorge Wilson 		metaslab_group_t *mg = tvd->vdev_mg;
5282e4c9986SGeorge Wilson 
5292e4c9986SGeorge Wilson 		/*
5305cabbc6bSPrashanth Sreenivasa 		 * Skip any holes, uninitialized top-levels,
5315cabbc6bSPrashanth Sreenivasa 		 * or vdevs that are not in this metaslab class.
5322e4c9986SGeorge Wilson 		 */
5335cabbc6bSPrashanth Sreenivasa 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
5342e4c9986SGeorge Wilson 		    mg->mg_class != mc) {
5352e4c9986SGeorge Wilson 			continue;
5362e4c9986SGeorge Wilson 		}
5372e4c9986SGeorge Wilson 
5382e4c9986SGeorge Wilson 		/*
5392e4c9986SGeorge Wilson 		 * If a metaslab group does not contain a fragmentation
5402e4c9986SGeorge Wilson 		 * metric then just bail out.
5412e4c9986SGeorge Wilson 		 */
5422e4c9986SGeorge Wilson 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
5432e4c9986SGeorge Wilson 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
5442e4c9986SGeorge Wilson 			return (ZFS_FRAG_INVALID);
5452e4c9986SGeorge Wilson 		}
5462e4c9986SGeorge Wilson 
5472e4c9986SGeorge Wilson 		/*
5482e4c9986SGeorge Wilson 		 * Determine how much this metaslab_group is contributing
5492e4c9986SGeorge Wilson 		 * to the overall pool fragmentation metric.
5502e4c9986SGeorge Wilson 		 */
5512e4c9986SGeorge Wilson 		fragmentation += mg->mg_fragmentation *
5522e4c9986SGeorge Wilson 		    metaslab_group_get_space(mg);
5532e4c9986SGeorge Wilson 	}
5542e4c9986SGeorge Wilson 	fragmentation /= metaslab_class_get_space(mc);
5552e4c9986SGeorge Wilson 
5562e4c9986SGeorge Wilson 	ASSERT3U(fragmentation, <=, 100);
5572e4c9986SGeorge Wilson 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
5582e4c9986SGeorge Wilson 	return (fragmentation);
5592e4c9986SGeorge Wilson }
5602e4c9986SGeorge Wilson 
5612e4c9986SGeorge Wilson /*
5622e4c9986SGeorge Wilson  * Calculate the amount of expandable space that is available in
5632e4c9986SGeorge Wilson  * this metaslab class. If a device is expanded then its expandable
5642e4c9986SGeorge Wilson  * space will be the amount of allocatable space that is currently not
5652e4c9986SGeorge Wilson  * part of this metaslab class.
5662e4c9986SGeorge Wilson  */
5672e4c9986SGeorge Wilson uint64_t
5682e4c9986SGeorge Wilson metaslab_class_expandable_space(metaslab_class_t *mc)
5692e4c9986SGeorge Wilson {
5702e4c9986SGeorge Wilson 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
5712e4c9986SGeorge Wilson 	uint64_t space = 0;
5722e4c9986SGeorge Wilson 
5732e4c9986SGeorge Wilson 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
5742e4c9986SGeorge Wilson 	for (int c = 0; c < rvd->vdev_children; c++) {
5757855d95bSToomas Soome 		uint64_t tspace;
5762e4c9986SGeorge Wilson 		vdev_t *tvd = rvd->vdev_child[c];
5772e4c9986SGeorge Wilson 		metaslab_group_t *mg = tvd->vdev_mg;
5782e4c9986SGeorge Wilson 
5795cabbc6bSPrashanth Sreenivasa 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
5802e4c9986SGeorge Wilson 		    mg->mg_class != mc) {
5812e4c9986SGeorge Wilson 			continue;
5822e4c9986SGeorge Wilson 		}
5832e4c9986SGeorge Wilson 
584c39a2aaeSGeorge Wilson 		/*
585c39a2aaeSGeorge Wilson 		 * Calculate if we have enough space to add additional
586c39a2aaeSGeorge Wilson 		 * metaslabs. We report the expandable space in terms
587c39a2aaeSGeorge Wilson 		 * of the metaslab size since that's the unit of expansion.
5887855d95bSToomas Soome 		 * Adjust by efi system partition size.
589c39a2aaeSGeorge Wilson 		 */
5907855d95bSToomas Soome 		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
5917855d95bSToomas Soome 		if (tspace > mc->mc_spa->spa_bootsize) {
5927855d95bSToomas Soome 			tspace -= mc->mc_spa->spa_bootsize;
5937855d95bSToomas Soome 		}
5947855d95bSToomas Soome 		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
5952e4c9986SGeorge Wilson 	}
5962e4c9986SGeorge Wilson 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
5972e4c9986SGeorge Wilson 	return (space);
5982e4c9986SGeorge Wilson }
5992e4c9986SGeorge Wilson 
600af1d63abSPaul Dagnelie void
601af1d63abSPaul Dagnelie metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
602af1d63abSPaul Dagnelie {
603af1d63abSPaul Dagnelie 	multilist_t *ml = mc->mc_metaslab_txg_list;
604af1d63abSPaul Dagnelie 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
605af1d63abSPaul Dagnelie 		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
606af1d63abSPaul Dagnelie 		metaslab_t *msp = multilist_sublist_head(mls);
607af1d63abSPaul Dagnelie 		multilist_sublist_unlock(mls);
608af1d63abSPaul Dagnelie 		while (msp != NULL) {
609af1d63abSPaul Dagnelie 			mutex_enter(&msp->ms_lock);
610af1d63abSPaul Dagnelie 
611af1d63abSPaul Dagnelie 			/*
612af1d63abSPaul Dagnelie 			 * If the metaslab has been removed from the list
613af1d63abSPaul Dagnelie 			 * (which could happen if we were at the memory limit
614af1d63abSPaul Dagnelie 			 * and it was evicted during this loop), then we can't
615af1d63abSPaul Dagnelie 			 * proceed and we should restart the sublist.
616af1d63abSPaul Dagnelie 			 */
617af1d63abSPaul Dagnelie 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
618af1d63abSPaul Dagnelie 				mutex_exit(&msp->ms_lock);
619af1d63abSPaul Dagnelie 				i--;
620af1d63abSPaul Dagnelie 				break;
621af1d63abSPaul Dagnelie 			}
622af1d63abSPaul Dagnelie 			mls = multilist_sublist_lock(ml, i);
623af1d63abSPaul Dagnelie 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
624af1d63abSPaul Dagnelie 			multilist_sublist_unlock(mls);
625af1d63abSPaul Dagnelie 			if (txg >
626af1d63abSPaul Dagnelie 			    msp->ms_selected_txg + metaslab_unload_delay &&
627af1d63abSPaul Dagnelie 			    gethrtime() > msp->ms_selected_time +
628af1d63abSPaul Dagnelie 			    (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
629af1d63abSPaul Dagnelie 				metaslab_evict(msp, txg);
630af1d63abSPaul Dagnelie 			} else {
631af1d63abSPaul Dagnelie 				/*
632af1d63abSPaul Dagnelie 				 * Once we've hit a metaslab selected too
633af1d63abSPaul Dagnelie 				 * recently to evict, we're done evicting for
634af1d63abSPaul Dagnelie 				 * now.
635af1d63abSPaul Dagnelie 				 */
636af1d63abSPaul Dagnelie 				mutex_exit(&msp->ms_lock);
637af1d63abSPaul Dagnelie 				break;
638af1d63abSPaul Dagnelie 			}
639af1d63abSPaul Dagnelie 			mutex_exit(&msp->ms_lock);
640af1d63abSPaul Dagnelie 			msp = next_msp;
641af1d63abSPaul Dagnelie 		}
642af1d63abSPaul Dagnelie 	}
643af1d63abSPaul Dagnelie }
644af1d63abSPaul Dagnelie 
645fa9e4066Sahrens static int
646fa9e4066Sahrens metaslab_compare(const void *x1, const void *x2)
647fa9e4066Sahrens {
648c4ab0d3fSGvozden Neskovic 	const metaslab_t *m1 = (const metaslab_t *)x1;
649c4ab0d3fSGvozden Neskovic 	const metaslab_t *m2 = (const metaslab_t *)x2;
650fa9e4066Sahrens 
651f78cdc34SPaul Dagnelie 	int sort1 = 0;
652f78cdc34SPaul Dagnelie 	int sort2 = 0;
653f78cdc34SPaul Dagnelie 	if (m1->ms_allocator != -1 && m1->ms_primary)
654f78cdc34SPaul Dagnelie 		sort1 = 1;
655f78cdc34SPaul Dagnelie 	else if (m1->ms_allocator != -1 && !m1->ms_primary)
656f78cdc34SPaul Dagnelie 		sort1 = 2;
657f78cdc34SPaul Dagnelie 	if (m2->ms_allocator != -1 && m2->ms_primary)
658f78cdc34SPaul Dagnelie 		sort2 = 1;
659f78cdc34SPaul Dagnelie 	else if (m2->ms_allocator != -1 && !m2->ms_primary)
660f78cdc34SPaul Dagnelie 		sort2 = 2;
661f78cdc34SPaul Dagnelie 
662f78cdc34SPaul Dagnelie 	/*
663f78cdc34SPaul Dagnelie 	 * Sort inactive metaslabs first, then primaries, then secondaries. When
664f78cdc34SPaul Dagnelie 	 * selecting a metaslab to allocate from, an allocator first tries its
665f78cdc34SPaul Dagnelie 	 * primary, then secondary active metaslab. If it doesn't have active
666f78cdc34SPaul Dagnelie 	 * metaslabs, or can't allocate from them, it searches for an inactive
667f78cdc34SPaul Dagnelie 	 * metaslab to activate. If it can't find a suitable one, it will steal
668f78cdc34SPaul Dagnelie 	 * a primary or secondary metaslab from another allocator.
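	 *
	 * Within each of those three groups, metaslabs sort by descending
	 * weight, with the start offset as the final tie-breaker.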
669f78cdc34SPaul Dagnelie 	 */
670f78cdc34SPaul Dagnelie 	if (sort1 < sort2)
671f78cdc34SPaul Dagnelie 		return (-1);
672f78cdc34SPaul Dagnelie 	if (sort1 > sort2)
673f78cdc34SPaul Dagnelie 		return (1);
674f78cdc34SPaul Dagnelie 
6754d7988d6SPaul Dagnelie 	int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
676c4ab0d3fSGvozden Neskovic 	if (likely(cmp))
677c4ab0d3fSGvozden Neskovic 		return (cmp);
678fa9e4066Sahrens 
6794d7988d6SPaul Dagnelie 	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
680fa9e4066Sahrens 
6814d7988d6SPaul Dagnelie 	return (TREE_CMP(m1->ms_start, m2->ms_start));
682fa9e4066Sahrens }
683fa9e4066Sahrens 
6848363e80aSGeorge Wilson /*
6858363e80aSGeorge Wilson  * ==========================================================================
6868363e80aSGeorge Wilson  * Metaslab groups
6878363e80aSGeorge Wilson  * ==========================================================================
6888363e80aSGeorge Wilson  */
68922e30981SGeorge Wilson /*
69022e30981SGeorge Wilson  * Update the allocatable flag and the metaslab group's capacity.
69122e30981SGeorge Wilson  * The allocatable flag is set to true if the capacity is below
6920f7643c7SGeorge Wilson  * the zfs_mg_noalloc_threshold or has a fragmentation value that is
6930f7643c7SGeorge Wilson  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
6940f7643c7SGeorge Wilson  * transitions from allocatable to non-allocatable or vice versa then the
6950f7643c7SGeorge Wilson  * metaslab group's class is updated to reflect the transition.
69622e30981SGeorge Wilson  */
69722e30981SGeorge Wilson static void
69822e30981SGeorge Wilson metaslab_group_alloc_update(metaslab_group_t *mg)
69922e30981SGeorge Wilson {
70022e30981SGeorge Wilson 	vdev_t *vd = mg->mg_vd;
70122e30981SGeorge Wilson 	metaslab_class_t *mc = mg->mg_class;
70222e30981SGeorge Wilson 	vdev_stat_t *vs = &vd->vdev_stat;
70322e30981SGeorge Wilson 	boolean_t was_allocatable;
7040f7643c7SGeorge Wilson 	boolean_t was_initialized;
70522e30981SGeorge Wilson 
70622e30981SGeorge Wilson 	ASSERT(vd == vd->vdev_top);
7075cabbc6bSPrashanth Sreenivasa 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
7085cabbc6bSPrashanth Sreenivasa 	    SCL_ALLOC);
70922e30981SGeorge Wilson 
71022e30981SGeorge Wilson 	mutex_enter(&mg->mg_lock);
71122e30981SGeorge Wilson 	was_allocatable = mg->mg_allocatable;
7120f7643c7SGeorge Wilson 	was_initialized = mg->mg_initialized;
71322e30981SGeorge Wilson 
71422e30981SGeorge Wilson 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
71522e30981SGeorge Wilson 	    (vs->vs_space + 1);
71622e30981SGeorge Wilson 
7170f7643c7SGeorge Wilson 	mutex_enter(&mc->mc_lock);
7180f7643c7SGeorge Wilson 
7190f7643c7SGeorge Wilson 	/*
7200f7643c7SGeorge Wilson 	 * If the metaslab group was just added then it won't
7210f7643c7SGeorge Wilson 	 * have any space until we finish syncing out this txg.
7220f7643c7SGeorge Wilson 	 * At that point we will consider it initialized and available
7230f7643c7SGeorge Wilson 	 * for allocations.  We also don't consider non-activated
7240f7643c7SGeorge Wilson 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
7250f7643c7SGeorge Wilson 	 * to be initialized, because they can't be used for allocation.
7260f7643c7SGeorge Wilson 	 */
7270f7643c7SGeorge Wilson 	mg->mg_initialized = metaslab_group_initialized(mg);
7280f7643c7SGeorge Wilson 	if (!was_initialized && mg->mg_initialized) {
7290f7643c7SGeorge Wilson 		mc->mc_groups++;
7300f7643c7SGeorge Wilson 	} else if (was_initialized && !mg->mg_initialized) {
7310f7643c7SGeorge Wilson 		ASSERT3U(mc->mc_groups, >, 0);
7320f7643c7SGeorge Wilson 		mc->mc_groups--;
7330f7643c7SGeorge Wilson 	}
7340f7643c7SGeorge Wilson 	if (mg->mg_initialized)
7350f7643c7SGeorge Wilson 		mg->mg_no_free_space = B_FALSE;
7360f7643c7SGeorge Wilson 
7372e4c9986SGeorge Wilson 	/*
7382e4c9986SGeorge Wilson 	 * A metaslab group is considered allocatable if it has plenty
7392e4c9986SGeorge Wilson 	 * of free space or is not heavily fragmented. We only take
7402e4c9986SGeorge Wilson 	 * fragmentation into account if the metaslab group has a valid
7412e4c9986SGeorge Wilson 	 * fragmentation metric (i.e. a value between 0 and 100).
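	 *
	 * For example, with the default thresholds (zfs_mg_noalloc_threshold
	 * of 0 and zfs_mg_fragmentation_threshold of 95), an activated group
	 * remains allocatable as long as it reports any free capacity and its
	 * fragmentation metric is at most 95.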
7422e4c9986SGeorge Wilson 	 */
7430f7643c7SGeorge Wilson 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
7440f7643c7SGeorge Wilson 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
7452e4c9986SGeorge Wilson 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
7462e4c9986SGeorge Wilson 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
74722e30981SGeorge Wilson 
74822e30981SGeorge Wilson 	/*
74922e30981SGeorge Wilson 	 * The mc_alloc_groups maintains a count of the number of
75022e30981SGeorge Wilson 	 * groups in this metaslab class that are still above the
75122e30981SGeorge Wilson 	 * zfs_mg_noalloc_threshold. This is used by the allocating
75222e30981SGeorge Wilson 	 * threads to determine if they should avoid allocations to
75322e30981SGeorge Wilson 	 * a given group. The allocator will avoid allocations to a group
75422e30981SGeorge Wilson 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
75522e30981SGeorge Wilson 	 * and there are still other groups that are above the threshold.
75622e30981SGeorge Wilson 	 * When a group transitions from allocatable to non-allocatable or
75722e30981SGeorge Wilson 	 * vice versa we update the metaslab class to reflect that change.
75822e30981SGeorge Wilson 	 * When the mc_alloc_groups value drops to 0 that means that all
75922e30981SGeorge Wilson 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
76022e30981SGeorge Wilson 	 * eligible for allocations. This effectively means that all devices
76122e30981SGeorge Wilson 	 * are balanced again.
76222e30981SGeorge Wilson 	 */
76322e30981SGeorge Wilson 	if (was_allocatable && !mg->mg_allocatable)
76422e30981SGeorge Wilson 		mc->mc_alloc_groups--;
76522e30981SGeorge Wilson 	else if (!was_allocatable && mg->mg_allocatable)
76622e30981SGeorge Wilson 		mc->mc_alloc_groups++;
7670f7643c7SGeorge Wilson 	mutex_exit(&mc->mc_lock);
7682e4c9986SGeorge Wilson 
76922e30981SGeorge Wilson 	mutex_exit(&mg->mg_lock);
77022e30981SGeorge Wilson }
77122e30981SGeorge Wilson 
772814dcd43SSerapheim Dimitropoulos int
773814dcd43SSerapheim Dimitropoulos metaslab_sort_by_flushed(const void *va, const void *vb)
774814dcd43SSerapheim Dimitropoulos {
775814dcd43SSerapheim Dimitropoulos 	const metaslab_t *a = va;
776814dcd43SSerapheim Dimitropoulos 	const metaslab_t *b = vb;
777814dcd43SSerapheim Dimitropoulos 
7784d7988d6SPaul Dagnelie 	int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
779814dcd43SSerapheim Dimitropoulos 	if (likely(cmp))
780814dcd43SSerapheim Dimitropoulos 		return (cmp);
781814dcd43SSerapheim Dimitropoulos 
782814dcd43SSerapheim Dimitropoulos 	uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
783814dcd43SSerapheim Dimitropoulos 	uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
7844d7988d6SPaul Dagnelie 	cmp = TREE_CMP(a_vdev_id, b_vdev_id);
785814dcd43SSerapheim Dimitropoulos 	if (cmp)
786814dcd43SSerapheim Dimitropoulos 		return (cmp);
787814dcd43SSerapheim Dimitropoulos 
7884d7988d6SPaul Dagnelie 	return (TREE_CMP(a->ms_id, b->ms_id));
789814dcd43SSerapheim Dimitropoulos }
790814dcd43SSerapheim Dimitropoulos 
791fa9e4066Sahrens metaslab_group_t *
792f78cdc34SPaul Dagnelie metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
793fa9e4066Sahrens {
794fa9e4066Sahrens 	metaslab_group_t *mg;
795fa9e4066Sahrens 
796fa9e4066Sahrens 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
797fa9e4066Sahrens 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
798084fd14fSBrian Behlendorf 	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
799084fd14fSBrian Behlendorf 	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
800f78cdc34SPaul Dagnelie 	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
801f78cdc34SPaul Dagnelie 	    KM_SLEEP);
802f78cdc34SPaul Dagnelie 	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
803f78cdc34SPaul Dagnelie 	    KM_SLEEP);
804fa9e4066Sahrens 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
805814dcd43SSerapheim Dimitropoulos 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
806fa9e4066Sahrens 	mg->mg_vd = vd;
807a1521560SJeff Bonwick 	mg->mg_class = mc;
808a1521560SJeff Bonwick 	mg->mg_activation_count = 0;
8090f7643c7SGeorge Wilson 	mg->mg_initialized = B_FALSE;
8100f7643c7SGeorge Wilson 	mg->mg_no_free_space = B_TRUE;
811f78cdc34SPaul Dagnelie 	mg->mg_allocators = allocators;
812f78cdc34SPaul Dagnelie 
813e914ace2STim Schumacher 	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
814e914ace2STim Schumacher 	    sizeof (zfs_refcount_t), KM_SLEEP);
815f78cdc34SPaul Dagnelie 	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
816f78cdc34SPaul Dagnelie 	    sizeof (uint64_t), KM_SLEEP);
817f78cdc34SPaul Dagnelie 	for (int i = 0; i < allocators; i++) {
818e914ace2STim Schumacher 		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
819f78cdc34SPaul Dagnelie 		mg->mg_cur_max_alloc_queue_depth[i] = 0;
820f78cdc34SPaul Dagnelie 	}
821fa9e4066Sahrens 
822be082110SGeorge Wilson 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
8230713e232SGeorge Wilson 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
8240713e232SGeorge Wilson 
825fa9e4066Sahrens 	return (mg);
826fa9e4066Sahrens }
827fa9e4066Sahrens 
828fa9e4066Sahrens void
829fa9e4066Sahrens metaslab_group_destroy(metaslab_group_t *mg)
830fa9e4066Sahrens {
831a1521560SJeff Bonwick 	ASSERT(mg->mg_prev == NULL);
832a1521560SJeff Bonwick 	ASSERT(mg->mg_next == NULL);
833a33cae98STim Haley 	/*
834a33cae98STim Haley 	 * We may have gone below zero with the activation count
835a33cae98STim Haley 	 * either because we never activated in the first place or
836a33cae98STim Haley 	 * because we're done, and possibly removing the vdev.
837a33cae98STim Haley 	 */
838a33cae98STim Haley 	ASSERT(mg->mg_activation_count <= 0);
839a1521560SJeff Bonwick 
840be082110SGeorge Wilson 	taskq_destroy(mg->mg_taskq);
841fa9e4066Sahrens 	avl_destroy(&mg->mg_metaslab_tree);
842f78cdc34SPaul Dagnelie 	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
843f78cdc34SPaul Dagnelie 	kmem_free(mg->mg_secondaries, mg->mg_allocators *
844f78cdc34SPaul Dagnelie 	    sizeof (metaslab_t *));
845fa9e4066Sahrens 	mutex_destroy(&mg->mg_lock);
846084fd14fSBrian Behlendorf 	mutex_destroy(&mg->mg_ms_disabled_lock);
847084fd14fSBrian Behlendorf 	cv_destroy(&mg->mg_ms_disabled_cv);
848f78cdc34SPaul Dagnelie 
849f78cdc34SPaul Dagnelie 	for (int i = 0; i < mg->mg_allocators; i++) {
850e914ace2STim Schumacher 		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
851f78cdc34SPaul Dagnelie 		mg->mg_cur_max_alloc_queue_depth[i] = 0;
852f78cdc34SPaul Dagnelie 	}
853f78cdc34SPaul Dagnelie 	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
854e914ace2STim Schumacher 	    sizeof (zfs_refcount_t));
855f78cdc34SPaul Dagnelie 	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
856f78cdc34SPaul Dagnelie 	    sizeof (uint64_t));
857f78cdc34SPaul Dagnelie 
858fa9e4066Sahrens 	kmem_free(mg, sizeof (metaslab_group_t));
859fa9e4066Sahrens }
860fa9e4066Sahrens 
861a1521560SJeff Bonwick void
862a1521560SJeff Bonwick metaslab_group_activate(metaslab_group_t *mg)
863a1521560SJeff Bonwick {
864a1521560SJeff Bonwick 	metaslab_class_t *mc = mg->mg_class;
865a1521560SJeff Bonwick 	metaslab_group_t *mgprev, *mgnext;
866a1521560SJeff Bonwick 
8675cabbc6bSPrashanth Sreenivasa 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
868a1521560SJeff Bonwick 
869a1521560SJeff Bonwick 	ASSERT(mc->mc_rotor != mg);
870a1521560SJeff Bonwick 	ASSERT(mg->mg_prev == NULL);
871a1521560SJeff Bonwick 	ASSERT(mg->mg_next == NULL);
872a1521560SJeff Bonwick 	ASSERT(mg->mg_activation_count <= 0);
873a1521560SJeff Bonwick 
874a1521560SJeff Bonwick 	if (++mg->mg_activation_count <= 0)
875a1521560SJeff Bonwick 		return;
876a1521560SJeff Bonwick 
877a1521560SJeff Bonwick 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
87822e30981SGeorge Wilson 	metaslab_group_alloc_update(mg);
879a1521560SJeff Bonwick 
880a1521560SJeff Bonwick 	if ((mgprev = mc->mc_rotor) == NULL) {
881a1521560SJeff Bonwick 		mg->mg_prev = mg;
882a1521560SJeff Bonwick 		mg->mg_next = mg;
883a1521560SJeff Bonwick 	} else {
884a1521560SJeff Bonwick 		mgnext = mgprev->mg_next;
885a1521560SJeff Bonwick 		mg->mg_prev = mgprev;
886a1521560SJeff Bonwick 		mg->mg_next = mgnext;
887a1521560SJeff Bonwick 		mgprev->mg_next = mg;
888a1521560SJeff Bonwick 		mgnext->mg_prev = mg;
889a1521560SJeff Bonwick 	}
890a1521560SJeff Bonwick 	mc->mc_rotor = mg;
891a1521560SJeff Bonwick }
892a1521560SJeff Bonwick 
8935cabbc6bSPrashanth Sreenivasa /*
8945cabbc6bSPrashanth Sreenivasa  * Passivate a metaslab group and remove it from the allocation rotor.
8955cabbc6bSPrashanth Sreenivasa  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
8965cabbc6bSPrashanth Sreenivasa  * a metaslab group. This function will momentarily drop spa_config_locks
8975cabbc6bSPrashanth Sreenivasa  * that are lower than the SCL_ALLOC lock (see comment below).
8985cabbc6bSPrashanth Sreenivasa  */
899a1521560SJeff Bonwick void
900a1521560SJeff Bonwick metaslab_group_passivate(metaslab_group_t *mg)
901a1521560SJeff Bonwick {
902a1521560SJeff Bonwick 	metaslab_class_t *mc = mg->mg_class;
9035cabbc6bSPrashanth Sreenivasa 	spa_t *spa = mc->mc_spa;
904a1521560SJeff Bonwick 	metaslab_group_t *mgprev, *mgnext;
9055cabbc6bSPrashanth Sreenivasa 	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
906a1521560SJeff Bonwick 
9075cabbc6bSPrashanth Sreenivasa 	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
9085cabbc6bSPrashanth Sreenivasa 	    (SCL_ALLOC | SCL_ZIO));
909a1521560SJeff Bonwick 
910a1521560SJeff Bonwick 	if (--mg->mg_activation_count != 0) {
911a1521560SJeff Bonwick 		ASSERT(mc->mc_rotor != mg);
912a1521560SJeff Bonwick 		ASSERT(mg->mg_prev == NULL);
913a1521560SJeff Bonwick 		ASSERT(mg->mg_next == NULL);
914a1521560SJeff Bonwick 		ASSERT(mg->mg_activation_count < 0);
915a1521560SJeff Bonwick 		return;
916a1521560SJeff Bonwick 	}
917a1521560SJeff Bonwick 
9185cabbc6bSPrashanth Sreenivasa 	/*
9195cabbc6bSPrashanth Sreenivasa 	 * The spa_config_lock is an array of rwlocks, ordered as
9205cabbc6bSPrashanth Sreenivasa 	 * follows (from highest to lowest):
9215cabbc6bSPrashanth Sreenivasa 	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
9225cabbc6bSPrashanth Sreenivasa 	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
9235cabbc6bSPrashanth Sreenivasa 	 * (For more information about the spa_config_lock see spa_misc.c)
9245cabbc6bSPrashanth Sreenivasa 	 * The higher the lock, the broader its coverage. When we passivate
9255cabbc6bSPrashanth Sreenivasa 	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
9265cabbc6bSPrashanth Sreenivasa 	 * config locks. However, the metaslab group's taskq might be trying
9275cabbc6bSPrashanth Sreenivasa 	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
9285cabbc6bSPrashanth Sreenivasa 	 * lower locks to allow the I/O to complete. At a minimum,
9295cabbc6bSPrashanth Sreenivasa 	 * we continue to hold the SCL_ALLOC lock, which prevents any future
9305cabbc6bSPrashanth Sreenivasa 	 * allocations from taking place and any changes to the vdev tree.
9315cabbc6bSPrashanth Sreenivasa 	 */
9325cabbc6bSPrashanth Sreenivasa 	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
9330713e232SGeorge Wilson 	taskq_wait(mg->mg_taskq);
9345cabbc6bSPrashanth Sreenivasa 	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
9352e4c9986SGeorge Wilson 	metaslab_group_alloc_update(mg);
936f78cdc34SPaul Dagnelie 	for (int i = 0; i < mg->mg_allocators; i++) {
937f78cdc34SPaul Dagnelie 		metaslab_t *msp = mg->mg_primaries[i];
938f78cdc34SPaul Dagnelie 		if (msp != NULL) {
939f78cdc34SPaul Dagnelie 			mutex_enter(&msp->ms_lock);
940f78cdc34SPaul Dagnelie 			metaslab_passivate(msp,
941f78cdc34SPaul Dagnelie 			    metaslab_weight_from_range_tree(msp));
942f78cdc34SPaul Dagnelie 			mutex_exit(&msp->ms_lock);
943f78cdc34SPaul Dagnelie 		}
944f78cdc34SPaul Dagnelie 		msp = mg->mg_secondaries[i];
945f78cdc34SPaul Dagnelie 		if (msp != NULL) {
946f78cdc34SPaul Dagnelie 			mutex_enter(&msp->ms_lock);
947f78cdc34SPaul Dagnelie 			metaslab_passivate(msp,
948f78cdc34SPaul Dagnelie 			    metaslab_weight_from_range_tree(msp));
949f78cdc34SPaul Dagnelie 			mutex_exit(&msp->ms_lock);
950f78cdc34SPaul Dagnelie 		}
951f78cdc34SPaul Dagnelie 	}
9520713e232SGeorge Wilson 
953a1521560SJeff Bonwick 	mgprev = mg->mg_prev;
954a1521560SJeff Bonwick 	mgnext = mg->mg_next;
955a1521560SJeff Bonwick 
956a1521560SJeff Bonwick 	if (mg == mgnext) {
957a1521560SJeff Bonwick 		mc->mc_rotor = NULL;
958a1521560SJeff Bonwick 	} else {
959a1521560SJeff Bonwick 		mc->mc_rotor = mgnext;
960a1521560SJeff Bonwick 		mgprev->mg_next = mgnext;
961a1521560SJeff Bonwick 		mgnext->mg_prev = mgprev;
962a1521560SJeff Bonwick 	}
963a1521560SJeff Bonwick 
964a1521560SJeff Bonwick 	mg->mg_prev = NULL;
965a1521560SJeff Bonwick 	mg->mg_next = NULL;
966a1521560SJeff Bonwick }
967a1521560SJeff Bonwick 
9680f7643c7SGeorge Wilson boolean_t
9690f7643c7SGeorge Wilson metaslab_group_initialized(metaslab_group_t *mg)
9700f7643c7SGeorge Wilson {
9710f7643c7SGeorge Wilson 	vdev_t *vd = mg->mg_vd;
9720f7643c7SGeorge Wilson 	vdev_stat_t *vs = &vd->vdev_stat;
9730f7643c7SGeorge Wilson 
9740f7643c7SGeorge Wilson 	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
9750f7643c7SGeorge Wilson }
9760f7643c7SGeorge Wilson 
9772e4c9986SGeorge Wilson uint64_t
9782e4c9986SGeorge Wilson metaslab_group_get_space(metaslab_group_t *mg)
9792e4c9986SGeorge Wilson {
9802e4c9986SGeorge Wilson 	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
9812e4c9986SGeorge Wilson }
9822e4c9986SGeorge Wilson 
9832e4c9986SGeorge Wilson void
9842e4c9986SGeorge Wilson metaslab_group_histogram_verify(metaslab_group_t *mg)
9852e4c9986SGeorge Wilson {
9862e4c9986SGeorge Wilson 	uint64_t *mg_hist;
9872e4c9986SGeorge Wilson 	vdev_t *vd = mg->mg_vd;
9882e4c9986SGeorge Wilson 	uint64_t ashift = vd->vdev_ashift;
9892e4c9986SGeorge Wilson 	int i;
9902e4c9986SGeorge Wilson 
9912e4c9986SGeorge Wilson 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
9922e4c9986SGeorge Wilson 		return;
9932e4c9986SGeorge Wilson 
9942e4c9986SGeorge Wilson 	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
9952e4c9986SGeorge Wilson 	    KM_SLEEP);
9962e4c9986SGeorge Wilson 
9972e4c9986SGeorge Wilson 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
9982e4c9986SGeorge Wilson 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
9992e4c9986SGeorge Wilson 
10002e4c9986SGeorge Wilson 	for (int m = 0; m < vd->vdev_ms_count; m++) {
10012e4c9986SGeorge Wilson 		metaslab_t *msp = vd->vdev_ms[m];
10022e4c9986SGeorge Wilson 
1003663207adSDon Brady 		/* skip if not active or not a member */
1004663207adSDon Brady 		if (msp->ms_sm == NULL || msp->ms_group != mg)
10052e4c9986SGeorge Wilson 			continue;
10062e4c9986SGeorge Wilson 
10072e4c9986SGeorge Wilson 		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
10082e4c9986SGeorge Wilson 			mg_hist[i + ashift] +=
10092e4c9986SGeorge Wilson 			    msp->ms_sm->sm_phys->smp_histogram[i];
10102e4c9986SGeorge Wilson 	}
10112e4c9986SGeorge Wilson 
10122e4c9986SGeorge Wilson 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
10132e4c9986SGeorge Wilson 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
10142e4c9986SGeorge Wilson 
10152e4c9986SGeorge Wilson 	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
10162e4c9986SGeorge Wilson }
10172e4c9986SGeorge Wilson 
1018ecc2d604Sbonwick static void
10192e4c9986SGeorge Wilson metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
10202e4c9986SGeorge Wilson {
10212e4c9986SGeorge Wilson 	metaslab_class_t *mc = mg->mg_class;
10222e4c9986SGeorge Wilson 	uint64_t ashift = mg->mg_vd->vdev_ashift;
10232e4c9986SGeorge Wilson 
10242e4c9986SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
10252e4c9986SGeorge Wilson 	if (msp->ms_sm == NULL)
10262e4c9986SGeorge Wilson 		return;
10272e4c9986SGeorge Wilson 
10282e4c9986SGeorge Wilson 	mutex_enter(&mg->mg_lock);
10292e4c9986SGeorge Wilson 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
10302e4c9986SGeorge Wilson 		mg->mg_histogram[i + ashift] +=
10312e4c9986SGeorge Wilson 		    msp->ms_sm->sm_phys->smp_histogram[i];
10322e4c9986SGeorge Wilson 		mc->mc_histogram[i + ashift] +=
10332e4c9986SGeorge Wilson 		    msp->ms_sm->sm_phys->smp_histogram[i];
10342e4c9986SGeorge Wilson 	}
10352e4c9986SGeorge Wilson 	mutex_exit(&mg->mg_lock);
10362e4c9986SGeorge Wilson }
10372e4c9986SGeorge Wilson 
10382e4c9986SGeorge Wilson void
10392e4c9986SGeorge Wilson metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
1040fa9e4066Sahrens {
10412e4c9986SGeorge Wilson 	metaslab_class_t *mc = mg->mg_class;
10422e4c9986SGeorge Wilson 	uint64_t ashift = mg->mg_vd->vdev_ashift;
10432e4c9986SGeorge Wilson 
10442e4c9986SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
10452e4c9986SGeorge Wilson 	if (msp->ms_sm == NULL)
10462e4c9986SGeorge Wilson 		return;
10472e4c9986SGeorge Wilson 
1048fa9e4066Sahrens 	mutex_enter(&mg->mg_lock);
10492e4c9986SGeorge Wilson 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
10502e4c9986SGeorge Wilson 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
10512e4c9986SGeorge Wilson 		    msp->ms_sm->sm_phys->smp_histogram[i]);
10522e4c9986SGeorge Wilson 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
10532e4c9986SGeorge Wilson 		    msp->ms_sm->sm_phys->smp_histogram[i]);
10542e4c9986SGeorge Wilson 
10552e4c9986SGeorge Wilson 		mg->mg_histogram[i + ashift] -=
10562e4c9986SGeorge Wilson 		    msp->ms_sm->sm_phys->smp_histogram[i];
10572e4c9986SGeorge Wilson 		mc->mc_histogram[i + ashift] -=
10582e4c9986SGeorge Wilson 		    msp->ms_sm->sm_phys->smp_histogram[i];
10592e4c9986SGeorge Wilson 	}
10602e4c9986SGeorge Wilson 	mutex_exit(&mg->mg_lock);
10612e4c9986SGeorge Wilson }
10622e4c9986SGeorge Wilson 
10632e4c9986SGeorge Wilson static void
10642e4c9986SGeorge Wilson metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
10652e4c9986SGeorge Wilson {
1066fa9e4066Sahrens 	ASSERT(msp->ms_group == NULL);
10672e4c9986SGeorge Wilson 	mutex_enter(&mg->mg_lock);
1068fa9e4066Sahrens 	msp->ms_group = mg;
1069ecc2d604Sbonwick 	msp->ms_weight = 0;
1070fa9e4066Sahrens 	avl_add(&mg->mg_metaslab_tree, msp);
1071fa9e4066Sahrens 	mutex_exit(&mg->mg_lock);
10722e4c9986SGeorge Wilson 
10732e4c9986SGeorge Wilson 	mutex_enter(&msp->ms_lock);
10742e4c9986SGeorge Wilson 	metaslab_group_histogram_add(mg, msp);
10752e4c9986SGeorge Wilson 	mutex_exit(&msp->ms_lock);
1076fa9e4066Sahrens }
1077fa9e4066Sahrens 
1078ecc2d604Sbonwick static void
1079fa9e4066Sahrens metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
1080fa9e4066Sahrens {
10812e4c9986SGeorge Wilson 	mutex_enter(&msp->ms_lock);
10822e4c9986SGeorge Wilson 	metaslab_group_histogram_remove(mg, msp);
10832e4c9986SGeorge Wilson 	mutex_exit(&msp->ms_lock);
10842e4c9986SGeorge Wilson 
1085fa9e4066Sahrens 	mutex_enter(&mg->mg_lock);
1086fa9e4066Sahrens 	ASSERT(msp->ms_group == mg);
1087fa9e4066Sahrens 	avl_remove(&mg->mg_metaslab_tree, msp);
1088af1d63abSPaul Dagnelie 
1089af1d63abSPaul Dagnelie 	metaslab_class_t *mc = msp->ms_group->mg_class;
1090af1d63abSPaul Dagnelie 	multilist_sublist_t *mls =
1091af1d63abSPaul Dagnelie 	    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
1092af1d63abSPaul Dagnelie 	if (multilist_link_active(&msp->ms_class_txg_node))
1093af1d63abSPaul Dagnelie 		multilist_sublist_remove(mls, msp);
1094af1d63abSPaul Dagnelie 	multilist_sublist_unlock(mls);
1095af1d63abSPaul Dagnelie 
1096fa9e4066Sahrens 	msp->ms_group = NULL;
1097fa9e4066Sahrens 	mutex_exit(&mg->mg_lock);
1098fa9e4066Sahrens }
1099fa9e4066Sahrens 
1100f78cdc34SPaul Dagnelie static void
1101f78cdc34SPaul Dagnelie metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1102f78cdc34SPaul Dagnelie {
1103af1d63abSPaul Dagnelie 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1104f78cdc34SPaul Dagnelie 	ASSERT(MUTEX_HELD(&mg->mg_lock));
1105f78cdc34SPaul Dagnelie 	ASSERT(msp->ms_group == mg);
1106af1d63abSPaul Dagnelie 
1107f78cdc34SPaul Dagnelie 	avl_remove(&mg->mg_metaslab_tree, msp);
1108f78cdc34SPaul Dagnelie 	msp->ms_weight = weight;
1109f78cdc34SPaul Dagnelie 	avl_add(&mg->mg_metaslab_tree, msp);
1110f78cdc34SPaul Dagnelie 
1111f78cdc34SPaul Dagnelie }
1112f78cdc34SPaul Dagnelie 
1113ecc2d604Sbonwick static void
1114fa9e4066Sahrens metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1115fa9e4066Sahrens {
11165f5f7a6fSahrens 	/*
11175f5f7a6fSahrens 	 * Although in principle the weight can be any value, in
11182e4c9986SGeorge Wilson 	 * practice we do not use values in the range [1, 511].
11195f5f7a6fSahrens 	 */
11202e4c9986SGeorge Wilson 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1121ecc2d604Sbonwick 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1122ecc2d604Sbonwick 
1123fa9e4066Sahrens 	mutex_enter(&mg->mg_lock);
1124f78cdc34SPaul Dagnelie 	metaslab_group_sort_impl(mg, msp, weight);
1125fa9e4066Sahrens 	mutex_exit(&mg->mg_lock);
1126fa9e4066Sahrens }
1127fa9e4066Sahrens 
11282e4c9986SGeorge Wilson /*
11292e4c9986SGeorge Wilson  * Calculate the fragmentation for a given metaslab group. We can use
11302e4c9986SGeorge Wilson  * a simple average here since all metaslabs within the group must have
11312e4c9986SGeorge Wilson  * the same size. The return value will be a value between 0 and 100
11322e4c9986SGeorge Wilson  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
11332e4c9986SGeorge Wilson  * group have a fragmentation metric.
11342e4c9986SGeorge Wilson  */
11352e4c9986SGeorge Wilson uint64_t
11362e4c9986SGeorge Wilson metaslab_group_fragmentation(metaslab_group_t *mg)
11372e4c9986SGeorge Wilson {
11382e4c9986SGeorge Wilson 	vdev_t *vd = mg->mg_vd;
11392e4c9986SGeorge Wilson 	uint64_t fragmentation = 0;
11402e4c9986SGeorge Wilson 	uint64_t valid_ms = 0;
11412e4c9986SGeorge Wilson 
11422e4c9986SGeorge Wilson 	for (int m = 0; m < vd->vdev_ms_count; m++) {
11432e4c9986SGeorge Wilson 		metaslab_t *msp = vd->vdev_ms[m];
11442e4c9986SGeorge Wilson 
11452e4c9986SGeorge Wilson 		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
11462e4c9986SGeorge Wilson 			continue;
1147663207adSDon Brady 		if (msp->ms_group != mg)
1148663207adSDon Brady 			continue;
11492e4c9986SGeorge Wilson 
11502e4c9986SGeorge Wilson 		valid_ms++;
11512e4c9986SGeorge Wilson 		fragmentation += msp->ms_fragmentation;
11522e4c9986SGeorge Wilson 	}
11532e4c9986SGeorge Wilson 
1154663207adSDon Brady 	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
11552e4c9986SGeorge Wilson 		return (ZFS_FRAG_INVALID);
11562e4c9986SGeorge Wilson 
11572e4c9986SGeorge Wilson 	fragmentation /= valid_ms;
11582e4c9986SGeorge Wilson 	ASSERT3U(fragmentation, <=, 100);
11592e4c9986SGeorge Wilson 	return (fragmentation);
11602e4c9986SGeorge Wilson }
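
/*
 * Worked example (hypothetical numbers): a top-level vdev with 100 metaslabs
 * in this group where only 40 have a valid ms_fragmentation yields
 * ZFS_FRAG_INVALID, since 40 <= 100 / 2. With 60 valid metaslabs whose
 * metrics sum to 3000, the group reports a fragmentation of 3000 / 60 = 50.
 */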
11612e4c9986SGeorge Wilson 
116222e30981SGeorge Wilson /*
116322e30981SGeorge Wilson  * Determine if a given metaslab group should skip allocations. A metaslab
11642e4c9986SGeorge Wilson  * group should avoid allocations if its free capacity is less than the
11652e4c9986SGeorge Wilson  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
11662e4c9986SGeorge Wilson  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
11670f7643c7SGeorge Wilson  * that can still handle allocations. If the allocation throttle is enabled
11680f7643c7SGeorge Wilson  * then we skip allocations to devices that have reached their maximum
11690f7643c7SGeorge Wilson  * allocation queue depth unless the selected metaslab group is the only
11700f7643c7SGeorge Wilson  * eligible group remaining.
117122e30981SGeorge Wilson  */
117222e30981SGeorge Wilson static boolean_t
11730f7643c7SGeorge Wilson metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1174dbcaafbdSAlexander Motin     uint64_t psize, int allocator, int d)
117522e30981SGeorge Wilson {
11760f7643c7SGeorge Wilson 	spa_t *spa = mg->mg_vd->vdev_spa;
117722e30981SGeorge Wilson 	metaslab_class_t *mc = mg->mg_class;
117822e30981SGeorge Wilson 
117922e30981SGeorge Wilson 	/*
11800f7643c7SGeorge Wilson 	 * We can only consider skipping this metaslab group if it's
11810f7643c7SGeorge Wilson 	 * in the normal metaslab class and there are other metaslab
11820f7643c7SGeorge Wilson 	 * groups to select from. Otherwise, we always consider it eligible
11832e4c9986SGeorge Wilson 	 * for allocations.
118422e30981SGeorge Wilson 	 */
1185663207adSDon Brady 	if ((mc != spa_normal_class(spa) &&
1186663207adSDon Brady 	    mc != spa_special_class(spa) &&
1187663207adSDon Brady 	    mc != spa_dedup_class(spa)) ||
1188663207adSDon Brady 	    mc->mc_groups <= 1)
11890f7643c7SGeorge Wilson 		return (B_TRUE);
11900f7643c7SGeorge Wilson 
11910f7643c7SGeorge Wilson 	/*
11920f7643c7SGeorge Wilson 	 * If the metaslab group's mg_allocatable flag is set (see comments
11930f7643c7SGeorge Wilson 	 * in metaslab_group_alloc_update() for more information) and
11940f7643c7SGeorge Wilson 	 * the allocation throttle is disabled then allow allocations to this
11950f7643c7SGeorge Wilson 	 * device. However, if the allocation throttle is enabled then
11960f7643c7SGeorge Wilson 	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
11970f7643c7SGeorge Wilson 	 * to determine if we should allow allocations to this metaslab group.
11980f7643c7SGeorge Wilson 	 * If all metaslab groups are no longer considered allocatable
11990f7643c7SGeorge Wilson 	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
12000f7643c7SGeorge Wilson 	 * gang block size then we allow allocations on this metaslab group
12010f7643c7SGeorge Wilson 	 * regardless of the mg_allocatable or throttle settings.
12020f7643c7SGeorge Wilson 	 */
12030f7643c7SGeorge Wilson 	if (mg->mg_allocatable) {
12040f7643c7SGeorge Wilson 		metaslab_group_t *mgp;
12050f7643c7SGeorge Wilson 		int64_t qdepth;
1206f78cdc34SPaul Dagnelie 		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
12070f7643c7SGeorge Wilson 
12080f7643c7SGeorge Wilson 		if (!mc->mc_alloc_throttle_enabled)
12090f7643c7SGeorge Wilson 			return (B_TRUE);
12100f7643c7SGeorge Wilson 
12110f7643c7SGeorge Wilson 		/*
12120f7643c7SGeorge Wilson 		 * If this metaslab group does not have any free space, then
12130f7643c7SGeorge Wilson 		 * there is no point in looking further.
12140f7643c7SGeorge Wilson 		 */
12150f7643c7SGeorge Wilson 		if (mg->mg_no_free_space)
12160f7643c7SGeorge Wilson 			return (B_FALSE);
12170f7643c7SGeorge Wilson 
1218dbcaafbdSAlexander Motin 		/*
1219dbcaafbdSAlexander Motin 		 * Relax allocation throttling for ditto blocks.  Due to
1220dbcaafbdSAlexander Motin 		 * random imbalances in allocation, copies tend to pile up
1221dbcaafbdSAlexander Motin 		 * on whichever vdev looks a bit better at the moment.
1222dbcaafbdSAlexander Motin 		 */
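		/*
		 * For example (a sketch assuming d is the index of the DVA
		 * copy being allocated): the first copy (d == 0) keeps its
		 * queue limit at qmax, the second copy (d == 1) is allowed
		 * 5/4 of it, and the third copy (d == 2) 6/4, i.e. 1.5x the
		 * configured depth.
		 */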
1223dbcaafbdSAlexander Motin 		qmax = qmax * (4 + d) / 4;
1224dbcaafbdSAlexander Motin 
1225e914ace2STim Schumacher 		qdepth = zfs_refcount_count(
1226e914ace2STim Schumacher 		    &mg->mg_alloc_queue_depth[allocator]);
12270f7643c7SGeorge Wilson 
12280f7643c7SGeorge Wilson 		/*
12290f7643c7SGeorge Wilson 		 * If this metaslab group is below its qmax or it's
12300f7643c7SGeorge Wilson 		 * the only allocatable metaslab group, then attempt
12310f7643c7SGeorge Wilson 		 * to allocate from it.
12320f7643c7SGeorge Wilson 		 */
12330f7643c7SGeorge Wilson 		if (qdepth < qmax || mc->mc_alloc_groups == 1)
12340f7643c7SGeorge Wilson 			return (B_TRUE);
12350f7643c7SGeorge Wilson 		ASSERT3U(mc->mc_alloc_groups, >, 1);
12360f7643c7SGeorge Wilson 
12370f7643c7SGeorge Wilson 		/*
12380f7643c7SGeorge Wilson 		 * Since this metaslab group is at or over its qmax, we
12390f7643c7SGeorge Wilson 		 * need to determine if there are metaslab groups after this
12400f7643c7SGeorge Wilson 		 * one that might be able to handle this allocation. This is
12410f7643c7SGeorge Wilson 		 * racy since we can't hold the locks for all metaslab
12420f7643c7SGeorge Wilson 		 * groups at the same time when we make this check.
12430f7643c7SGeorge Wilson 		 */
12440f7643c7SGeorge Wilson 		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1245f78cdc34SPaul Dagnelie 			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
1246dbcaafbdSAlexander Motin 			qmax = qmax * (4 + d) / 4;
1247e914ace2STim Schumacher 			qdepth = zfs_refcount_count(
1248f78cdc34SPaul Dagnelie 			    &mgp->mg_alloc_queue_depth[allocator]);
12490f7643c7SGeorge Wilson 
12500f7643c7SGeorge Wilson 			/*
12510f7643c7SGeorge Wilson 			 * If there is another metaslab group that
12520f7643c7SGeorge Wilson 			 * might be able to handle the allocation, then
12530f7643c7SGeorge Wilson 			 * we return false so that we skip this group.
12540f7643c7SGeorge Wilson 			 */
12550f7643c7SGeorge Wilson 			if (qdepth < qmax && !mgp->mg_no_free_space)
12560f7643c7SGeorge Wilson 				return (B_FALSE);
12570f7643c7SGeorge Wilson 		}
12580f7643c7SGeorge Wilson 
12590f7643c7SGeorge Wilson 		/*
12600f7643c7SGeorge Wilson 		 * We didn't find another group to handle the allocation
12610f7643c7SGeorge Wilson 		 * so we can't skip this metaslab group even though
12620f7643c7SGeorge Wilson 		 * we are at or over our qmax.
12630f7643c7SGeorge Wilson 		 */
12640f7643c7SGeorge Wilson 		return (B_TRUE);
12650f7643c7SGeorge Wilson 
12660f7643c7SGeorge Wilson 	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
12670f7643c7SGeorge Wilson 		return (B_TRUE);
12680f7643c7SGeorge Wilson 	}
12690f7643c7SGeorge Wilson 	return (B_FALSE);
127022e30981SGeorge Wilson }
127122e30981SGeorge Wilson 
127280eb36f2SGeorge Wilson /*
127380eb36f2SGeorge Wilson  * ==========================================================================
12740713e232SGeorge Wilson  * Range tree callbacks
127580eb36f2SGeorge Wilson  * ==========================================================================
127680eb36f2SGeorge Wilson  */
12770713e232SGeorge Wilson 
12780713e232SGeorge Wilson /*
12794d7988d6SPaul Dagnelie  * Comparison function for the private size-ordered tree using 32-bit
12804d7988d6SPaul Dagnelie  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
12814d7988d6SPaul Dagnelie  */
12824d7988d6SPaul Dagnelie static int
12834d7988d6SPaul Dagnelie metaslab_rangesize32_compare(const void *x1, const void *x2)
12844d7988d6SPaul Dagnelie {
12854d7988d6SPaul Dagnelie 	const range_seg32_t *r1 = x1;
12864d7988d6SPaul Dagnelie 	const range_seg32_t *r2 = x2;
12874d7988d6SPaul Dagnelie 
12884d7988d6SPaul Dagnelie 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
12894d7988d6SPaul Dagnelie 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
12904d7988d6SPaul Dagnelie 
12914d7988d6SPaul Dagnelie 	int cmp = TREE_CMP(rs_size1, rs_size2);
12924d7988d6SPaul Dagnelie 	if (likely(cmp))
12934d7988d6SPaul Dagnelie 		return (cmp);
12944d7988d6SPaul Dagnelie 
12954d7988d6SPaul Dagnelie 	return (TREE_CMP(r1->rs_start, r2->rs_start));
12964d7988d6SPaul Dagnelie }
12974d7988d6SPaul Dagnelie 
12984d7988d6SPaul Dagnelie /*
12994d7988d6SPaul Dagnelie  * Comparison function for the private size-ordered tree using 64-bit
13004d7988d6SPaul Dagnelie  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
13010713e232SGeorge Wilson  */
130280eb36f2SGeorge Wilson static int
13034d7988d6SPaul Dagnelie metaslab_rangesize64_compare(const void *x1, const void *x2)
130480eb36f2SGeorge Wilson {
13054d7988d6SPaul Dagnelie 	const range_seg64_t *r1 = x1;
13064d7988d6SPaul Dagnelie 	const range_seg64_t *r2 = x2;
13074d7988d6SPaul Dagnelie 
13080713e232SGeorge Wilson 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
13090713e232SGeorge Wilson 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
131080eb36f2SGeorge Wilson 
13114d7988d6SPaul Dagnelie 	int cmp = TREE_CMP(rs_size1, rs_size2);
1312c4ab0d3fSGvozden Neskovic 	if (likely(cmp))
1313c4ab0d3fSGvozden Neskovic 		return (cmp);
13140713e232SGeorge Wilson 
13154d7988d6SPaul Dagnelie 	return (TREE_CMP(r1->rs_start, r2->rs_start));
13164d7988d6SPaul Dagnelie }
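
/*
 * For example, under either comparator a segment covering [0x1000, 0x3000)
 * (8K) sorts before one covering [0x80000, 0xa0000) (128K), and two segments
 * of equal size fall back to ordering by starting offset, so zfs_btree_last()
 * on the size tree always yields the largest free segment.
 */
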
13174d7988d6SPaul Dagnelie typedef struct metaslab_rt_arg {
13184d7988d6SPaul Dagnelie 	zfs_btree_t *mra_bt;
13194d7988d6SPaul Dagnelie 	uint32_t mra_floor_shift;
13204d7988d6SPaul Dagnelie } metaslab_rt_arg_t;
13214d7988d6SPaul Dagnelie 
13224d7988d6SPaul Dagnelie struct mssa_arg {
13234d7988d6SPaul Dagnelie 	range_tree_t *rt;
13244d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mra;
13254d7988d6SPaul Dagnelie };
13264d7988d6SPaul Dagnelie 
13274d7988d6SPaul Dagnelie static void
13284d7988d6SPaul Dagnelie metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
13294d7988d6SPaul Dagnelie {
13304d7988d6SPaul Dagnelie 	struct mssa_arg *mssap = arg;
13314d7988d6SPaul Dagnelie 	range_tree_t *rt = mssap->rt;
13324d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mrap = mssap->mra;
13334d7988d6SPaul Dagnelie 	range_seg_max_t seg = {0};
13344d7988d6SPaul Dagnelie 	rs_set_start(&seg, rt, start);
13354d7988d6SPaul Dagnelie 	rs_set_end(&seg, rt, start + size);
13364d7988d6SPaul Dagnelie 	metaslab_rt_add(rt, &seg, mrap);
13374d7988d6SPaul Dagnelie }
13384d7988d6SPaul Dagnelie 
13394d7988d6SPaul Dagnelie static void
13404d7988d6SPaul Dagnelie metaslab_size_tree_full_load(range_tree_t *rt)
13414d7988d6SPaul Dagnelie {
13424d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mrap = rt->rt_arg;
13434d7988d6SPaul Dagnelie #ifdef _METASLAB_TRACING
13444d7988d6SPaul Dagnelie 	METASLABSTAT_BUMP(metaslabstat_reload_tree);
13454d7988d6SPaul Dagnelie #endif
13464d7988d6SPaul Dagnelie 	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
13474d7988d6SPaul Dagnelie 	mrap->mra_floor_shift = 0;
13484d7988d6SPaul Dagnelie 	struct mssa_arg arg = {0};
13494d7988d6SPaul Dagnelie 	arg.rt = rt;
13504d7988d6SPaul Dagnelie 	arg.mra = mrap;
13514d7988d6SPaul Dagnelie 	range_tree_walk(rt, metaslab_size_sorted_add, &arg);
13524d7988d6SPaul Dagnelie }
13534d7988d6SPaul Dagnelie 
13544d7988d6SPaul Dagnelie /*
13554d7988d6SPaul Dagnelie  * rely on using both a size-ordered zfs_btree_t and an array of uint64_t's.
13564d7988d6SPaul Dagnelie  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
13574d7988d6SPaul Dagnelie  */
13584d7988d6SPaul Dagnelie /* ARGSUSED */
13594d7988d6SPaul Dagnelie static void
13604d7988d6SPaul Dagnelie metaslab_rt_create(range_tree_t *rt, void *arg)
13614d7988d6SPaul Dagnelie {
13624d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mrap = arg;
13634d7988d6SPaul Dagnelie 	zfs_btree_t *size_tree = mrap->mra_bt;
13644d7988d6SPaul Dagnelie 
13654d7988d6SPaul Dagnelie 	size_t size;
13664d7988d6SPaul Dagnelie 	int (*compare) (const void *, const void *);
13674d7988d6SPaul Dagnelie 	switch (rt->rt_type) {
13684d7988d6SPaul Dagnelie 	case RANGE_SEG32:
13694d7988d6SPaul Dagnelie 		size = sizeof (range_seg32_t);
13704d7988d6SPaul Dagnelie 		compare = metaslab_rangesize32_compare;
13714d7988d6SPaul Dagnelie 		break;
13724d7988d6SPaul Dagnelie 	case RANGE_SEG64:
13734d7988d6SPaul Dagnelie 		size = sizeof (range_seg64_t);
13744d7988d6SPaul Dagnelie 		compare = metaslab_rangesize64_compare;
13754d7988d6SPaul Dagnelie 		break;
13764d7988d6SPaul Dagnelie 	default:
13774d7988d6SPaul Dagnelie 		panic("Invalid range seg type %d", rt->rt_type);
13784d7988d6SPaul Dagnelie 	}
13794d7988d6SPaul Dagnelie 	zfs_btree_create(size_tree, compare, size);
13804d7988d6SPaul Dagnelie 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
13814d7988d6SPaul Dagnelie }
13824d7988d6SPaul Dagnelie 
13834d7988d6SPaul Dagnelie /* ARGSUSED */
13844d7988d6SPaul Dagnelie static void
13854d7988d6SPaul Dagnelie metaslab_rt_destroy(range_tree_t *rt, void *arg)
13864d7988d6SPaul Dagnelie {
13874d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mrap = arg;
13884d7988d6SPaul Dagnelie 	zfs_btree_t *size_tree = mrap->mra_bt;
13894d7988d6SPaul Dagnelie 
13904d7988d6SPaul Dagnelie 	zfs_btree_destroy(size_tree);
13914d7988d6SPaul Dagnelie 	kmem_free(mrap, sizeof (*mrap));
13924d7988d6SPaul Dagnelie }
13934d7988d6SPaul Dagnelie 
13944d7988d6SPaul Dagnelie /* ARGSUSED */
13954d7988d6SPaul Dagnelie static void
13964d7988d6SPaul Dagnelie metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
13974d7988d6SPaul Dagnelie {
13984d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mrap = arg;
13994d7988d6SPaul Dagnelie 	zfs_btree_t *size_tree = mrap->mra_bt;
14004d7988d6SPaul Dagnelie 
14014d7988d6SPaul Dagnelie 	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
14024d7988d6SPaul Dagnelie 	    (1 << mrap->mra_floor_shift))
14034d7988d6SPaul Dagnelie 		return;
14044d7988d6SPaul Dagnelie 
14054d7988d6SPaul Dagnelie 	zfs_btree_add(size_tree, rs);
14064d7988d6SPaul Dagnelie }
14074d7988d6SPaul Dagnelie 
14084d7988d6SPaul Dagnelie /* ARGSUSED */
14094d7988d6SPaul Dagnelie static void
14104d7988d6SPaul Dagnelie metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
14114d7988d6SPaul Dagnelie {
14124d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mrap = arg;
14134d7988d6SPaul Dagnelie 	zfs_btree_t *size_tree = mrap->mra_bt;
14144d7988d6SPaul Dagnelie 
14154d7988d6SPaul Dagnelie 	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 <<
14164d7988d6SPaul Dagnelie 	    mrap->mra_floor_shift))
14174d7988d6SPaul Dagnelie 		return;
14184d7988d6SPaul Dagnelie 
14194d7988d6SPaul Dagnelie 	zfs_btree_remove(size_tree, rs);
142080eb36f2SGeorge Wilson }
142180eb36f2SGeorge Wilson 
14224d7988d6SPaul Dagnelie /* ARGSUSED */
14234d7988d6SPaul Dagnelie static void
14244d7988d6SPaul Dagnelie metaslab_rt_vacate(range_tree_t *rt, void *arg)
14254d7988d6SPaul Dagnelie {
14264d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mrap = arg;
14274d7988d6SPaul Dagnelie 	zfs_btree_t *size_tree = mrap->mra_bt;
14284d7988d6SPaul Dagnelie 	zfs_btree_clear(size_tree);
14294d7988d6SPaul Dagnelie 	zfs_btree_destroy(size_tree);
14304d7988d6SPaul Dagnelie 
14314d7988d6SPaul Dagnelie 	metaslab_rt_create(rt, arg);
14324d7988d6SPaul Dagnelie }
14334d7988d6SPaul Dagnelie 
14344d7988d6SPaul Dagnelie static range_tree_ops_t metaslab_rt_ops = {
14354d7988d6SPaul Dagnelie 	.rtop_create = metaslab_rt_create,
14364d7988d6SPaul Dagnelie 	.rtop_destroy = metaslab_rt_destroy,
14374d7988d6SPaul Dagnelie 	.rtop_add = metaslab_rt_add,
14384d7988d6SPaul Dagnelie 	.rtop_remove = metaslab_rt_remove,
14394d7988d6SPaul Dagnelie 	.rtop_vacate = metaslab_rt_vacate
14404d7988d6SPaul Dagnelie };
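
/*
 * A rough sketch of how these callbacks behave: mra_bt points at the
 * caller-owned, size-ordered zfs_btree_t, and segments shorter than
 * (1 << mra_floor_shift) bytes are kept only in the offset-ordered range
 * tree, not in the size tree. metaslab_size_tree_full_load() rebuilds the
 * size tree with a floor of 0 whenever a complete view is needed, e.g.
 * before looking up the largest free segment.
 */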
14414d7988d6SPaul Dagnelie 
14420713e232SGeorge Wilson /*
14430713e232SGeorge Wilson  * ==========================================================================
14448363e80aSGeorge Wilson  * Common allocator routines
14450713e232SGeorge Wilson  * ==========================================================================
14460713e232SGeorge Wilson  */
14470713e232SGeorge Wilson 
1448d6e555bdSGeorge Wilson /*
144980eb36f2SGeorge Wilson  * Return the size of the largest contiguous segment within the metaslab.
1450d6e555bdSGeorge Wilson  */
1451d6e555bdSGeorge Wilson uint64_t
1452af1d63abSPaul Dagnelie metaslab_largest_allocatable(metaslab_t *msp)
1453d6e555bdSGeorge Wilson {
14544d7988d6SPaul Dagnelie 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
14550713e232SGeorge Wilson 	range_seg_t *rs;
1456d6e555bdSGeorge Wilson 
1457af1d63abSPaul Dagnelie 	if (t == NULL)
1458af1d63abSPaul Dagnelie 		return (0);
14594d7988d6SPaul Dagnelie 	if (zfs_btree_numnodes(t) == 0)
14604d7988d6SPaul Dagnelie 		metaslab_size_tree_full_load(msp->ms_allocatable);
14614d7988d6SPaul Dagnelie 
14624d7988d6SPaul Dagnelie 	rs = zfs_btree_last(t, NULL);
1463af1d63abSPaul Dagnelie 	if (rs == NULL)
1464af1d63abSPaul Dagnelie 		return (0);
1465d6e555bdSGeorge Wilson 
14664d7988d6SPaul Dagnelie 	return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
14674d7988d6SPaul Dagnelie 	    msp->ms_allocatable));
14680713e232SGeorge Wilson }
14690713e232SGeorge Wilson 
1470af1d63abSPaul Dagnelie /*
1471af1d63abSPaul Dagnelie  * Return the size of the largest contiguous segment within the unflushed
1472af1d63abSPaul Dagnelie  * frees of this metaslab.
1473af1d63abSPaul Dagnelie  */
1474af1d63abSPaul Dagnelie uint64_t
1475af1d63abSPaul Dagnelie metaslab_largest_unflushed_free(metaslab_t *msp)
1476af1d63abSPaul Dagnelie {
1477af1d63abSPaul Dagnelie 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1478af1d63abSPaul Dagnelie 
1479af1d63abSPaul Dagnelie 	if (msp->ms_unflushed_frees == NULL)
1480af1d63abSPaul Dagnelie 		return (0);
1481af1d63abSPaul Dagnelie 
14824d7988d6SPaul Dagnelie 	if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
14834d7988d6SPaul Dagnelie 		metaslab_size_tree_full_load(msp->ms_unflushed_frees);
14844d7988d6SPaul Dagnelie 	range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
14854d7988d6SPaul Dagnelie 	    NULL);
1486af1d63abSPaul Dagnelie 	if (rs == NULL)
1487af1d63abSPaul Dagnelie 		return (0);
1488af1d63abSPaul Dagnelie 
1489af1d63abSPaul Dagnelie 	/*
1490af1d63abSPaul Dagnelie 	 * When a range is freed from the metaslab, that range is added to
1491af1d63abSPaul Dagnelie 	 * both the unflushed frees and the deferred frees. While the block
1492af1d63abSPaul Dagnelie 	 * will eventually be usable, if the metaslab were loaded the range
1493af1d63abSPaul Dagnelie 	 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
1494af1d63abSPaul Dagnelie 	 * txgs had passed.  As a result, when attempting to estimate an upper
1495af1d63abSPaul Dagnelie 	 * bound for the largest currently-usable free segment in the
1496af1d63abSPaul Dagnelie 	 * metaslab, we need to not consider any ranges currently in the defer
1497af1d63abSPaul Dagnelie 	 * trees. This algorithm approximates the largest available chunk in
1498af1d63abSPaul Dagnelie 	 * the largest range in the unflushed_frees tree by taking the first
1499af1d63abSPaul Dagnelie 	 * chunk.  While this may be a poor estimate, it should only remain so
1500af1d63abSPaul Dagnelie 	 * briefly and should eventually self-correct as frees are no longer
1501af1d63abSPaul Dagnelie 	 * deferred. Similar logic applies to the ms_freed tree. See
1502af1d63abSPaul Dagnelie 	 * metaslab_load() for more details.
1503af1d63abSPaul Dagnelie 	 *
1504af1d63abSPaul Dagnelie 	 * There are two primary sources of inaccuracy in this estimate. Both
1505af1d63abSPaul Dagnelie 	 * are tolerated for performance reasons. The first source is that we
1506af1d63abSPaul Dagnelie 	 * only check the largest segment for overlaps. Smaller segments may
1507af1d63abSPaul Dagnelie 	 * have more favorable overlaps with the other trees, resulting in
1508af1d63abSPaul Dagnelie 	 * larger usable chunks.  Second, we only look at the first chunk in
1509af1d63abSPaul Dagnelie 	 * the largest segment; there may be other usable chunks in the
1510af1d63abSPaul Dagnelie 	 * largest segment, but we ignore them.
1511af1d63abSPaul Dagnelie 	 */
15124d7988d6SPaul Dagnelie 	uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
15134d7988d6SPaul Dagnelie 	uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
1514af1d63abSPaul Dagnelie 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1515af1d63abSPaul Dagnelie 		uint64_t start = 0;
1516af1d63abSPaul Dagnelie 		uint64_t size = 0;
1517af1d63abSPaul Dagnelie 		boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
1518af1d63abSPaul Dagnelie 		    rsize, &start, &size);
1519af1d63abSPaul Dagnelie 		if (found) {
1520af1d63abSPaul Dagnelie 			if (rstart == start)
1521af1d63abSPaul Dagnelie 				return (0);
1522af1d63abSPaul Dagnelie 			rsize = start - rstart;
1523af1d63abSPaul Dagnelie 		}
1524af1d63abSPaul Dagnelie 	}
1525af1d63abSPaul Dagnelie 
1526af1d63abSPaul Dagnelie 	uint64_t start = 0;
1527af1d63abSPaul Dagnelie 	uint64_t size = 0;
1528af1d63abSPaul Dagnelie 	boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
1529af1d63abSPaul Dagnelie 	    rsize, &start, &size);
1530af1d63abSPaul Dagnelie 	if (found)
1531af1d63abSPaul Dagnelie 		rsize = start - rstart;
1532af1d63abSPaul Dagnelie 
1533af1d63abSPaul Dagnelie 	return (rsize);
1534af1d63abSPaul Dagnelie }
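
/*
 * For example (hypothetical offsets): if the largest unflushed-free segment
 * covers [0x100000, 0x140000) and a deferred free overlaps it starting at
 * 0x130000, the estimate is clipped to 0x30000 bytes; if the overlap begins
 * exactly at 0x100000, the function returns 0 even though usable space may
 * exist further into the segment.
 */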
1535af1d63abSPaul Dagnelie 
15368363e80aSGeorge Wilson static range_seg_t *
15374d7988d6SPaul Dagnelie metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
15384d7988d6SPaul Dagnelie     uint64_t size, zfs_btree_index_t *where)
15390713e232SGeorge Wilson {
15404d7988d6SPaul Dagnelie 	range_seg_t *rs;
15414d7988d6SPaul Dagnelie 	range_seg_max_t rsearch;
15420713e232SGeorge Wilson 
15434d7988d6SPaul Dagnelie 	rs_set_start(&rsearch, rt, start);
15444d7988d6SPaul Dagnelie 	rs_set_end(&rsearch, rt, start + size);
15450713e232SGeorge Wilson 
15464d7988d6SPaul Dagnelie 	rs = zfs_btree_find(t, &rsearch, where);
15478363e80aSGeorge Wilson 	if (rs == NULL) {
15484d7988d6SPaul Dagnelie 		rs = zfs_btree_next(t, where, where);
15490713e232SGeorge Wilson 	}
15500713e232SGeorge Wilson 
15518363e80aSGeorge Wilson 	return (rs);
15528363e80aSGeorge Wilson }
15530713e232SGeorge Wilson 
15540713e232SGeorge Wilson /*
15554d7988d6SPaul Dagnelie  * This is a helper function that can be used by the allocator to find a
15564d7988d6SPaul Dagnelie  * suitable block to allocate. This will search the specified B-tree looking
15574d7988d6SPaul Dagnelie  * for a block that matches the specified criteria.
15580713e232SGeorge Wilson  */
15590713e232SGeorge Wilson static uint64_t
15604d7988d6SPaul Dagnelie metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
1561814dcd43SSerapheim Dimitropoulos     uint64_t max_search)
15620713e232SGeorge Wilson {
15634d7988d6SPaul Dagnelie 	if (*cursor == 0)
15644d7988d6SPaul Dagnelie 		*cursor = rt->rt_start;
15654d7988d6SPaul Dagnelie 	zfs_btree_t *bt = &rt->rt_root;
15664d7988d6SPaul Dagnelie 	zfs_btree_index_t where;
15674d7988d6SPaul Dagnelie 	range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
1568814dcd43SSerapheim Dimitropoulos 	uint64_t first_found;
15694d7988d6SPaul Dagnelie 	int count_searched = 0;
15700713e232SGeorge Wilson 
1571814dcd43SSerapheim Dimitropoulos 	if (rs != NULL)
15724d7988d6SPaul Dagnelie 		first_found = rs_get_start(rs, rt);
15730713e232SGeorge Wilson 
15744d7988d6SPaul Dagnelie 	while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
15754d7988d6SPaul Dagnelie 	    max_search || count_searched < metaslab_min_search_count)) {
15764d7988d6SPaul Dagnelie 		uint64_t offset = rs_get_start(rs, rt);
15774d7988d6SPaul Dagnelie 		if (offset + size <= rs_get_end(rs, rt)) {
15780713e232SGeorge Wilson 			*cursor = offset + size;
15790713e232SGeorge Wilson 			return (offset);
15800713e232SGeorge Wilson 		}
15814d7988d6SPaul Dagnelie 		rs = zfs_btree_next(bt, &where, &where);
15824d7988d6SPaul Dagnelie 		count_searched++;
15830713e232SGeorge Wilson 	}
15840713e232SGeorge Wilson 
15850713e232SGeorge Wilson 	*cursor = 0;
1586814dcd43SSerapheim Dimitropoulos 	return (-1ULL);
1587d6e555bdSGeorge Wilson }
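
/*
 * For example, with *cursor at 0x200000 and a 0x2000-byte request, the search
 * starts at the first free segment at or beyond 0x200000 and walks forward
 * until it finds one with at least 0x2000 bytes, advancing the cursor past the
 * allocation. The walk gives up once it has moved more than max_search bytes
 * past the first candidate (and has examined at least
 * metaslab_min_search_count segments), resetting the cursor and returning
 * -1ULL so the caller can fall back to a size-based search.
 */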
1588d6e555bdSGeorge Wilson 
158980eb36f2SGeorge Wilson /*
159080eb36f2SGeorge Wilson  * ==========================================================================
1591814dcd43SSerapheim Dimitropoulos  * Dynamic Fit (df) block allocator
1592814dcd43SSerapheim Dimitropoulos  *
1593814dcd43SSerapheim Dimitropoulos  * Search for a free chunk of at least this size, starting from the last
1594814dcd43SSerapheim Dimitropoulos  * offset (for this alignment of block) looking for up to
1595814dcd43SSerapheim Dimitropoulos  * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
1596814dcd43SSerapheim Dimitropoulos  * found within 16MB, then return a free chunk of exactly the requested size (or
1597814dcd43SSerapheim Dimitropoulos  * larger).
1598814dcd43SSerapheim Dimitropoulos  *
1599814dcd43SSerapheim Dimitropoulos  * If it seems like searching from the last offset will be unproductive, skip
1600814dcd43SSerapheim Dimitropoulos  * that and just return a free chunk of exactly the requested size (or larger).
1601814dcd43SSerapheim Dimitropoulos  * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
1602814dcd43SSerapheim Dimitropoulos  * mechanism is probably not very useful and may be removed in the future.
1603814dcd43SSerapheim Dimitropoulos  *
1604814dcd43SSerapheim Dimitropoulos  * The behavior when not searching can be changed to return the largest free
1605814dcd43SSerapheim Dimitropoulos  * chunk, instead of a free chunk of exactly the requested size, by setting
1606814dcd43SSerapheim Dimitropoulos  * metaslab_df_use_largest_segment.
160780eb36f2SGeorge Wilson  * ==========================================================================
160880eb36f2SGeorge Wilson  */
1609d6e555bdSGeorge Wilson static uint64_t
16100713e232SGeorge Wilson metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1611d6e555bdSGeorge Wilson {
16120713e232SGeorge Wilson 	/*
16130713e232SGeorge Wilson 	 * Find the largest power of 2 block size that evenly divides the
16140713e232SGeorge Wilson 	 * requested size. This is used to try to allocate blocks with similar
16150713e232SGeorge Wilson 	 * alignment from the same area of the metaslab (i.e. same cursor
16160713e232SGeorge Wilson 	 * bucket), but it does not guarantee that blocks of other sizes
16170713e232SGeorge Wilson 	 * will not exist in the same region.
16180713e232SGeorge Wilson 	 */
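	/*
	 * For example (hypothetical size), a 24K (0x6000) request has
	 * align = 0x6000 & -0x6000 = 0x2000, so it shares the 8K-aligned
	 * cursor bucket ms_lbas[highbit64(0x2000) - 1] = ms_lbas[13] with
	 * other requests whose size is an odd multiple of 8K.
	 */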
1619d6e555bdSGeorge Wilson 	uint64_t align = size & -size;
1620bf16b11eSMatthew Ahrens 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
162186714001SSerapheim Dimitropoulos 	range_tree_t *rt = msp->ms_allocatable;
16220713e232SGeorge Wilson 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1623814dcd43SSerapheim Dimitropoulos 	uint64_t offset;
1624d6e555bdSGeorge Wilson 
16250713e232SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1626d6e555bdSGeorge Wilson 
1627d6e555bdSGeorge Wilson 	/*
1628814dcd43SSerapheim Dimitropoulos 	 * If we're running low on space, find a segment based on size,
1629814dcd43SSerapheim Dimitropoulos 	 * rather than iterating based on offset.
1630d6e555bdSGeorge Wilson 	 */
1631af1d63abSPaul Dagnelie 	if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
1632d6e555bdSGeorge Wilson 	    free_pct < metaslab_df_free_pct) {
1633814dcd43SSerapheim Dimitropoulos 		offset = -1;
1634814dcd43SSerapheim Dimitropoulos 	} else {
16354d7988d6SPaul Dagnelie 		offset = metaslab_block_picker(rt,
1636814dcd43SSerapheim Dimitropoulos 		    cursor, size, metaslab_df_max_search);
1637814dcd43SSerapheim Dimitropoulos 	}
1638814dcd43SSerapheim Dimitropoulos 
1639814dcd43SSerapheim Dimitropoulos 	if (offset == -1) {
1640814dcd43SSerapheim Dimitropoulos 		range_seg_t *rs;
16414d7988d6SPaul Dagnelie 		if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
16424d7988d6SPaul Dagnelie 			metaslab_size_tree_full_load(msp->ms_allocatable);
1643814dcd43SSerapheim Dimitropoulos 		if (metaslab_df_use_largest_segment) {
1644814dcd43SSerapheim Dimitropoulos 			/* use largest free segment */
16454d7988d6SPaul Dagnelie 			rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
1646814dcd43SSerapheim Dimitropoulos 		} else {
16474d7988d6SPaul Dagnelie 			zfs_btree_index_t where;
1648814dcd43SSerapheim Dimitropoulos 			/* use segment of this size, or next largest */
16494d7988d6SPaul Dagnelie #ifdef _METASLAB_TRACING
16504d7988d6SPaul Dagnelie 			metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
16514d7988d6SPaul Dagnelie 			if (size < (1 << mrap->mra_floor_shift)) {
16524d7988d6SPaul Dagnelie 				METASLABSTAT_BUMP(
16534d7988d6SPaul Dagnelie 				    metaslabstat_df_find_under_floor);
16544d7988d6SPaul Dagnelie 			}
16554d7988d6SPaul Dagnelie #endif
1656814dcd43SSerapheim Dimitropoulos 			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
16574d7988d6SPaul Dagnelie 			    rt, msp->ms_start, size, &where);
1658814dcd43SSerapheim Dimitropoulos 		}
16594d7988d6SPaul Dagnelie 		if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
16604d7988d6SPaul Dagnelie 		    rt)) {
16614d7988d6SPaul Dagnelie 			offset = rs_get_start(rs, rt);
1662814dcd43SSerapheim Dimitropoulos 			*cursor = offset + size;
1663814dcd43SSerapheim Dimitropoulos 		}
1664d6e555bdSGeorge Wilson 	}
1665d6e555bdSGeorge Wilson 
1666814dcd43SSerapheim Dimitropoulos 	return (offset);
1667d6e555bdSGeorge Wilson }
1668d6e555bdSGeorge Wilson 
16690713e232SGeorge Wilson static metaslab_ops_t metaslab_df_ops = {
16702e4c9986SGeorge Wilson 	metaslab_df_alloc
167180eb36f2SGeorge Wilson };
167280eb36f2SGeorge Wilson 
167380eb36f2SGeorge Wilson /*
167480eb36f2SGeorge Wilson  * ==========================================================================
16750713e232SGeorge Wilson  * Cursor fit block allocator -
16760713e232SGeorge Wilson  * Select the largest region in the metaslab, set the cursor to the beginning
16770713e232SGeorge Wilson  * of the range and the cursor_end to the end of the range. As allocations
16780713e232SGeorge Wilson  * are made advance the cursor. Continue allocating from the cursor until
16790713e232SGeorge Wilson  * the range is exhausted and then find a new range.
168080eb36f2SGeorge Wilson  * ==========================================================================
168180eb36f2SGeorge Wilson  */
168280eb36f2SGeorge Wilson static uint64_t
16830713e232SGeorge Wilson metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
168480eb36f2SGeorge Wilson {
168586714001SSerapheim Dimitropoulos 	range_tree_t *rt = msp->ms_allocatable;
16864d7988d6SPaul Dagnelie 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
16870713e232SGeorge Wilson 	uint64_t *cursor = &msp->ms_lbas[0];
16880713e232SGeorge Wilson 	uint64_t *cursor_end = &msp->ms_lbas[1];
168980eb36f2SGeorge Wilson 	uint64_t offset = 0;
169080eb36f2SGeorge Wilson 
16910713e232SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
169280eb36f2SGeorge Wilson 
16930713e232SGeorge Wilson 	ASSERT3U(*cursor_end, >=, *cursor);
169480eb36f2SGeorge Wilson 
16950713e232SGeorge Wilson 	if ((*cursor + size) > *cursor_end) {
16960713e232SGeorge Wilson 		range_seg_t *rs;
169780eb36f2SGeorge Wilson 
16984d7988d6SPaul Dagnelie 		if (zfs_btree_numnodes(t) == 0)
16994d7988d6SPaul Dagnelie 			metaslab_size_tree_full_load(msp->ms_allocatable);
17004d7988d6SPaul Dagnelie 		rs = zfs_btree_last(t, NULL);
17014d7988d6SPaul Dagnelie 		if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
17024d7988d6SPaul Dagnelie 		    size)
17030713e232SGeorge Wilson 			return (-1ULL);
170480eb36f2SGeorge Wilson 
17054d7988d6SPaul Dagnelie 		*cursor = rs_get_start(rs, rt);
17064d7988d6SPaul Dagnelie 		*cursor_end = rs_get_end(rs, rt);
170780eb36f2SGeorge Wilson 	}
17080713e232SGeorge Wilson 
17090713e232SGeorge Wilson 	offset = *cursor;
17100713e232SGeorge Wilson 	*cursor += size;
17110713e232SGeorge Wilson 
171280eb36f2SGeorge Wilson 	return (offset);
171380eb36f2SGeorge Wilson }
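
/*
 * For example (hypothetical offsets): if the largest free region is
 * [0x400000, 0x800000), the first allocation sets *cursor = 0x400000 and
 * *cursor_end = 0x800000; subsequent requests are carved sequentially from
 * the cursor until cursor + size would pass cursor_end, at which point the
 * then-largest remaining region is selected and the cursors are reset.
 */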
171480eb36f2SGeorge Wilson 
17150713e232SGeorge Wilson static metaslab_ops_t metaslab_cf_ops = {
17162e4c9986SGeorge Wilson 	metaslab_cf_alloc
171780eb36f2SGeorge Wilson };
171880eb36f2SGeorge Wilson 
17190713e232SGeorge Wilson /*
17200713e232SGeorge Wilson  * ==========================================================================
17210713e232SGeorge Wilson  * New dynamic fit allocator -
17220713e232SGeorge Wilson  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
17230713e232SGeorge Wilson  * contiguous blocks. If no region is found then just use the largest segment
17240713e232SGeorge Wilson  * that remains.
17250713e232SGeorge Wilson  * ==========================================================================
17260713e232SGeorge Wilson  */
17270713e232SGeorge Wilson 
17280713e232SGeorge Wilson /*
17290713e232SGeorge Wilson  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
17300713e232SGeorge Wilson  * to request from the allocator.
17310713e232SGeorge Wilson  */
17328d18220dSMark J Musante uint64_t metaslab_ndf_clump_shift = 4;
17338d18220dSMark J Musante 
173480eb36f2SGeorge Wilson static uint64_t
17350713e232SGeorge Wilson metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
173680eb36f2SGeorge Wilson {
17374d7988d6SPaul Dagnelie 	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
17384d7988d6SPaul Dagnelie 	range_tree_t *rt = msp->ms_allocatable;
17394d7988d6SPaul Dagnelie 	zfs_btree_index_t where;
17404d7988d6SPaul Dagnelie 	range_seg_t *rs;
17414d7988d6SPaul Dagnelie 	range_seg_max_t rsearch;
1742bf16b11eSMatthew Ahrens 	uint64_t hbit = highbit64(size);
17430713e232SGeorge Wilson 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1744af1d63abSPaul Dagnelie 	uint64_t max_size = metaslab_largest_allocatable(msp);
174580eb36f2SGeorge Wilson 
17460713e232SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
174780eb36f2SGeorge Wilson 
174880eb36f2SGeorge Wilson 	if (max_size < size)
174980eb36f2SGeorge Wilson 		return (-1ULL);
175080eb36f2SGeorge Wilson 
17514d7988d6SPaul Dagnelie 	rs_set_start(&rsearch, rt, *cursor);
17524d7988d6SPaul Dagnelie 	rs_set_end(&rsearch, rt, *cursor + size);
175380eb36f2SGeorge Wilson 
17544d7988d6SPaul Dagnelie 	rs = zfs_btree_find(t, &rsearch, &where);
17554d7988d6SPaul Dagnelie 	if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
175686714001SSerapheim Dimitropoulos 		t = &msp->ms_allocatable_by_size;
175780eb36f2SGeorge Wilson 
17584d7988d6SPaul Dagnelie 		rs_set_start(&rsearch, rt, 0);
17594d7988d6SPaul Dagnelie 		rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
17604d7988d6SPaul Dagnelie 		    metaslab_ndf_clump_shift)));
17614d7988d6SPaul Dagnelie 
17624d7988d6SPaul Dagnelie 		rs = zfs_btree_find(t, &rsearch, &where);
17630713e232SGeorge Wilson 		if (rs == NULL)
17644d7988d6SPaul Dagnelie 			rs = zfs_btree_next(t, &where, &where);
17650713e232SGeorge Wilson 		ASSERT(rs != NULL);
176680eb36f2SGeorge Wilson 	}
176780eb36f2SGeorge Wilson 
17684d7988d6SPaul Dagnelie 	if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
17694d7988d6SPaul Dagnelie 		*cursor = rs_get_start(rs, rt) + size;
17704d7988d6SPaul Dagnelie 		return (rs_get_start(rs, rt));
177180eb36f2SGeorge Wilson 	}
177280eb36f2SGeorge Wilson 	return (-1ULL);
177380eb36f2SGeorge Wilson }
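
/*
 * For example (hypothetical size), a 4K request has hbit = highbit64(4K) = 13,
 * so the cursor for that power-of-two class is consulted first; if it cannot
 * satisfy the request, the size tree is searched for a segment of at least
 * MIN(max_size, 1 << (13 + metaslab_ndf_clump_shift)) bytes, i.e. 128K with
 * the default clump shift of 4, the allocation is taken from the start of the
 * segment found, and the cursor is advanced past it.
 */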
177480eb36f2SGeorge Wilson 
17750713e232SGeorge Wilson static metaslab_ops_t metaslab_ndf_ops = {
17762e4c9986SGeorge Wilson 	metaslab_ndf_alloc
1777ecc2d604Sbonwick };
1778ecc2d604Sbonwick 
17790713e232SGeorge Wilson metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1780d6e555bdSGeorge Wilson 
1781fa9e4066Sahrens /*
1782fa9e4066Sahrens  * ==========================================================================
1783fa9e4066Sahrens  * Metaslabs
1784fa9e4066Sahrens  * ==========================================================================
1785fa9e4066Sahrens  */
17860713e232SGeorge Wilson 
1787814dcd43SSerapheim Dimitropoulos /*
1788814dcd43SSerapheim Dimitropoulos  * Wait for any in-progress metaslab loads to complete.
1789814dcd43SSerapheim Dimitropoulos  */
1790814dcd43SSerapheim Dimitropoulos void
1791814dcd43SSerapheim Dimitropoulos metaslab_load_wait(metaslab_t *msp)
1792814dcd43SSerapheim Dimitropoulos {
1793814dcd43SSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1794814dcd43SSerapheim Dimitropoulos 
1795814dcd43SSerapheim Dimitropoulos 	while (msp->ms_loading) {
1796814dcd43SSerapheim Dimitropoulos 		ASSERT(!msp->ms_loaded);
1797814dcd43SSerapheim Dimitropoulos 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1798814dcd43SSerapheim Dimitropoulos 	}
1799814dcd43SSerapheim Dimitropoulos }
1800814dcd43SSerapheim Dimitropoulos 
1801814dcd43SSerapheim Dimitropoulos /*
1802814dcd43SSerapheim Dimitropoulos  * Wait for any in-progress flushing to complete.
1803814dcd43SSerapheim Dimitropoulos  */
1804814dcd43SSerapheim Dimitropoulos void
1805814dcd43SSerapheim Dimitropoulos metaslab_flush_wait(metaslab_t *msp)
1806814dcd43SSerapheim Dimitropoulos {
1807814dcd43SSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1808814dcd43SSerapheim Dimitropoulos 
1809814dcd43SSerapheim Dimitropoulos 	while (msp->ms_flushing)
1810814dcd43SSerapheim Dimitropoulos 		cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1811814dcd43SSerapheim Dimitropoulos }
1812814dcd43SSerapheim Dimitropoulos 
1813af1d63abSPaul Dagnelie static unsigned int
1814af1d63abSPaul Dagnelie metaslab_idx_func(multilist_t *ml, void *arg)
1815af1d63abSPaul Dagnelie {
1816af1d63abSPaul Dagnelie 	metaslab_t *msp = arg;
1817af1d63abSPaul Dagnelie 	return (msp->ms_id % multilist_get_num_sublists(ml));
1818af1d63abSPaul Dagnelie }
1819af1d63abSPaul Dagnelie 
1820814dcd43SSerapheim Dimitropoulos uint64_t
1821814dcd43SSerapheim Dimitropoulos metaslab_allocated_space(metaslab_t *msp)
1822814dcd43SSerapheim Dimitropoulos {
1823814dcd43SSerapheim Dimitropoulos 	return (msp->ms_allocated_space);
1824814dcd43SSerapheim Dimitropoulos }
1825814dcd43SSerapheim Dimitropoulos 
1826814dcd43SSerapheim Dimitropoulos /*
1827814dcd43SSerapheim Dimitropoulos  * Verify that the space accounting on disk matches the in-core range_trees.
1828814dcd43SSerapheim Dimitropoulos  */
1829814dcd43SSerapheim Dimitropoulos static void
1830814dcd43SSerapheim Dimitropoulos metaslab_verify_space(metaslab_t *msp, uint64_t txg)
1831814dcd43SSerapheim Dimitropoulos {
1832814dcd43SSerapheim Dimitropoulos 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1833814dcd43SSerapheim Dimitropoulos 	uint64_t allocating = 0;
1834814dcd43SSerapheim Dimitropoulos 	uint64_t sm_free_space, msp_free_space;
1835814dcd43SSerapheim Dimitropoulos 
1836814dcd43SSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1837814dcd43SSerapheim Dimitropoulos 	ASSERT(!msp->ms_condensing);
1838814dcd43SSerapheim Dimitropoulos 
1839814dcd43SSerapheim Dimitropoulos 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1840814dcd43SSerapheim Dimitropoulos 		return;
1841814dcd43SSerapheim Dimitropoulos 
1842814dcd43SSerapheim Dimitropoulos 	/*
1843814dcd43SSerapheim Dimitropoulos 	 * We can only verify the metaslab space when we're called
1844814dcd43SSerapheim Dimitropoulos 	 * from syncing context with a loaded metaslab that has an
1845814dcd43SSerapheim Dimitropoulos 	 * allocated space map. Calling this in non-syncing context
1846814dcd43SSerapheim Dimitropoulos 	 * does not provide a consistent view of the metaslab since
1847814dcd43SSerapheim Dimitropoulos 	 * we're performing allocations in the future.
1848814dcd43SSerapheim Dimitropoulos 	 */
1849814dcd43SSerapheim Dimitropoulos 	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1850814dcd43SSerapheim Dimitropoulos 	    !msp->ms_loaded)
1851814dcd43SSerapheim Dimitropoulos 		return;
1852814dcd43SSerapheim Dimitropoulos 
1853814dcd43SSerapheim Dimitropoulos 	/*
1854814dcd43SSerapheim Dimitropoulos 	 * Even though the smp_alloc field can get negative,
1855814dcd43SSerapheim Dimitropoulos 	 * when it comes to a metaslab's space map, that should
1856814dcd43SSerapheim Dimitropoulos 	 * never be the case.
1857814dcd43SSerapheim Dimitropoulos 	 */
1858814dcd43SSerapheim Dimitropoulos 	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1859814dcd43SSerapheim Dimitropoulos 
1860814dcd43SSerapheim Dimitropoulos 	ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1861814dcd43SSerapheim Dimitropoulos 	    range_tree_space(msp->ms_unflushed_frees));
1862814dcd43SSerapheim Dimitropoulos 
1863814dcd43SSerapheim Dimitropoulos 	ASSERT3U(metaslab_allocated_space(msp), ==,
1864814dcd43SSerapheim Dimitropoulos 	    space_map_allocated(msp->ms_sm) +
1865814dcd43SSerapheim Dimitropoulos 	    range_tree_space(msp->ms_unflushed_allocs) -
1866814dcd43SSerapheim Dimitropoulos 	    range_tree_space(msp->ms_unflushed_frees));
1867814dcd43SSerapheim Dimitropoulos 
1868814dcd43SSerapheim Dimitropoulos 	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1869814dcd43SSerapheim Dimitropoulos 
1870814dcd43SSerapheim Dimitropoulos 	/*
1871814dcd43SSerapheim Dimitropoulos 	 * Account for future allocations since we would have
1872814dcd43SSerapheim Dimitropoulos 	 * already deducted that space from the ms_allocatable.
1873814dcd43SSerapheim Dimitropoulos 	 */
1874814dcd43SSerapheim Dimitropoulos 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
1875814dcd43SSerapheim Dimitropoulos 		allocating +=
1876814dcd43SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1877814dcd43SSerapheim Dimitropoulos 	}
1878af1d63abSPaul Dagnelie 	ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1879af1d63abSPaul Dagnelie 	    msp->ms_allocating_total);
1880814dcd43SSerapheim Dimitropoulos 
1881814dcd43SSerapheim Dimitropoulos 	ASSERT3U(msp->ms_deferspace, ==,
1882814dcd43SSerapheim Dimitropoulos 	    range_tree_space(msp->ms_defer[0]) +
1883814dcd43SSerapheim Dimitropoulos 	    range_tree_space(msp->ms_defer[1]));
1884814dcd43SSerapheim Dimitropoulos 
1885814dcd43SSerapheim Dimitropoulos 	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
1886814dcd43SSerapheim Dimitropoulos 	    msp->ms_deferspace + range_tree_space(msp->ms_freed);
1887814dcd43SSerapheim Dimitropoulos 
1888814dcd43SSerapheim Dimitropoulos 	VERIFY3U(sm_free_space, ==, msp_free_space);
1889814dcd43SSerapheim Dimitropoulos }
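
/*
 * As a worked example (hypothetical numbers) of the identity verified above:
 * with an 8G metaslab whose space map shows 5G allocated, 96M of unflushed
 * allocations, and 32M of unflushed frees, metaslab_allocated_space() is
 * 5G + 96M - 32M, so sm_free_space is 8G minus that amount; the VERIFY then
 * requires ms_allocatable + in-flight allocations + ms_deferspace + ms_freed
 * to add up to exactly the same figure.
 */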
1890814dcd43SSerapheim Dimitropoulos 
1891555d674dSSerapheim Dimitropoulos static void
1892555d674dSSerapheim Dimitropoulos metaslab_aux_histograms_clear(metaslab_t *msp)
1893555d674dSSerapheim Dimitropoulos {
1894555d674dSSerapheim Dimitropoulos 	/*
1895555d674dSSerapheim Dimitropoulos 	 * Auxiliary histograms are only cleared when resetting them,
1896555d674dSSerapheim Dimitropoulos 	 * which can only happen while the metaslab is loaded.
1897555d674dSSerapheim Dimitropoulos 	 */
1898555d674dSSerapheim Dimitropoulos 	ASSERT(msp->ms_loaded);
1899555d674dSSerapheim Dimitropoulos 
1900555d674dSSerapheim Dimitropoulos 	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1901555d674dSSerapheim Dimitropoulos 	for (int t = 0; t < TXG_DEFER_SIZE; t++)
1902555d674dSSerapheim Dimitropoulos 		bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
1903555d674dSSerapheim Dimitropoulos }
1904555d674dSSerapheim Dimitropoulos 
1905555d674dSSerapheim Dimitropoulos static void
1906555d674dSSerapheim Dimitropoulos metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1907555d674dSSerapheim Dimitropoulos     range_tree_t *rt)
1908555d674dSSerapheim Dimitropoulos {
1909555d674dSSerapheim Dimitropoulos 	/*
1910555d674dSSerapheim Dimitropoulos 	 * This is modeled after space_map_histogram_add(), so refer to that
1911555d674dSSerapheim Dimitropoulos 	 * function for implementation details. We want this to work like
1912555d674dSSerapheim Dimitropoulos 	 * the space map histogram, and not the range tree histogram, as we
1913555d674dSSerapheim Dimitropoulos 	 * are essentially constructing a delta that will be later subtracted
1914555d674dSSerapheim Dimitropoulos 	 * from the space map histogram.
1915555d674dSSerapheim Dimitropoulos 	 */
1916555d674dSSerapheim Dimitropoulos 	int idx = 0;
1917555d674dSSerapheim Dimitropoulos 	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1918555d674dSSerapheim Dimitropoulos 		ASSERT3U(i, >=, idx + shift);
1919555d674dSSerapheim Dimitropoulos 		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1920555d674dSSerapheim Dimitropoulos 
1921555d674dSSerapheim Dimitropoulos 		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1922555d674dSSerapheim Dimitropoulos 			ASSERT3U(idx + shift, ==, i);
1923555d674dSSerapheim Dimitropoulos 			idx++;
1924555d674dSSerapheim Dimitropoulos 			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1925555d674dSSerapheim Dimitropoulos 		}
1926555d674dSSerapheim Dimitropoulos 	}
1927555d674dSSerapheim Dimitropoulos }
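
/*
 * A worked example of the folding above, assuming the usual histogram
 * sizes (SPACE_MAP_HISTOGRAM_SIZE of 32, RANGE_TREE_HISTOGRAM_SIZE of
 * 64) and an sm_shift of 9:
 *
 *	- For i = 9 .. 40, idx tracks i - 9, so histogram[idx] simply
 *	  receives rt_histogram[i]: segments of roughly 2^i bytes land
 *	  in the space map bucket that represents that size.
 *	- For i = 41 .. 63, idx stays pinned at 31 and each count is
 *	  scaled by 2^(i - 40); e.g. one segment in rt_histogram[41] is
 *	  accounted as two entries in the last bucket, mirroring how
 *	  space_map_histogram_add() handles oversized segments.
 */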
1928555d674dSSerapheim Dimitropoulos 
1929555d674dSSerapheim Dimitropoulos /*
1930555d674dSSerapheim Dimitropoulos  * Called at every sync pass in which the metaslab gets synced.
1931555d674dSSerapheim Dimitropoulos  *
1932555d674dSSerapheim Dimitropoulos  * The reason is that we want our auxiliary histograms to be updated
1933555d674dSSerapheim Dimitropoulos  * wherever the metaslab's space map histogram is updated. This way
1934555d674dSSerapheim Dimitropoulos  * we stay consistent on which parts of the metaslab space map's
1935555d674dSSerapheim Dimitropoulos  * histogram are currently not available for allocations (e.g. because
1936555d674dSSerapheim Dimitropoulos  * they are in the defer, freed, and freeing trees).
1937555d674dSSerapheim Dimitropoulos  */
1938555d674dSSerapheim Dimitropoulos static void
1939555d674dSSerapheim Dimitropoulos metaslab_aux_histograms_update(metaslab_t *msp)
1940555d674dSSerapheim Dimitropoulos {
1941555d674dSSerapheim Dimitropoulos 	space_map_t *sm = msp->ms_sm;
1942555d674dSSerapheim Dimitropoulos 	ASSERT(sm != NULL);
1943555d674dSSerapheim Dimitropoulos 
1944555d674dSSerapheim Dimitropoulos 	/*
1945555d674dSSerapheim Dimitropoulos 	 * This is similar to the metaslab's space map histogram updates
1946555d674dSSerapheim Dimitropoulos 	 * that take place in metaslab_sync(). The only difference is that
1947555d674dSSerapheim Dimitropoulos 	 * we only care about segments that haven't made it into the
1948555d674dSSerapheim Dimitropoulos 	 * ms_allocatable tree yet.
1949555d674dSSerapheim Dimitropoulos 	 */
1950555d674dSSerapheim Dimitropoulos 	if (msp->ms_loaded) {
1951555d674dSSerapheim Dimitropoulos 		metaslab_aux_histograms_clear(msp);
1952555d674dSSerapheim Dimitropoulos 
1953555d674dSSerapheim Dimitropoulos 		metaslab_aux_histogram_add(msp->ms_synchist,
1954555d674dSSerapheim Dimitropoulos 		    sm->sm_shift, msp->ms_freed);
1955555d674dSSerapheim Dimitropoulos 
1956555d674dSSerapheim Dimitropoulos 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1957555d674dSSerapheim Dimitropoulos 			metaslab_aux_histogram_add(msp->ms_deferhist[t],
1958555d674dSSerapheim Dimitropoulos 			    sm->sm_shift, msp->ms_defer[t]);
1959555d674dSSerapheim Dimitropoulos 		}
1960555d674dSSerapheim Dimitropoulos 	}
1961555d674dSSerapheim Dimitropoulos 
1962555d674dSSerapheim Dimitropoulos 	metaslab_aux_histogram_add(msp->ms_synchist,
1963555d674dSSerapheim Dimitropoulos 	    sm->sm_shift, msp->ms_freeing);
1964555d674dSSerapheim Dimitropoulos }
1965555d674dSSerapheim Dimitropoulos 
1966555d674dSSerapheim Dimitropoulos /*
1967555d674dSSerapheim Dimitropoulos  * Called every time we are done syncing (writing to) the metaslab,
1968555d674dSSerapheim Dimitropoulos  * i.e. at the end of each sync pass.
1969555d674dSSerapheim Dimitropoulos  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
1970555d674dSSerapheim Dimitropoulos  */
1971555d674dSSerapheim Dimitropoulos static void
1972555d674dSSerapheim Dimitropoulos metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
1973555d674dSSerapheim Dimitropoulos {
1974555d674dSSerapheim Dimitropoulos 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1975555d674dSSerapheim Dimitropoulos 	space_map_t *sm = msp->ms_sm;
1976555d674dSSerapheim Dimitropoulos 
1977555d674dSSerapheim Dimitropoulos 	if (sm == NULL) {
1978555d674dSSerapheim Dimitropoulos 		/*
1979555d674dSSerapheim Dimitropoulos 		 * We came here from metaslab_init() when creating/opening a
1980555d674dSSerapheim Dimitropoulos 		 * pool, looking at a metaslab that hasn't had any allocations
1981555d674dSSerapheim Dimitropoulos 		 * yet.
1982555d674dSSerapheim Dimitropoulos 		 */
1983555d674dSSerapheim Dimitropoulos 		return;
1984555d674dSSerapheim Dimitropoulos 	}
1985555d674dSSerapheim Dimitropoulos 
1986555d674dSSerapheim Dimitropoulos 	/*
1987555d674dSSerapheim Dimitropoulos 	 * This is similar to the actions that we take for the ms_freed
1988555d674dSSerapheim Dimitropoulos 	 * and ms_defer trees in metaslab_sync_done().
1989555d674dSSerapheim Dimitropoulos 	 */
1990555d674dSSerapheim Dimitropoulos 	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
1991555d674dSSerapheim Dimitropoulos 	if (defer_allowed) {
1992555d674dSSerapheim Dimitropoulos 		bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
1993555d674dSSerapheim Dimitropoulos 		    sizeof (msp->ms_synchist));
1994555d674dSSerapheim Dimitropoulos 	} else {
1995555d674dSSerapheim Dimitropoulos 		bzero(msp->ms_deferhist[hist_index],
1996555d674dSSerapheim Dimitropoulos 		    sizeof (msp->ms_deferhist[hist_index]));
1997555d674dSSerapheim Dimitropoulos 	}
1998555d674dSSerapheim Dimitropoulos 	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1999555d674dSSerapheim Dimitropoulos }
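
/*
 * For example, with the usual TXG_DEFER_SIZE of 2, the frees synced in
 * txg N have their histogram recorded in ms_deferhist[N % 2], replacing
 * the entry from txg N - 2 whose deferred frees have since been
 * returned to ms_allocatable by metaslab_sync_done(). If deferring is
 * not allowed, the slot is simply zeroed instead.
 */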
2000555d674dSSerapheim Dimitropoulos 
2001555d674dSSerapheim Dimitropoulos /*
2002555d674dSSerapheim Dimitropoulos  * Ensure that the metaslab's weight and fragmentation are consistent
2003555d674dSSerapheim Dimitropoulos  * with the contents of the histogram (either the range tree's histogram
2004555d674dSSerapheim Dimitropoulos  * or the space map's, depending on whether the metaslab is loaded).
2005555d674dSSerapheim Dimitropoulos  */
2006555d674dSSerapheim Dimitropoulos static void
2007555d674dSSerapheim Dimitropoulos metaslab_verify_weight_and_frag(metaslab_t *msp)
2008555d674dSSerapheim Dimitropoulos {
2009555d674dSSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2010555d674dSSerapheim Dimitropoulos 
2011555d674dSSerapheim Dimitropoulos 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
2012555d674dSSerapheim Dimitropoulos 		return;
2013555d674dSSerapheim Dimitropoulos 
2014814dcd43SSerapheim Dimitropoulos 	/*
2015814dcd43SSerapheim Dimitropoulos 	 * We can end up here from vdev_remove_complete(), in which case we
2016814dcd43SSerapheim Dimitropoulos 	 * cannot do these assertions because we hold spa config locks and
2017814dcd43SSerapheim Dimitropoulos 	 * thus we are not allowed to read from the DMU.
2018814dcd43SSerapheim Dimitropoulos 	 *
2019814dcd43SSerapheim Dimitropoulos 	 * We check if the metaslab group has been removed and if that's
2020814dcd43SSerapheim Dimitropoulos 	 * the case we return immediately as that would mean that we are
2021814dcd43SSerapheim Dimitropoulos 	 * here from the aforementioned code path.
2022814dcd43SSerapheim Dimitropoulos 	 */
2023555d674dSSerapheim Dimitropoulos 	if (msp->ms_group == NULL)
2024555d674dSSerapheim Dimitropoulos 		return;
2025555d674dSSerapheim Dimitropoulos 
2026555d674dSSerapheim Dimitropoulos 	/*
2027555d674dSSerapheim Dimitropoulos 	 * Devices being removed always return a weight of 0 and leave
2028555d674dSSerapheim Dimitropoulos 	 * fragmentation and ms_max_size as is - there is nothing for
2029555d674dSSerapheim Dimitropoulos 	 * us to verify here.
2030555d674dSSerapheim Dimitropoulos 	 */
2031555d674dSSerapheim Dimitropoulos 	vdev_t *vd = msp->ms_group->mg_vd;
2032555d674dSSerapheim Dimitropoulos 	if (vd->vdev_removing)
2033555d674dSSerapheim Dimitropoulos 		return;
2034555d674dSSerapheim Dimitropoulos 
2035555d674dSSerapheim Dimitropoulos 	/*
2036555d674dSSerapheim Dimitropoulos 	 * If the metaslab is dirty it probably means that we've done
2037555d674dSSerapheim Dimitropoulos 	 * some allocations or frees that have changed our histograms
2038555d674dSSerapheim Dimitropoulos 	 * and thus the weight.
2039555d674dSSerapheim Dimitropoulos 	 */
2040555d674dSSerapheim Dimitropoulos 	for (int t = 0; t < TXG_SIZE; t++) {
2041555d674dSSerapheim Dimitropoulos 		if (txg_list_member(&vd->vdev_ms_list, msp, t))
2042555d674dSSerapheim Dimitropoulos 			return;
2043555d674dSSerapheim Dimitropoulos 	}
2044555d674dSSerapheim Dimitropoulos 
2045555d674dSSerapheim Dimitropoulos 	/*
2046555d674dSSerapheim Dimitropoulos 	 * This verification checks that our in-memory state is consistent
2047555d674dSSerapheim Dimitropoulos 	 * with what's on disk. If the pool is read-only then there aren't
2048555d674dSSerapheim Dimitropoulos 	 * any changes and we just have the initially-loaded state.
2049555d674dSSerapheim Dimitropoulos 	 */
2050555d674dSSerapheim Dimitropoulos 	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
2051555d674dSSerapheim Dimitropoulos 		return;
2052555d674dSSerapheim Dimitropoulos 
2053555d674dSSerapheim Dimitropoulos 	/* Some extra verification of the in-core tree, when possible. */
2054555d674dSSerapheim Dimitropoulos 	if (msp->ms_loaded) {
2055555d674dSSerapheim Dimitropoulos 		range_tree_stat_verify(msp->ms_allocatable);
2056555d674dSSerapheim Dimitropoulos 		VERIFY(space_map_histogram_verify(msp->ms_sm,
2057555d674dSSerapheim Dimitropoulos 		    msp->ms_allocatable));
2058555d674dSSerapheim Dimitropoulos 	}
2059555d674dSSerapheim Dimitropoulos 
2060555d674dSSerapheim Dimitropoulos 	uint64_t weight = msp->ms_weight;
2061555d674dSSerapheim Dimitropoulos 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2062555d674dSSerapheim Dimitropoulos 	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
2063555d674dSSerapheim Dimitropoulos 	uint64_t frag = msp->ms_fragmentation;
2064555d674dSSerapheim Dimitropoulos 	uint64_t max_segsize = msp->ms_max_size;
2065555d674dSSerapheim Dimitropoulos 
2066555d674dSSerapheim Dimitropoulos 	msp->ms_weight = 0;
2067555d674dSSerapheim Dimitropoulos 	msp->ms_fragmentation = 0;
2068555d674dSSerapheim Dimitropoulos 
2069555d674dSSerapheim Dimitropoulos 	/*
2070555d674dSSerapheim Dimitropoulos 	 * This function is used for verification purposes. Regardless of
2071555d674dSSerapheim Dimitropoulos 	 * whether metaslab_weight() thinks this metaslab should be active or
2072555d674dSSerapheim Dimitropoulos 	 * not, we want to ensure that the actual weight (and therefore the
2073555d674dSSerapheim Dimitropoulos 	 * value of ms_weight) would be the same if it was to be recalculated
2074555d674dSSerapheim Dimitropoulos 	 * at this point.
2075555d674dSSerapheim Dimitropoulos 	 */
2076555d674dSSerapheim Dimitropoulos 	msp->ms_weight = metaslab_weight(msp) | was_active;
2077555d674dSSerapheim Dimitropoulos 
2078555d674dSSerapheim Dimitropoulos 	VERIFY3U(max_segsize, ==, msp->ms_max_size);
2079555d674dSSerapheim Dimitropoulos 
2080555d674dSSerapheim Dimitropoulos 	/*
2081555d674dSSerapheim Dimitropoulos 	 * If the weight type changed then there is no point in doing
2082555d674dSSerapheim Dimitropoulos 	 * verification. Revert fields to their original values.
2083555d674dSSerapheim Dimitropoulos 	 */
2084555d674dSSerapheim Dimitropoulos 	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
2085555d674dSSerapheim Dimitropoulos 	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
2086555d674dSSerapheim Dimitropoulos 		msp->ms_fragmentation = frag;
2087555d674dSSerapheim Dimitropoulos 		msp->ms_weight = weight;
2088555d674dSSerapheim Dimitropoulos 		return;
2089555d674dSSerapheim Dimitropoulos 	}
2090555d674dSSerapheim Dimitropoulos 
2091555d674dSSerapheim Dimitropoulos 	VERIFY3U(msp->ms_fragmentation, ==, frag);
2092555d674dSSerapheim Dimitropoulos 	VERIFY3U(msp->ms_weight, ==, weight);
2093555d674dSSerapheim Dimitropoulos }
2094555d674dSSerapheim Dimitropoulos 
2095af1d63abSPaul Dagnelie /*
2096af1d63abSPaul Dagnelie  * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
2097af1d63abSPaul Dagnelie  * this class that was used longest ago, and attempt to unload it.  We don't
2098af1d63abSPaul Dagnelie  * want to spend too much time in this loop to prevent performance
2099af1d63abSPaul Dagnelie  * want to spend too much time in this loop, to prevent performance
2100af1d63abSPaul Dagnelie  * degradation, and we expect that most of the time this operation will
2101af1d63abSPaul Dagnelie  * we expect this to keep the metaslab memory usage under control.
2102af1d63abSPaul Dagnelie  */
2103af1d63abSPaul Dagnelie static void
2104af1d63abSPaul Dagnelie metaslab_potentially_evict(metaslab_class_t *mc)
2105af1d63abSPaul Dagnelie {
2106af1d63abSPaul Dagnelie #ifdef _KERNEL
2107af1d63abSPaul Dagnelie 	uint64_t allmem = arc_all_memory();
21084d7988d6SPaul Dagnelie 	extern kmem_cache_t *zfs_btree_leaf_cache;
21094d7988d6SPaul Dagnelie 	uint64_t inuse = kmem_cache_stat(zfs_btree_leaf_cache, "buf_inuse");
21104d7988d6SPaul Dagnelie 	uint64_t size =  kmem_cache_stat(zfs_btree_leaf_cache, "buf_size");
2111af1d63abSPaul Dagnelie 	int tries = 0;
2112af1d63abSPaul Dagnelie 	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
2113af1d63abSPaul Dagnelie 	    tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
2114af1d63abSPaul Dagnelie 	    tries++) {
2115af1d63abSPaul Dagnelie 		unsigned int idx = multilist_get_random_index(
2116af1d63abSPaul Dagnelie 		    mc->mc_metaslab_txg_list);
2117af1d63abSPaul Dagnelie 		multilist_sublist_t *mls =
2118af1d63abSPaul Dagnelie 		    multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
2119af1d63abSPaul Dagnelie 		metaslab_t *msp = multilist_sublist_head(mls);
2120af1d63abSPaul Dagnelie 		multilist_sublist_unlock(mls);
2121af1d63abSPaul Dagnelie 		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
2122af1d63abSPaul Dagnelie 		    inuse * size) {
2123af1d63abSPaul Dagnelie 			VERIFY3P(mls, ==, multilist_sublist_lock(
2124af1d63abSPaul Dagnelie 			    mc->mc_metaslab_txg_list, idx));
2125af1d63abSPaul Dagnelie 			ASSERT3U(idx, ==,
2126af1d63abSPaul Dagnelie 			    metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
2127af1d63abSPaul Dagnelie 
2128af1d63abSPaul Dagnelie 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
2129af1d63abSPaul Dagnelie 				multilist_sublist_unlock(mls);
2130af1d63abSPaul Dagnelie 				break;
2131af1d63abSPaul Dagnelie 			}
2132af1d63abSPaul Dagnelie 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
2133af1d63abSPaul Dagnelie 			multilist_sublist_unlock(mls);
2134af1d63abSPaul Dagnelie 			/*
2135af1d63abSPaul Dagnelie 			 * If the metaslab is currently loading there are two
2136af1d63abSPaul Dagnelie 			 * cases. If it's the metaslab we're evicting, we
2137af1d63abSPaul Dagnelie 			 * can't continue on or we'll panic when we attempt to
2138af1d63abSPaul Dagnelie 			 * recursively lock the mutex. If it's another
2139af1d63abSPaul Dagnelie 			 * metaslab that's loading, it can be safely skipped,
2140af1d63abSPaul Dagnelie 			 * since we know it's very new and therefore not a
2141af1d63abSPaul Dagnelie 			 * good eviction candidate. We check later once the
2142af1d63abSPaul Dagnelie 			 * lock is held that the metaslab is fully loaded
2143af1d63abSPaul Dagnelie 			 * before actually unloading it.
2144af1d63abSPaul Dagnelie 			 */
2145af1d63abSPaul Dagnelie 			if (msp->ms_loading) {
2146af1d63abSPaul Dagnelie 				msp = next_msp;
21474d7988d6SPaul Dagnelie 				inuse = kmem_cache_stat(zfs_btree_leaf_cache,
2148af1d63abSPaul Dagnelie 				    "buf_inuse");
2149af1d63abSPaul Dagnelie 				continue;
2150af1d63abSPaul Dagnelie 			}
2151af1d63abSPaul Dagnelie 			/*
2152af1d63abSPaul Dagnelie 			 * We can't unload metaslabs with no spacemap because
2153af1d63abSPaul Dagnelie 			 * they're not ready to be unloaded yet. We can't
2154af1d63abSPaul Dagnelie 			 * unload metaslabs with outstanding allocations
2155af1d63abSPaul Dagnelie 			 * because doing so could cause the metaslab's weight
2156af1d63abSPaul Dagnelie 			 * to decrease while it's unloaded, which violates an
2157af1d63abSPaul Dagnelie 			 * invariant that we use to prevent unnecessary
2158af1d63abSPaul Dagnelie 			 * loading. We also don't unload metaslabs that are
2159af1d63abSPaul Dagnelie 			 * currently active because they are high-weight
2160af1d63abSPaul Dagnelie 			 * metaslabs that are likely to be used in the near
2161af1d63abSPaul Dagnelie 			 * future.
2162af1d63abSPaul Dagnelie 			 */
2163af1d63abSPaul Dagnelie 			mutex_enter(&msp->ms_lock);
2164af1d63abSPaul Dagnelie 			if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
2165af1d63abSPaul Dagnelie 			    msp->ms_allocating_total == 0) {
2166af1d63abSPaul Dagnelie 				metaslab_unload(msp);
2167af1d63abSPaul Dagnelie 			}
2168af1d63abSPaul Dagnelie 			mutex_exit(&msp->ms_lock);
2169af1d63abSPaul Dagnelie 			msp = next_msp;
21704d7988d6SPaul Dagnelie 			inuse = kmem_cache_stat(zfs_btree_leaf_cache,
21714d7988d6SPaul Dagnelie 			    "buf_inuse");
2172af1d63abSPaul Dagnelie 		}
2173af1d63abSPaul Dagnelie 	}
2174af1d63abSPaul Dagnelie #endif
2175af1d63abSPaul Dagnelie }
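
/*
 * To put the threshold above in concrete terms (the numbers are only
 * illustrative): eviction proceeds while
 *
 *	inuse * size > allmem * zfs_metaslab_mem_limit / 100
 *
 * i.e. while the btree leaf cache consumes more than the allowed
 * percentage of all memory. With, say, 16 GiB of memory, a 25% limit
 * and 4 KiB leaf buffers, we would start unloading metaslabs once
 * roughly a million leaf buffers (~4 GiB) are in use, and stop as soon
 * as the cache drops back under that line or we run out of tries.
 */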
2176af1d63abSPaul Dagnelie 
2177a0b03b16SSerapheim Dimitropoulos static int
2178a0b03b16SSerapheim Dimitropoulos metaslab_load_impl(metaslab_t *msp)
21790713e232SGeorge Wilson {
21800713e232SGeorge Wilson 	int error = 0;
21810713e232SGeorge Wilson 
21820713e232SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2183a0b03b16SSerapheim Dimitropoulos 	ASSERT(msp->ms_loading);
2184555d674dSSerapheim Dimitropoulos 	ASSERT(!msp->ms_condensing);
21850713e232SGeorge Wilson 
21865cabbc6bSPrashanth Sreenivasa 	/*
2187555d674dSSerapheim Dimitropoulos 	 * We temporarily drop the lock to unblock other operations while we
2188555d674dSSerapheim Dimitropoulos 	 * are reading the space map. Therefore, metaslab_sync() and
2189555d674dSSerapheim Dimitropoulos 	 * metaslab_sync_done() can run at the same time as we do.
2190555d674dSSerapheim Dimitropoulos 	 *
2191814dcd43SSerapheim Dimitropoulos 	 * If we are using the log space maps, metaslab_sync() can't write to
2192814dcd43SSerapheim Dimitropoulos 	 * the metaslab's space map while we are loading as we only write to
2193814dcd43SSerapheim Dimitropoulos 	 * it when we are flushing the metaslab, and that can't happen while
2194814dcd43SSerapheim Dimitropoulos 	 * we are loading it.
2195814dcd43SSerapheim Dimitropoulos 	 *
2196814dcd43SSerapheim Dimitropoulos 	 * If we are not using log space maps though, metaslab_sync() can
2197814dcd43SSerapheim Dimitropoulos 	 * append to the space map while we are loading. Therefore we load
2198814dcd43SSerapheim Dimitropoulos 	 * only entries that existed when we started the load. Additionally,
2199814dcd43SSerapheim Dimitropoulos 	 * metaslab_sync_done() has to wait for the load to complete because
2200814dcd43SSerapheim Dimitropoulos 	 * there are potential races like metaslab_load() loading parts of the
2201814dcd43SSerapheim Dimitropoulos 	 * space map that are currently being appended by metaslab_sync(). If
2202814dcd43SSerapheim Dimitropoulos 	 * we didn't, the ms_allocatable would have entries that
2203814dcd43SSerapheim Dimitropoulos 	 * metaslab_sync_done() would try to re-add later.
2204555d674dSSerapheim Dimitropoulos 	 *
2205555d674dSSerapheim Dimitropoulos 	 * That's why before dropping the lock we remember the synced length
2206555d674dSSerapheim Dimitropoulos 	 * of the metaslab and read up to that point of the space map,
2207555d674dSSerapheim Dimitropoulos 	 * ignoring entries appended by metaslab_sync() that happen after we
2208555d674dSSerapheim Dimitropoulos 	 * drop the lock.
22095cabbc6bSPrashanth Sreenivasa 	 */
2210555d674dSSerapheim Dimitropoulos 	uint64_t length = msp->ms_synced_length;
22115cabbc6bSPrashanth Sreenivasa 	mutex_exit(&msp->ms_lock);
22120713e232SGeorge Wilson 
2213814dcd43SSerapheim Dimitropoulos 	hrtime_t load_start = gethrtime();
22144d7988d6SPaul Dagnelie 	metaslab_rt_arg_t *mrap;
22154d7988d6SPaul Dagnelie 	if (msp->ms_allocatable->rt_arg == NULL) {
22164d7988d6SPaul Dagnelie 		mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
22174d7988d6SPaul Dagnelie 	} else {
22184d7988d6SPaul Dagnelie 		mrap = msp->ms_allocatable->rt_arg;
22194d7988d6SPaul Dagnelie 		msp->ms_allocatable->rt_ops = NULL;
22204d7988d6SPaul Dagnelie 		msp->ms_allocatable->rt_arg = NULL;
22214d7988d6SPaul Dagnelie 	}
22224d7988d6SPaul Dagnelie 	mrap->mra_bt = &msp->ms_allocatable_by_size;
22234d7988d6SPaul Dagnelie 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
22244d7988d6SPaul Dagnelie 
222586714001SSerapheim Dimitropoulos 	if (msp->ms_sm != NULL) {
2226555d674dSSerapheim Dimitropoulos 		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
2227555d674dSSerapheim Dimitropoulos 		    SM_FREE, length);
22284d7988d6SPaul Dagnelie 
22294d7988d6SPaul Dagnelie 		/* Now, populate the size-sorted tree. */
22304d7988d6SPaul Dagnelie 		metaslab_rt_create(msp->ms_allocatable, mrap);
22314d7988d6SPaul Dagnelie 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
22324d7988d6SPaul Dagnelie 		msp->ms_allocatable->rt_arg = mrap;
22334d7988d6SPaul Dagnelie 
22344d7988d6SPaul Dagnelie 		struct mssa_arg arg = {0};
22354d7988d6SPaul Dagnelie 		arg.rt = msp->ms_allocatable;
22364d7988d6SPaul Dagnelie 		arg.mra = mrap;
22374d7988d6SPaul Dagnelie 		range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
22384d7988d6SPaul Dagnelie 		    &arg);
223986714001SSerapheim Dimitropoulos 	} else {
22404d7988d6SPaul Dagnelie 		/*
22414d7988d6SPaul Dagnelie 		 * Add the size-sorted tree first, since we don't need to load
22424d7988d6SPaul Dagnelie 		 * the metaslab from the spacemap.
22434d7988d6SPaul Dagnelie 		 */
22444d7988d6SPaul Dagnelie 		metaslab_rt_create(msp->ms_allocatable, mrap);
22454d7988d6SPaul Dagnelie 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
22464d7988d6SPaul Dagnelie 		msp->ms_allocatable->rt_arg = mrap;
2247555d674dSSerapheim Dimitropoulos 		/*
2248555d674dSSerapheim Dimitropoulos 		 * The space map has not been allocated yet, so treat
2249555d674dSSerapheim Dimitropoulos 		 * all the space in the metaslab as free and add it to the
2250555d674dSSerapheim Dimitropoulos 		 * ms_allocatable tree.
2251555d674dSSerapheim Dimitropoulos 		 */
225286714001SSerapheim Dimitropoulos 		range_tree_add(msp->ms_allocatable,
225386714001SSerapheim Dimitropoulos 		    msp->ms_start, msp->ms_size);
2254814dcd43SSerapheim Dimitropoulos 
2255814dcd43SSerapheim Dimitropoulos 		if (msp->ms_freed != NULL) {
2256814dcd43SSerapheim Dimitropoulos 			/*
2257814dcd43SSerapheim Dimitropoulos 			 * If the ms_sm doesn't exist, this means that this
2258814dcd43SSerapheim Dimitropoulos 			 * metaslab hasn't gone through metaslab_sync() and
2259814dcd43SSerapheim Dimitropoulos 			 * thus has never been dirtied. So we shouldn't
2260814dcd43SSerapheim Dimitropoulos 			 * expect any unflushed allocs or frees from previous
2261814dcd43SSerapheim Dimitropoulos 			 * TXGs.
2262814dcd43SSerapheim Dimitropoulos 			 *
2263814dcd43SSerapheim Dimitropoulos 			 * Note: ms_freed and all the other trees except for
2264814dcd43SSerapheim Dimitropoulos 			 * the ms_allocatable, can be NULL at this point only
2265814dcd43SSerapheim Dimitropoulos 			 * if this is a new metaslab of a vdev that just got
2266814dcd43SSerapheim Dimitropoulos 			 * expanded.
2267814dcd43SSerapheim Dimitropoulos 			 */
2268814dcd43SSerapheim Dimitropoulos 			ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
2269814dcd43SSerapheim Dimitropoulos 			ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
2270814dcd43SSerapheim Dimitropoulos 		}
227186714001SSerapheim Dimitropoulos 	}
22720713e232SGeorge Wilson 
2273555d674dSSerapheim Dimitropoulos 	/*
2274555d674dSSerapheim Dimitropoulos 	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
2275814dcd43SSerapheim Dimitropoulos 	 * changing the ms_sm (or log_sm) and the metaslab's range trees
2276814dcd43SSerapheim Dimitropoulos 	 * while we are about to use them and populate the ms_allocatable.
2277814dcd43SSerapheim Dimitropoulos 	 * The ms_lock is insufficient for this because metaslab_sync() doesn't
2278814dcd43SSerapheim Dimitropoulos 	 * hold the ms_lock while writing the ms_checkpointing tree to disk.
2279555d674dSSerapheim Dimitropoulos 	 */
2280555d674dSSerapheim Dimitropoulos 	mutex_enter(&msp->ms_sync_lock);
22815cabbc6bSPrashanth Sreenivasa 	mutex_enter(&msp->ms_lock);
2282814dcd43SSerapheim Dimitropoulos 
2283555d674dSSerapheim Dimitropoulos 	ASSERT(!msp->ms_condensing);
2284814dcd43SSerapheim Dimitropoulos 	ASSERT(!msp->ms_flushing);
22850713e232SGeorge Wilson 
2286555d674dSSerapheim Dimitropoulos 	if (error != 0) {
2287555d674dSSerapheim Dimitropoulos 		mutex_exit(&msp->ms_sync_lock);
2288a0b03b16SSerapheim Dimitropoulos 		return (error);
2289555d674dSSerapheim Dimitropoulos 	}
22908363e80aSGeorge Wilson 
2291a0b03b16SSerapheim Dimitropoulos 	ASSERT3P(msp->ms_group, !=, NULL);
2292a0b03b16SSerapheim Dimitropoulos 	msp->ms_loaded = B_TRUE;
2293a0b03b16SSerapheim Dimitropoulos 
2294a0b03b16SSerapheim Dimitropoulos 	/*
2295814dcd43SSerapheim Dimitropoulos 	 * Apply all the unflushed changes to ms_allocatable right
2296814dcd43SSerapheim Dimitropoulos 	 * away so any manipulations we do below have a clear view
2297814dcd43SSerapheim Dimitropoulos 	 * of what is allocated and what is free.
2298814dcd43SSerapheim Dimitropoulos 	 */
2299814dcd43SSerapheim Dimitropoulos 	range_tree_walk(msp->ms_unflushed_allocs,
2300814dcd43SSerapheim Dimitropoulos 	    range_tree_remove, msp->ms_allocatable);
2301814dcd43SSerapheim Dimitropoulos 	range_tree_walk(msp->ms_unflushed_frees,
2302814dcd43SSerapheim Dimitropoulos 	    range_tree_add, msp->ms_allocatable);
2303814dcd43SSerapheim Dimitropoulos 
2304814dcd43SSerapheim Dimitropoulos 	msp->ms_loaded = B_TRUE;
2305814dcd43SSerapheim Dimitropoulos 
2306814dcd43SSerapheim Dimitropoulos 	ASSERT3P(msp->ms_group, !=, NULL);
2307814dcd43SSerapheim Dimitropoulos 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2308814dcd43SSerapheim Dimitropoulos 	if (spa_syncing_log_sm(spa) != NULL) {
2309814dcd43SSerapheim Dimitropoulos 		ASSERT(spa_feature_is_enabled(spa,
2310814dcd43SSerapheim Dimitropoulos 		    SPA_FEATURE_LOG_SPACEMAP));
2311814dcd43SSerapheim Dimitropoulos 
2312814dcd43SSerapheim Dimitropoulos 		/*
2313814dcd43SSerapheim Dimitropoulos 		 * If we use a log space map we add all the segments
2314814dcd43SSerapheim Dimitropoulos 		 * that are in ms_unflushed_frees so they are available
2315814dcd43SSerapheim Dimitropoulos 		 * for allocation.
2316814dcd43SSerapheim Dimitropoulos 		 *
2317814dcd43SSerapheim Dimitropoulos 		 * ms_allocatable needs to contain all free segments
2318814dcd43SSerapheim Dimitropoulos 		 * that are ready for allocations (thus not segments
2319814dcd43SSerapheim Dimitropoulos 		 * from ms_freeing, ms_freed, and the ms_defer trees).
2320814dcd43SSerapheim Dimitropoulos 		 * But if we grab the lock in this code path at a sync
2321814dcd43SSerapheim Dimitropoulos 		 * pass later than 1, then it also contains the
2322814dcd43SSerapheim Dimitropoulos 		 * segments of ms_freed (they were added to it earlier
2323814dcd43SSerapheim Dimitropoulos 		 * in this path through ms_unflushed_frees). So we
2324814dcd43SSerapheim Dimitropoulos 		 * need to remove all the segments that exist in
2325814dcd43SSerapheim Dimitropoulos 		 * ms_freed from ms_allocatable as they will be added
2326814dcd43SSerapheim Dimitropoulos 		 * later in metaslab_sync_done().
2327814dcd43SSerapheim Dimitropoulos 		 *
2328814dcd43SSerapheim Dimitropoulos 		 * When there's no log space map, the ms_allocatable
2329814dcd43SSerapheim Dimitropoulos 		 * correctly doesn't contain any segments that exist
2330814dcd43SSerapheim Dimitropoulos 		 * in ms_freed [see ms_synced_length].
2331814dcd43SSerapheim Dimitropoulos 		 */
2332814dcd43SSerapheim Dimitropoulos 		range_tree_walk(msp->ms_freed,
2333814dcd43SSerapheim Dimitropoulos 		    range_tree_remove, msp->ms_allocatable);
2334814dcd43SSerapheim Dimitropoulos 	}
2335814dcd43SSerapheim Dimitropoulos 
2336814dcd43SSerapheim Dimitropoulos 	/*
2337814dcd43SSerapheim Dimitropoulos 	 * If we are not using the log space map, ms_allocatable
2338814dcd43SSerapheim Dimitropoulos 	 * contains the segments that exist in the ms_defer trees
2339814dcd43SSerapheim Dimitropoulos 	 * [see ms_synced_length]. Thus we need to remove them
2340814dcd43SSerapheim Dimitropoulos 	 * from ms_allocatable as they will be added again in
2341555d674dSSerapheim Dimitropoulos 	 * metaslab_sync_done().
2342814dcd43SSerapheim Dimitropoulos 	 *
2343814dcd43SSerapheim Dimitropoulos 	 * If we are using the log space map, ms_allocatable still
2344814dcd43SSerapheim Dimitropoulos 	 * contains the segments that exist in the ms_defer trees.
2345814dcd43SSerapheim Dimitropoulos 	 * Not because it read them through the ms_sm though. But
2346814dcd43SSerapheim Dimitropoulos 	 * because these segments are part of ms_unflushed_frees
2347814dcd43SSerapheim Dimitropoulos 	 * whose segments we add to ms_allocatable earlier in this
2348814dcd43SSerapheim Dimitropoulos 	 * code path.
2349a0b03b16SSerapheim Dimitropoulos 	 */
2350555d674dSSerapheim Dimitropoulos 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2351555d674dSSerapheim Dimitropoulos 		range_tree_walk(msp->ms_defer[t],
2352555d674dSSerapheim Dimitropoulos 		    range_tree_remove, msp->ms_allocatable);
23530713e232SGeorge Wilson 	}
2354555d674dSSerapheim Dimitropoulos 
2355555d674dSSerapheim Dimitropoulos 	/*
2356555d674dSSerapheim Dimitropoulos 	 * Call metaslab_recalculate_weight_and_sort() now that the
2357555d674dSSerapheim Dimitropoulos 	 * metaslab is loaded so we get the metaslab's real weight.
2358555d674dSSerapheim Dimitropoulos 	 *
2359555d674dSSerapheim Dimitropoulos 	 * Unless this metaslab was created with older software and
2360555d674dSSerapheim Dimitropoulos 	 * has not yet been converted to use segment-based weight, we
2361555d674dSSerapheim Dimitropoulos 	 * expect the new weight to be better or equal to the weight
2362555d674dSSerapheim Dimitropoulos 	 * that the metaslab had while it was not loaded. This is
2363555d674dSSerapheim Dimitropoulos 	 * because the old weight does not take into account the
2364555d674dSSerapheim Dimitropoulos 	 * consolidation of adjacent segments between TXGs. [see
2365555d674dSSerapheim Dimitropoulos 	 * comment for ms_synchist and ms_deferhist[] for more info]
2366555d674dSSerapheim Dimitropoulos 	 */
2367555d674dSSerapheim Dimitropoulos 	uint64_t weight = msp->ms_weight;
2368af1d63abSPaul Dagnelie 	uint64_t max_size = msp->ms_max_size;
2369555d674dSSerapheim Dimitropoulos 	metaslab_recalculate_weight_and_sort(msp);
2370555d674dSSerapheim Dimitropoulos 	if (!WEIGHT_IS_SPACEBASED(weight))
2371555d674dSSerapheim Dimitropoulos 		ASSERT3U(weight, <=, msp->ms_weight);
2372af1d63abSPaul Dagnelie 	msp->ms_max_size = metaslab_largest_allocatable(msp);
2373af1d63abSPaul Dagnelie 	ASSERT3U(max_size, <=, msp->ms_max_size);
2374814dcd43SSerapheim Dimitropoulos 	hrtime_t load_end = gethrtime();
23759cfcc091SToomas Soome 	msp->ms_load_time = load_end;
2376814dcd43SSerapheim Dimitropoulos 	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
2377814dcd43SSerapheim Dimitropoulos 		zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
2378814dcd43SSerapheim Dimitropoulos 		    "ms_id %llu, smp_length %llu, "
2379814dcd43SSerapheim Dimitropoulos 		    "unflushed_allocs %llu, unflushed_frees %llu, "
2380814dcd43SSerapheim Dimitropoulos 		    "freed %llu, defer %llu + %llu, "
2381af1d63abSPaul Dagnelie 		    "loading_time %lld ms, ms_max_size %llu, "
2382af1d63abSPaul Dagnelie 		    "max size error %llu",
2383814dcd43SSerapheim Dimitropoulos 		    spa_syncing_txg(spa), spa_name(spa),
2384814dcd43SSerapheim Dimitropoulos 		    msp->ms_group->mg_vd->vdev_id, msp->ms_id,
2385814dcd43SSerapheim Dimitropoulos 		    space_map_length(msp->ms_sm),
2386814dcd43SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_unflushed_allocs),
2387814dcd43SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_unflushed_frees),
2388814dcd43SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_freed),
2389814dcd43SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_defer[0]),
2390814dcd43SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_defer[1]),
2391af1d63abSPaul Dagnelie 		    (longlong_t)((load_end - load_start) / 1000000),
2392af1d63abSPaul Dagnelie 		    msp->ms_max_size, msp->ms_max_size - max_size);
2393814dcd43SSerapheim Dimitropoulos 	}
2394814dcd43SSerapheim Dimitropoulos 
2395555d674dSSerapheim Dimitropoulos 	metaslab_verify_space(msp, spa_syncing_txg(spa));
2396555d674dSSerapheim Dimitropoulos 	mutex_exit(&msp->ms_sync_lock);
2397a0b03b16SSerapheim Dimitropoulos 	return (0);
2398a0b03b16SSerapheim Dimitropoulos }
2399a0b03b16SSerapheim Dimitropoulos 
2400a0b03b16SSerapheim Dimitropoulos int
2401a0b03b16SSerapheim Dimitropoulos metaslab_load(metaslab_t *msp)
2402a0b03b16SSerapheim Dimitropoulos {
2403a0b03b16SSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2404a0b03b16SSerapheim Dimitropoulos 
2405a0b03b16SSerapheim Dimitropoulos 	/*
2406a0b03b16SSerapheim Dimitropoulos 	 * There may be another thread loading the same metaslab, if that's
2407a0b03b16SSerapheim Dimitropoulos 	 * the case just wait until the other thread is done and return.
2408a0b03b16SSerapheim Dimitropoulos 	 */
2409a0b03b16SSerapheim Dimitropoulos 	metaslab_load_wait(msp);
2410a0b03b16SSerapheim Dimitropoulos 	if (msp->ms_loaded)
2411a0b03b16SSerapheim Dimitropoulos 		return (0);
2412a0b03b16SSerapheim Dimitropoulos 	VERIFY(!msp->ms_loading);
2413555d674dSSerapheim Dimitropoulos 	ASSERT(!msp->ms_condensing);
2414a0b03b16SSerapheim Dimitropoulos 
2415814dcd43SSerapheim Dimitropoulos 	/*
2416814dcd43SSerapheim Dimitropoulos 	 * We set the loading flag BEFORE potentially dropping the lock to
2417814dcd43SSerapheim Dimitropoulos 	 * wait for an ongoing flush (see ms_flushing below). This way other
2418814dcd43SSerapheim Dimitropoulos 	 * threads know that there is already a thread that is loading this
2419814dcd43SSerapheim Dimitropoulos 	 * metaslab.
2420814dcd43SSerapheim Dimitropoulos 	 */
2421a0b03b16SSerapheim Dimitropoulos 	msp->ms_loading = B_TRUE;
2422814dcd43SSerapheim Dimitropoulos 
2423814dcd43SSerapheim Dimitropoulos 	/*
2424814dcd43SSerapheim Dimitropoulos 	 * Wait for any in-progress flushing to finish as we drop the ms_lock
2425814dcd43SSerapheim Dimitropoulos 	 * both here (during space_map_load()) and in metaslab_flush() (when
2426814dcd43SSerapheim Dimitropoulos 	 * we flush our changes to the ms_sm).
2427814dcd43SSerapheim Dimitropoulos 	 */
2428814dcd43SSerapheim Dimitropoulos 	if (msp->ms_flushing)
2429814dcd43SSerapheim Dimitropoulos 		metaslab_flush_wait(msp);
2430814dcd43SSerapheim Dimitropoulos 
2431814dcd43SSerapheim Dimitropoulos 	/*
2432814dcd43SSerapheim Dimitropoulos 	 * In the event that we were waiting for the metaslab to be
2433814dcd43SSerapheim Dimitropoulos 	 * flushed (where we temporarily dropped the ms_lock), ensure that
2434814dcd43SSerapheim Dimitropoulos 	 * no one else loaded the metaslab somehow.
2435814dcd43SSerapheim Dimitropoulos 	 */
2436814dcd43SSerapheim Dimitropoulos 	ASSERT(!msp->ms_loaded);
2437814dcd43SSerapheim Dimitropoulos 
2438af1d63abSPaul Dagnelie 	/*
2439af1d63abSPaul Dagnelie 	 * If we're loading a metaslab in the normal class, consider evicting
2440af1d63abSPaul Dagnelie 	 * another one to keep our memory usage under the limit defined by the
2441af1d63abSPaul Dagnelie 	 * zfs_metaslab_mem_limit tunable.
2442af1d63abSPaul Dagnelie 	 */
2443af1d63abSPaul Dagnelie 	if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2444af1d63abSPaul Dagnelie 	    msp->ms_group->mg_class) {
2445af1d63abSPaul Dagnelie 		metaslab_potentially_evict(msp->ms_group->mg_class);
2446af1d63abSPaul Dagnelie 	}
2447af1d63abSPaul Dagnelie 
2448a0b03b16SSerapheim Dimitropoulos 	int error = metaslab_load_impl(msp);
2449814dcd43SSerapheim Dimitropoulos 
2450814dcd43SSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2451a0b03b16SSerapheim Dimitropoulos 	msp->ms_loading = B_FALSE;
24520713e232SGeorge Wilson 	cv_broadcast(&msp->ms_load_cv);
2453a0b03b16SSerapheim Dimitropoulos 
24540713e232SGeorge Wilson 	return (error);
24550713e232SGeorge Wilson }
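
/*
 * A sketch of the expected caller pattern (see e.g. metaslab_activate()
 * for an actual user):
 *
 *	mutex_enter(&msp->ms_lock);
 *	error = metaslab_load(msp);
 *	if (error == 0)
 *		... allocate from msp->ms_allocatable ...
 *	mutex_exit(&msp->ms_lock);
 *
 * metaslab_load() returns 0 immediately if the metaslab is already
 * loaded. It may drop and reacquire ms_lock internally (while waiting
 * for a flush and while reading the space map), but it always returns
 * with the lock held and ms_loading cleared.
 */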
24560713e232SGeorge Wilson 
24570713e232SGeorge Wilson void
24580713e232SGeorge Wilson metaslab_unload(metaslab_t *msp)
24590713e232SGeorge Wilson {
24600713e232SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2461555d674dSSerapheim Dimitropoulos 
2462af1d63abSPaul Dagnelie 	/*
2463af1d63abSPaul Dagnelie 	 * This can happen if a metaslab is selected for eviction (in
2464af1d63abSPaul Dagnelie 	 * metaslab_potentially_evict) and then unloaded during spa_sync (via
2465af1d63abSPaul Dagnelie 	 * metaslab_class_evict_old).
2466af1d63abSPaul Dagnelie 	 */
2467af1d63abSPaul Dagnelie 	if (!msp->ms_loaded)
2468af1d63abSPaul Dagnelie 		return;
2469555d674dSSerapheim Dimitropoulos 
247086714001SSerapheim Dimitropoulos 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
24710713e232SGeorge Wilson 	msp->ms_loaded = B_FALSE;
2472af1d63abSPaul Dagnelie 	msp->ms_unload_time = gethrtime();
2473555d674dSSerapheim Dimitropoulos 
2474af1d63abSPaul Dagnelie 	msp->ms_activation_weight = 0;
24750713e232SGeorge Wilson 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
2476af1d63abSPaul Dagnelie 
2477af1d63abSPaul Dagnelie 	if (msp->ms_group != NULL) {
2478af1d63abSPaul Dagnelie 		metaslab_class_t *mc = msp->ms_group->mg_class;
2479af1d63abSPaul Dagnelie 		multilist_sublist_t *mls =
2480af1d63abSPaul Dagnelie 		    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2481af1d63abSPaul Dagnelie 		if (multilist_link_active(&msp->ms_class_txg_node))
2482af1d63abSPaul Dagnelie 			multilist_sublist_remove(mls, msp);
2483af1d63abSPaul Dagnelie 		multilist_sublist_unlock(mls);
2484af1d63abSPaul Dagnelie 	}
2485555d674dSSerapheim Dimitropoulos 
2486555d674dSSerapheim Dimitropoulos 	/*
2487555d674dSSerapheim Dimitropoulos 	 * We explicitly recalculate the metaslab's weight based on its space
2488555d674dSSerapheim Dimitropoulos 	 * map (as it is now not loaded). We want unloaded metaslabs to always
2489555d674dSSerapheim Dimitropoulos 	 * have their weights calculated from the space map histograms, while
2490555d674dSSerapheim Dimitropoulos 	 * loaded ones have it calculated from their in-core range tree
2491555d674dSSerapheim Dimitropoulos 	 * [see metaslab_load()]. This way, the weight reflects the information
2492814dcd43SSerapheim Dimitropoulos 	 * available in-core, whether it is loaded or not.
2493555d674dSSerapheim Dimitropoulos 	 *
2494555d674dSSerapheim Dimitropoulos 	 * If ms_group == NULL, it means that we came here from metaslab_fini(),
2495555d674dSSerapheim Dimitropoulos 	 * at which point it doesn't make sense for us to do the recalculation
2496555d674dSSerapheim Dimitropoulos 	 * and the sorting.
2497555d674dSSerapheim Dimitropoulos 	 */
2498555d674dSSerapheim Dimitropoulos 	if (msp->ms_group != NULL)
2499555d674dSSerapheim Dimitropoulos 		metaslab_recalculate_weight_and_sort(msp);
25000713e232SGeorge Wilson }
25010713e232SGeorge Wilson 
25024d7988d6SPaul Dagnelie /*
25034d7988d6SPaul Dagnelie  * We want to optimize the memory use of the per-metaslab range
25044d7988d6SPaul Dagnelie  * trees. To do this, we store the segments in the range trees in
25054d7988d6SPaul Dagnelie  * units of sectors, zero-indexing from the start of the metaslab. If
25064d7988d6SPaul Dagnelie  * vdev_ms_shift - vdev_ashift is less than 32, we can store
25074d7988d6SPaul Dagnelie  * the ranges using two uint32_ts, rather than two uint64_ts.
25084d7988d6SPaul Dagnelie  */
25094d7988d6SPaul Dagnelie static range_seg_type_t
25104d7988d6SPaul Dagnelie metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
25114d7988d6SPaul Dagnelie     uint64_t *start, uint64_t *shift)
25124d7988d6SPaul Dagnelie {
25134d7988d6SPaul Dagnelie 	if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
25144d7988d6SPaul Dagnelie 	    !zfs_metaslab_force_large_segs) {
25154d7988d6SPaul Dagnelie 		*shift = vdev->vdev_ashift;
25164d7988d6SPaul Dagnelie 		*start = msp->ms_start;
25174d7988d6SPaul Dagnelie 		return (RANGE_SEG32);
25184d7988d6SPaul Dagnelie 	} else {
25194d7988d6SPaul Dagnelie 		*shift = 0;
25204d7988d6SPaul Dagnelie 		*start = 0;
25214d7988d6SPaul Dagnelie 		return (RANGE_SEG64);
25224d7988d6SPaul Dagnelie 	}
25234d7988d6SPaul Dagnelie }
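
/*
 * For example, a 16 GiB metaslab (vdev_ms_shift of 34) on an ashift-12
 * vdev gives 34 - 12 = 22 < 32, so its range trees use RANGE_SEG32:
 * each segment is stored as two 32-bit offsets counted in 4 KiB sectors
 * relative to ms_start, rather than two 64-bit byte offsets. Only a
 * metaslab spanning at least 2^(ashift + 32) bytes (2 TiB at ashift 9)
 * would require RANGE_SEG64, as would setting
 * zfs_metaslab_force_large_segs.
 */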
25244d7988d6SPaul Dagnelie 
2525af1d63abSPaul Dagnelie void
2526af1d63abSPaul Dagnelie metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
2527af1d63abSPaul Dagnelie {
2528af1d63abSPaul Dagnelie 	ASSERT(MUTEX_HELD(&msp->ms_lock));
2529af1d63abSPaul Dagnelie 	metaslab_class_t *mc = msp->ms_group->mg_class;
2530af1d63abSPaul Dagnelie 	multilist_sublist_t *mls =
2531af1d63abSPaul Dagnelie 	    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2532af1d63abSPaul Dagnelie 	if (multilist_link_active(&msp->ms_class_txg_node))
2533af1d63abSPaul Dagnelie 		multilist_sublist_remove(mls, msp);
2534af1d63abSPaul Dagnelie 	msp->ms_selected_txg = txg;
2535af1d63abSPaul Dagnelie 	msp->ms_selected_time = gethrtime();
2536af1d63abSPaul Dagnelie 	multilist_sublist_insert_tail(mls, msp);
2537af1d63abSPaul Dagnelie 	multilist_sublist_unlock(mls);
2538af1d63abSPaul Dagnelie }
2539af1d63abSPaul Dagnelie 
2540814dcd43SSerapheim Dimitropoulos void
2541663207adSDon Brady metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
2542663207adSDon Brady     int64_t defer_delta, int64_t space_delta)
2543663207adSDon Brady {
2544663207adSDon Brady 	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
2545663207adSDon Brady 
2546663207adSDon Brady 	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2547663207adSDon Brady 	ASSERT(vd->vdev_ms_count != 0);
2548663207adSDon Brady 
2549663207adSDon Brady 	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
2550663207adSDon Brady 	    vdev_deflated_space(vd, space_delta));
2551663207adSDon Brady }
2552663207adSDon Brady 
25531e9bd7ecSPrakash Surya int
2554814dcd43SSerapheim Dimitropoulos metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
2555814dcd43SSerapheim Dimitropoulos     uint64_t txg, metaslab_t **msp)
2556fa9e4066Sahrens {
2557fa9e4066Sahrens 	vdev_t *vd = mg->mg_vd;
2558663207adSDon Brady 	spa_t *spa = vd->vdev_spa;
2559663207adSDon Brady 	objset_t *mos = spa->spa_meta_objset;
25601e9bd7ecSPrakash Surya 	metaslab_t *ms;
25611e9bd7ecSPrakash Surya 	int error;
2562fa9e4066Sahrens 
25631e9bd7ecSPrakash Surya 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
25641e9bd7ecSPrakash Surya 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
25655cabbc6bSPrashanth Sreenivasa 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
25661e9bd7ecSPrakash Surya 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
2567814dcd43SSerapheim Dimitropoulos 	cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
2568af1d63abSPaul Dagnelie 	multilist_link_init(&ms->ms_class_txg_node);
2569094e47e9SGeorge Wilson 
25701e9bd7ecSPrakash Surya 	ms->ms_id = id;
25711e9bd7ecSPrakash Surya 	ms->ms_start = id << vd->vdev_ms_shift;
25721e9bd7ecSPrakash Surya 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
2573f78cdc34SPaul Dagnelie 	ms->ms_allocator = -1;
2574f78cdc34SPaul Dagnelie 	ms->ms_new = B_TRUE;
2575fa9e4066Sahrens 
25760713e232SGeorge Wilson 	/*
25770713e232SGeorge Wilson 	 * We only open space map objects that already exist. All others
25780713e232SGeorge Wilson 	 * will be opened when we finally allocate an object for it.
2579555d674dSSerapheim Dimitropoulos 	 *
2580555d674dSSerapheim Dimitropoulos 	 * Note:
2581555d674dSSerapheim Dimitropoulos 	 * When called from vdev_expand(), we can't call into the DMU as
2582555d674dSSerapheim Dimitropoulos 	 * we are holding the spa_config_lock as a writer and we would
2583555d674dSSerapheim Dimitropoulos 	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
2584555d674dSSerapheim Dimitropoulos 	 * that case, the object parameter is zero though, so we won't
2585555d674dSSerapheim Dimitropoulos 	 * call into the DMU.
25860713e232SGeorge Wilson 	 */
25870713e232SGeorge Wilson 	if (object != 0) {
25881e9bd7ecSPrakash Surya 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
25895cabbc6bSPrashanth Sreenivasa 		    ms->ms_size, vd->vdev_ashift);
25901e9bd7ecSPrakash Surya 
25911e9bd7ecSPrakash Surya 		if (error != 0) {
25921e9bd7ecSPrakash Surya 			kmem_free(ms, sizeof (metaslab_t));
25931e9bd7ecSPrakash Surya 			return (error);
25941e9bd7ecSPrakash Surya 		}
25951e9bd7ecSPrakash Surya 
25961e9bd7ecSPrakash Surya 		ASSERT(ms->ms_sm != NULL);
2597555d674dSSerapheim Dimitropoulos 		ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
2598555d674dSSerapheim Dimitropoulos 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
25990713e232SGeorge Wilson 	}
2600fa9e4066Sahrens 
26014d7988d6SPaul Dagnelie 	range_seg_type_t type;
26024d7988d6SPaul Dagnelie 	uint64_t shift, start;
26034d7988d6SPaul Dagnelie 	type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
26044d7988d6SPaul Dagnelie 
2605ecc2d604Sbonwick 	/*
2606555d674dSSerapheim Dimitropoulos 	 * We create the ms_allocatable here, but we don't create the
26075f145778SMatthew Ahrens 	 * other range trees until metaslab_sync_done().  This serves
2608ecc2d604Sbonwick 	 * two purposes: it allows metaslab_sync_done() to detect the
2609555d674dSSerapheim Dimitropoulos 	 * addition of new space; and for debugging, it ensures that
2610555d674dSSerapheim Dimitropoulos 	 * we'd data fault on any attempt to use this metaslab before
2611555d674dSSerapheim Dimitropoulos 	 * it's ready.
2612ecc2d604Sbonwick 	 */
26134d7988d6SPaul Dagnelie 	ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
2614fa9e4066Sahrens 
26154d7988d6SPaul Dagnelie 	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
2616084fd14fSBrian Behlendorf 
2617084fd14fSBrian Behlendorf 	metaslab_group_add(mg, ms);
26188363e80aSGeorge Wilson 	metaslab_set_fragmentation(ms);
2619b24ab676SJeff Bonwick 
2620fa9e4066Sahrens 	/*
2621fa9e4066Sahrens 	 * If we're opening an existing pool (txg == 0) or creating
2622fa9e4066Sahrens 	 * a new one (txg == TXG_INITIAL), all space is available now.
2623fa9e4066Sahrens 	 * If we're adding space to an existing pool, the new space
2624fa9e4066Sahrens 	 * does not become available until after this txg has synced.
26258363e80aSGeorge Wilson 	 * The metaslab's weight will also be initialized when we sync
26268363e80aSGeorge Wilson 	 * out this txg. This ensures that we don't attempt to allocate
26278363e80aSGeorge Wilson 	 * from it before we have initialized it completely.
2628fa9e4066Sahrens 	 */
2629555d674dSSerapheim Dimitropoulos 	if (txg <= TXG_INITIAL) {
26301e9bd7ecSPrakash Surya 		metaslab_sync_done(ms, 0);
2631555d674dSSerapheim Dimitropoulos 		metaslab_space_update(vd, mg->mg_class,
2632555d674dSSerapheim Dimitropoulos 		    metaslab_allocated_space(ms), 0, 0);
2633555d674dSSerapheim Dimitropoulos 	}
2634ecc2d604Sbonwick 
2635ecc2d604Sbonwick 	if (txg != 0) {
2636ecc2d604Sbonwick 		vdev_dirty(vd, 0, NULL, txg);
26371e9bd7ecSPrakash Surya 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
2638fa9e4066Sahrens 	}
2639fa9e4066Sahrens 
26401e9bd7ecSPrakash Surya 	*msp = ms;
26411e9bd7ecSPrakash Surya 
26421e9bd7ecSPrakash Surya 	return (0);
2643fa9e4066Sahrens }
2644fa9e4066Sahrens 
2645814dcd43SSerapheim Dimitropoulos static void
2646814dcd43SSerapheim Dimitropoulos metaslab_fini_flush_data(metaslab_t *msp)
2647814dcd43SSerapheim Dimitropoulos {
2648814dcd43SSerapheim Dimitropoulos 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2649814dcd43SSerapheim Dimitropoulos 
2650814dcd43SSerapheim Dimitropoulos 	if (metaslab_unflushed_txg(msp) == 0) {
2651814dcd43SSerapheim Dimitropoulos 		ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2652814dcd43SSerapheim Dimitropoulos 		    ==, NULL);
2653814dcd43SSerapheim Dimitropoulos 		return;
2654814dcd43SSerapheim Dimitropoulos 	}
2655814dcd43SSerapheim Dimitropoulos 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
2656814dcd43SSerapheim Dimitropoulos 
2657814dcd43SSerapheim Dimitropoulos 	mutex_enter(&spa->spa_flushed_ms_lock);
2658814dcd43SSerapheim Dimitropoulos 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2659814dcd43SSerapheim Dimitropoulos 	mutex_exit(&spa->spa_flushed_ms_lock);
2660814dcd43SSerapheim Dimitropoulos 
2661814dcd43SSerapheim Dimitropoulos 	spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2662814dcd43SSerapheim Dimitropoulos 	spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2663814dcd43SSerapheim Dimitropoulos }
2664814dcd43SSerapheim Dimitropoulos 
2665814dcd43SSerapheim Dimitropoulos uint64_t
2666814dcd43SSerapheim Dimitropoulos metaslab_unflushed_changes_memused(metaslab_t *ms)
2667814dcd43SSerapheim Dimitropoulos {
2668814dcd43SSerapheim Dimitropoulos 	return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2669814dcd43SSerapheim Dimitropoulos 	    range_tree_numsegs(ms->ms_unflushed_frees)) *
26704d7988d6SPaul Dagnelie 	    ms->ms_unflushed_allocs->rt_root.bt_elem_size);
2671814dcd43SSerapheim Dimitropoulos }
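
/*
 * For a rough sense of scale (illustrative): a metaslab carrying a
 * million unflushed segments across the two trees charges
 * 1,000,000 * bt_elem_size bytes to spa_unflushed_stats.sus_memused,
 * which is on the order of 8 MB with RANGE_SEG32 elements or 16 MB
 * with RANGE_SEG64. The log spacemap code uses this total to decide
 * how many metaslabs need flushing to keep unflushed state bounded.
 */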
2672814dcd43SSerapheim Dimitropoulos 
2673fa9e4066Sahrens void
2674fa9e4066Sahrens metaslab_fini(metaslab_t *msp)
2675fa9e4066Sahrens {
2676fa9e4066Sahrens 	metaslab_group_t *mg = msp->ms_group;
2677663207adSDon Brady 	vdev_t *vd = mg->mg_vd;
2678814dcd43SSerapheim Dimitropoulos 	spa_t *spa = vd->vdev_spa;
2679814dcd43SSerapheim Dimitropoulos 
2680814dcd43SSerapheim Dimitropoulos 	metaslab_fini_flush_data(msp);
2681fa9e4066Sahrens 
2682fa9e4066Sahrens 	metaslab_group_remove(mg, msp);
2683fa9e4066Sahrens 
2684fa9e4066Sahrens 	mutex_enter(&msp->ms_lock);
26850713e232SGeorge Wilson 	VERIFY(msp->ms_group == NULL);
2686663207adSDon Brady 	metaslab_space_update(vd, mg->mg_class,
2687555d674dSSerapheim Dimitropoulos 	    -metaslab_allocated_space(msp), 0, -msp->ms_size);
2688663207adSDon Brady 
26890713e232SGeorge Wilson 	space_map_close(msp->ms_sm);
2690814dcd43SSerapheim Dimitropoulos 	msp->ms_sm = NULL;
26910713e232SGeorge Wilson 
26920713e232SGeorge Wilson 	metaslab_unload(msp);
269386714001SSerapheim Dimitropoulos 	range_tree_destroy(msp->ms_allocatable);
269486714001SSerapheim Dimitropoulos 	range_tree_destroy(msp->ms_freeing);
269586714001SSerapheim Dimitropoulos 	range_tree_destroy(msp->ms_freed);
2696fa9e4066Sahrens 
2697814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2698814dcd43SSerapheim Dimitropoulos 	    metaslab_unflushed_changes_memused(msp));
2699814dcd43SSerapheim Dimitropoulos 	spa->spa_unflushed_stats.sus_memused -=
2700814dcd43SSerapheim Dimitropoulos 	    metaslab_unflushed_changes_memused(msp);
2701814dcd43SSerapheim Dimitropoulos 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2702814dcd43SSerapheim Dimitropoulos 	range_tree_destroy(msp->ms_unflushed_allocs);
2703814dcd43SSerapheim Dimitropoulos 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2704814dcd43SSerapheim Dimitropoulos 	range_tree_destroy(msp->ms_unflushed_frees);
2705814dcd43SSerapheim Dimitropoulos 
2706468c413aSTim Haley 	for (int t = 0; t < TXG_SIZE; t++) {
270786714001SSerapheim Dimitropoulos 		range_tree_destroy(msp->ms_allocating[t]);
2708fa9e4066Sahrens 	}
2709fa9e4066Sahrens 
271016a4a807SGeorge Wilson 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
271186714001SSerapheim Dimitropoulos 		range_tree_destroy(msp->ms_defer[t]);
271216a4a807SGeorge Wilson 	}
2713fb09f5aaSMadhav Suresh 	ASSERT0(msp->ms_deferspace);
2714468c413aSTim Haley 
271586714001SSerapheim Dimitropoulos 	range_tree_destroy(msp->ms_checkpointing);
271686714001SSerapheim Dimitropoulos 
2717555d674dSSerapheim Dimitropoulos 	for (int t = 0; t < TXG_SIZE; t++)
2718555d674dSSerapheim Dimitropoulos 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2719555d674dSSerapheim Dimitropoulos 
2720084fd14fSBrian Behlendorf 	range_tree_vacate(msp->ms_trim, NULL, NULL);
2721084fd14fSBrian Behlendorf 	range_tree_destroy(msp->ms_trim);
2722084fd14fSBrian Behlendorf 
2723fa9e4066Sahrens 	mutex_exit(&msp->ms_lock);
27240713e232SGeorge Wilson 	cv_destroy(&msp->ms_load_cv);
2725814dcd43SSerapheim Dimitropoulos 	cv_destroy(&msp->ms_flush_cv);
27265ad82045Snd 	mutex_destroy(&msp->ms_lock);
27275cabbc6bSPrashanth Sreenivasa 	mutex_destroy(&msp->ms_sync_lock);
2728f78cdc34SPaul Dagnelie 	ASSERT3U(msp->ms_allocator, ==, -1);
2729fa9e4066Sahrens 
2730fa9e4066Sahrens 	kmem_free(msp, sizeof (metaslab_t));
2731fa9e4066Sahrens }
2732fa9e4066Sahrens 
27332e4c9986SGeorge Wilson #define	FRAGMENTATION_TABLE_SIZE	17
27342e4c9986SGeorge Wilson 
27350713e232SGeorge Wilson /*
27362e4c9986SGeorge Wilson  * This table defines a segment size based fragmentation metric that will
27372e4c9986SGeorge Wilson  * allow each metaslab to derive its own fragmentation value. This is done
27382e4c9986SGeorge Wilson  * by calculating the space in each bucket of the spacemap histogram and
2739555d674dSSerapheim Dimitropoulos  * multiplying that by the fragmentation metric in this table. Doing
27402e4c9986SGeorge Wilson  * this for all buckets and dividing it by the total amount of free
27412e4c9986SGeorge Wilson  * space in this metaslab (i.e. the total free space in all buckets) gives
27422e4c9986SGeorge Wilson  * us the fragmentation metric. This means that a high fragmentation metric
27432e4c9986SGeorge Wilson  * equates to most of the free space being comprised of small segments.
27442e4c9986SGeorge Wilson  * Conversely, if the metric is low, then most of the free space is in
27452e4c9986SGeorge Wilson  * large segments. A 10% change in fragmentation equates to approximately
27462e4c9986SGeorge Wilson  * double the number of segments.
27470713e232SGeorge Wilson  *
27482e4c9986SGeorge Wilson  * This table defines 0% fragmented space using 16MB segments. Testing has
27492e4c9986SGeorge Wilson  * shown that segments that are greater than or equal to 16MB do not suffer
27502e4c9986SGeorge Wilson  * from drastic performance problems. Using this value, we derive the rest
27512e4c9986SGeorge Wilson  * of the table. Since the fragmentation value is never stored on disk, it
27522e4c9986SGeorge Wilson  * is possible to change these calculations in the future.
27532e4c9986SGeorge Wilson  */
27542e4c9986SGeorge Wilson int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
27552e4c9986SGeorge Wilson 	100,	/* 512B	*/
27562e4c9986SGeorge Wilson 	100,	/* 1K	*/
27572e4c9986SGeorge Wilson 	98,	/* 2K	*/
27582e4c9986SGeorge Wilson 	95,	/* 4K	*/
27592e4c9986SGeorge Wilson 	90,	/* 8K	*/
27602e4c9986SGeorge Wilson 	80,	/* 16K	*/
27612e4c9986SGeorge Wilson 	70,	/* 32K	*/
27622e4c9986SGeorge Wilson 	60,	/* 64K	*/
27632e4c9986SGeorge Wilson 	50,	/* 128K	*/
27642e4c9986SGeorge Wilson 	40,	/* 256K	*/
27652e4c9986SGeorge Wilson 	30,	/* 512K	*/
27662e4c9986SGeorge Wilson 	20,	/* 1M	*/
27672e4c9986SGeorge Wilson 	15,	/* 2M	*/
27682e4c9986SGeorge Wilson 	10,	/* 4M	*/
27692e4c9986SGeorge Wilson 	5,	/* 8M	*/
27702e4c9986SGeorge Wilson 	0	/* 16M	*/
27712e4c9986SGeorge Wilson };
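
/*
 * A worked example of how this table is applied (see
 * metaslab_set_fragmentation() below): histogram bucket i of a space
 * map with shift sm_shift holds segments of roughly 2^(sm_shift + i)
 * bytes and maps to table row sm_shift - SPA_MINBLOCKSHIFT + i, clamped
 * to the last row. A metaslab whose free space is 60% in 512K segments
 * and 40% in 8K segments therefore scores
 *
 *	(60 * 30 + 40 * 90) / 100 = 54
 *
 * i.e. 54% fragmented, whereas the same space consolidated into 16M or
 * larger segments would score 0.
 */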
27722e4c9986SGeorge Wilson 
27732e4c9986SGeorge Wilson /*
2774555d674dSSerapheim Dimitropoulos  * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2775555d674dSSerapheim Dimitropoulos  * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2776555d674dSSerapheim Dimitropoulos  * been upgraded and does not support this metric. Otherwise, the return
2777555d674dSSerapheim Dimitropoulos  * value should be in the range [0, 100].
27780713e232SGeorge Wilson  */
27798363e80aSGeorge Wilson static void
27808363e80aSGeorge Wilson metaslab_set_fragmentation(metaslab_t *msp)
27810713e232SGeorge Wilson {
27822e4c9986SGeorge Wilson 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
27832e4c9986SGeorge Wilson 	uint64_t fragmentation = 0;
27842e4c9986SGeorge Wilson 	uint64_t total = 0;
27852e4c9986SGeorge Wilson 	boolean_t feature_enabled = spa_feature_is_enabled(spa,
27862e4c9986SGeorge Wilson 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
27872e4c9986SGeorge Wilson 
27888363e80aSGeorge Wilson 	if (!feature_enabled) {
27898363e80aSGeorge Wilson 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
27908363e80aSGeorge Wilson 		return;
27918363e80aSGeorge Wilson 	}
27920713e232SGeorge Wilson 
27930713e232SGeorge Wilson 	/*
27942e4c9986SGeorge Wilson 	 * A null space map means that the entire metaslab is free
27952e4c9986SGeorge Wilson 	 * and thus is not fragmented.
27960713e232SGeorge Wilson 	 */
27978363e80aSGeorge Wilson 	if (msp->ms_sm == NULL) {
27988363e80aSGeorge Wilson 		msp->ms_fragmentation = 0;
27998363e80aSGeorge Wilson 		return;
28008363e80aSGeorge Wilson 	}
28012e4c9986SGeorge Wilson 
28022e4c9986SGeorge Wilson 	/*
28038363e80aSGeorge Wilson 	 * If this metaslab's space map has not been upgraded, flag it
28042e4c9986SGeorge Wilson 	 * so that we upgrade next time we encounter it.
28052e4c9986SGeorge Wilson 	 */
28062e4c9986SGeorge Wilson 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
28072e4c9986SGeorge Wilson 		uint64_t txg = spa_syncing_txg(spa);
28080713e232SGeorge Wilson 		vdev_t *vd = msp->ms_group->mg_vd;
28090713e232SGeorge Wilson 
28103991b535SGeorge Wilson 		/*
28113991b535SGeorge Wilson 		 * If we've reached the final dirty txg, then we must
28123991b535SGeorge Wilson 		 * be shutting down the pool. We don't want to dirty
28133991b535SGeorge Wilson 		 * any data past this point so skip setting the condense
28143991b535SGeorge Wilson 		 * flag. We can retry this action the next time the pool
28153991b535SGeorge Wilson 		 * is imported.
28163991b535SGeorge Wilson 		 */
28173991b535SGeorge Wilson 		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
2818b1be2892SMatthew Ahrens 			msp->ms_condense_wanted = B_TRUE;
2819b1be2892SMatthew Ahrens 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
282021f7c81cSMatthew Ahrens 			zfs_dbgmsg("txg %llu, requesting force condense: "
28213991b535SGeorge Wilson 			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
28223991b535SGeorge Wilson 			    vd->vdev_id);
2823b1be2892SMatthew Ahrens 		}
28248363e80aSGeorge Wilson 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
28258363e80aSGeorge Wilson 		return;
28260713e232SGeorge Wilson 	}
28270713e232SGeorge Wilson 
28282e4c9986SGeorge Wilson 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
28292e4c9986SGeorge Wilson 		uint64_t space = 0;
28302e4c9986SGeorge Wilson 		uint8_t shift = msp->ms_sm->sm_shift;
28318363e80aSGeorge Wilson 
28322e4c9986SGeorge Wilson 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
28332e4c9986SGeorge Wilson 		    FRAGMENTATION_TABLE_SIZE - 1);
28340713e232SGeorge Wilson 
28350713e232SGeorge Wilson 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
28360713e232SGeorge Wilson 			continue;
28370713e232SGeorge Wilson 
28382e4c9986SGeorge Wilson 		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
28392e4c9986SGeorge Wilson 		total += space;
28402e4c9986SGeorge Wilson 
28412e4c9986SGeorge Wilson 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
28422e4c9986SGeorge Wilson 		fragmentation += space * zfs_frag_table[idx];
28430713e232SGeorge Wilson 	}
28442e4c9986SGeorge Wilson 
28452e4c9986SGeorge Wilson 	if (total > 0)
28462e4c9986SGeorge Wilson 		fragmentation /= total;
28472e4c9986SGeorge Wilson 	ASSERT3U(fragmentation, <=, 100);
28488363e80aSGeorge Wilson 
28498363e80aSGeorge Wilson 	msp->ms_fragmentation = fragmentation;
28500713e232SGeorge Wilson }
2851ecc2d604Sbonwick 
28522e4c9986SGeorge Wilson /*
28532e4c9986SGeorge Wilson  * Compute a weight -- a selection preference value -- for the given metaslab.
28542e4c9986SGeorge Wilson  * This is based on the amount of free space, the level of fragmentation,
28552e4c9986SGeorge Wilson  * the LBA range, and whether the metaslab is loaded.
28562e4c9986SGeorge Wilson  */
2857ecc2d604Sbonwick static uint64_t
28588363e80aSGeorge Wilson metaslab_space_weight(metaslab_t *msp)
2859ecc2d604Sbonwick {
286044cd46caSbillm 	metaslab_group_t *mg = msp->ms_group;
286144cd46caSbillm 	vdev_t *vd = mg->mg_vd;
2862ecc2d604Sbonwick 	uint64_t weight, space;
2863ecc2d604Sbonwick 
2864ecc2d604Sbonwick 	ASSERT(MUTEX_HELD(&msp->ms_lock));
28659eb57f7fSGeorge Wilson 
2866ecc2d604Sbonwick 	/*
2867ecc2d604Sbonwick 	 * The baseline weight is the metaslab's free space.
2868ecc2d604Sbonwick 	 */
2869555d674dSSerapheim Dimitropoulos 	space = msp->ms_size - metaslab_allocated_space(msp);
28702e4c9986SGeorge Wilson 
28712e4c9986SGeorge Wilson 	if (metaslab_fragmentation_factor_enabled &&
28722e4c9986SGeorge Wilson 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
28732e4c9986SGeorge Wilson 		/*
28742e4c9986SGeorge Wilson 		 * Use the fragmentation information to inversely scale
28752e4c9986SGeorge Wilson 		 * down the baseline weight. We need to ensure that we
28762e4c9986SGeorge Wilson 		 * don't exclude this metaslab completely when it's 100%
28772e4c9986SGeorge Wilson 		 * fragmented. To avoid this we reduce the fragmented value
28782e4c9986SGeorge Wilson 		 * by 1.
28792e4c9986SGeorge Wilson 		 */
28802e4c9986SGeorge Wilson 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
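		/*
		 * For example (hypothetical numbers): with ms_fragmentation
		 * of 75, the free space is scaled to (100 - 74)% = 26% of
		 * its raw value; at 100% fragmentation it is still scaled
		 * to 1% rather than 0, keeping the metaslab selectable.
		 */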
28812e4c9986SGeorge Wilson 
28822e4c9986SGeorge Wilson 		/*
28832e4c9986SGeorge Wilson 		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
28842e4c9986SGeorge Wilson 		 * this metaslab again. The fragmentation metric may have
28852e4c9986SGeorge Wilson 		 * decreased the space to something smaller than
28862e4c9986SGeorge Wilson 		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
28872e4c9986SGeorge Wilson 		 * so that we can consume any remaining space.
28882e4c9986SGeorge Wilson 		 */
28892e4c9986SGeorge Wilson 		if (space > 0 && space < SPA_MINBLOCKSIZE)
28902e4c9986SGeorge Wilson 			space = SPA_MINBLOCKSIZE;
28912e4c9986SGeorge Wilson 	}
2892ecc2d604Sbonwick 	weight = space;
2893ecc2d604Sbonwick 
2894ecc2d604Sbonwick 	/*
2895ecc2d604Sbonwick 	 * Modern disks have uniform bit density and constant angular velocity.
2896ecc2d604Sbonwick 	 * Therefore, the outer recording zones are faster (higher bandwidth)
2897ecc2d604Sbonwick 	 * than the inner zones by the ratio of outer to inner track diameter,
2898ecc2d604Sbonwick 	 * which is typically around 2:1.  We account for this by assigning
2899ecc2d604Sbonwick 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
2900ecc2d604Sbonwick 	 * In effect, this means that we'll select the metaslab with the most
2901ecc2d604Sbonwick 	 * free bandwidth rather than simply the one with the most free space.
2902ecc2d604Sbonwick 	 */
290312a8814cSTom Caputi 	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
29042e4c9986SGeorge Wilson 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
29052e4c9986SGeorge Wilson 		ASSERT(weight >= space && weight <= 2 * space);
29062e4c9986SGeorge Wilson 	}
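
	/*
	 * Illustrative numbers: on a rotational vdev with 200 metaslabs,
	 * metaslab 0 keeps the full 2x multiplier, metaslab 100 gets
	 * 2 - 100/200 = 1.5x, and metaslab 199 gets roughly 1x.
	 */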
290780eb36f2SGeorge Wilson 
29082e4c9986SGeorge Wilson 	/*
29092e4c9986SGeorge Wilson 	 * If this metaslab is one we're actively using, adjust its
29102e4c9986SGeorge Wilson 	 * weight to make it preferable to any inactive metaslab so
29112e4c9986SGeorge Wilson 	 * we'll polish it off. If the fragmentation on this metaslab
29122e4c9986SGeorge Wilson 	 * has exceed our threshold, then don't mark it active.
29132e4c9986SGeorge Wilson 	 * has exceeded our threshold, then don't mark it active.
29142e4c9986SGeorge Wilson 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
29152e4c9986SGeorge Wilson 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
291680eb36f2SGeorge Wilson 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
291780eb36f2SGeorge Wilson 	}
291880eb36f2SGeorge Wilson 
29198363e80aSGeorge Wilson 	WEIGHT_SET_SPACEBASED(weight);
29208363e80aSGeorge Wilson 	return (weight);
29218363e80aSGeorge Wilson }
29228363e80aSGeorge Wilson 
29238363e80aSGeorge Wilson /*
29248363e80aSGeorge Wilson  * Return the weight of the specified metaslab, according to the segment-based
29258363e80aSGeorge Wilson  * weighting algorithm. The metaslab must be loaded. This function can
29268363e80aSGeorge Wilson  * be called within a sync pass since it relies only on the metaslab's
29278363e80aSGeorge Wilson  * range tree which is always accurate when the metaslab is loaded.
29288363e80aSGeorge Wilson  */
29298363e80aSGeorge Wilson static uint64_t
29308363e80aSGeorge Wilson metaslab_weight_from_range_tree(metaslab_t *msp)
29318363e80aSGeorge Wilson {
29328363e80aSGeorge Wilson 	uint64_t weight = 0;
29338363e80aSGeorge Wilson 	uint32_t segments = 0;
29348363e80aSGeorge Wilson 
29358363e80aSGeorge Wilson 	ASSERT(msp->ms_loaded);
29368363e80aSGeorge Wilson 
29378363e80aSGeorge Wilson 	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
29388363e80aSGeorge Wilson 	    i--) {
29398363e80aSGeorge Wilson 		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
29408363e80aSGeorge Wilson 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
29418363e80aSGeorge Wilson 
29428363e80aSGeorge Wilson 		segments <<= 1;
294386714001SSerapheim Dimitropoulos 		segments += msp->ms_allocatable->rt_histogram[i];
29448363e80aSGeorge Wilson 
29458363e80aSGeorge Wilson 		/*
29468363e80aSGeorge Wilson 		 * The range tree provides more precision than the space map
29478363e80aSGeorge Wilson 		 * and must be downgraded so that all values fit within the
29488363e80aSGeorge Wilson 		 * space map's histogram. This allows us to compare loaded
29498363e80aSGeorge Wilson 		 * vs. unloaded metaslabs to determine which metaslab is
29508363e80aSGeorge Wilson 		 * considered "best".
29518363e80aSGeorge Wilson 		 */
29528363e80aSGeorge Wilson 		if (i > max_idx)
29538363e80aSGeorge Wilson 			continue;
29548363e80aSGeorge Wilson 
29558363e80aSGeorge Wilson 		if (segments != 0) {
29568363e80aSGeorge Wilson 			WEIGHT_SET_COUNT(weight, segments);
29578363e80aSGeorge Wilson 			WEIGHT_SET_INDEX(weight, i);
29588363e80aSGeorge Wilson 			WEIGHT_SET_ACTIVE(weight, 0);
29598363e80aSGeorge Wilson 			break;
29608363e80aSGeorge Wilson 		}
29618363e80aSGeorge Wilson 	}
29628363e80aSGeorge Wilson 	return (weight);
29638363e80aSGeorge Wilson }
29648363e80aSGeorge Wilson 
29658363e80aSGeorge Wilson /*
2966814dcd43SSerapheim Dimitropoulos  * Calculate the weight based on the on-disk histogram. Should be applied
2967814dcd43SSerapheim Dimitropoulos  * only to unloaded metaslabs (i.e. no incoming allocations) in order to
2968814dcd43SSerapheim Dimitropoulos  * give results consistent with the on-disk state.
29698363e80aSGeorge Wilson  */
29708363e80aSGeorge Wilson static uint64_t
29718363e80aSGeorge Wilson metaslab_weight_from_spacemap(metaslab_t *msp)
29728363e80aSGeorge Wilson {
2973555d674dSSerapheim Dimitropoulos 	space_map_t *sm = msp->ms_sm;
2974555d674dSSerapheim Dimitropoulos 	ASSERT(!msp->ms_loaded);
2975555d674dSSerapheim Dimitropoulos 	ASSERT(sm != NULL);
2976555d674dSSerapheim Dimitropoulos 	ASSERT3U(space_map_object(sm), !=, 0);
2977555d674dSSerapheim Dimitropoulos 	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
29788363e80aSGeorge Wilson 
2979555d674dSSerapheim Dimitropoulos 	/*
2980555d674dSSerapheim Dimitropoulos 	 * Create a joint histogram from all the segments that have made
2981555d674dSSerapheim Dimitropoulos 	 * it to the metaslab's space map histogram, that are not yet
2982555d674dSSerapheim Dimitropoulos 	 * available for allocation because they are still in the freeing
2983555d674dSSerapheim Dimitropoulos 	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
2984555d674dSSerapheim Dimitropoulos 	 * these segments from the space map's histogram to get a more
2985555d674dSSerapheim Dimitropoulos 	 * accurate weight.
2986555d674dSSerapheim Dimitropoulos 	 */
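	/*
	 * For instance (hypothetical counts): if the on-disk histogram's
	 * 64K bucket reports 10 segments but 3 of them are still in the
	 * freeing pipeline (tracked in ms_synchist/ms_deferhist), only 7
	 * contribute to the weight computed below.
	 */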
2987555d674dSSerapheim Dimitropoulos 	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
2988555d674dSSerapheim Dimitropoulos 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
2989555d674dSSerapheim Dimitropoulos 		deferspace_histogram[i] += msp->ms_synchist[i];
2990555d674dSSerapheim Dimitropoulos 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2991555d674dSSerapheim Dimitropoulos 		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2992555d674dSSerapheim Dimitropoulos 			deferspace_histogram[i] += msp->ms_deferhist[t][i];
2993555d674dSSerapheim Dimitropoulos 		}
2994555d674dSSerapheim Dimitropoulos 	}
2995555d674dSSerapheim Dimitropoulos 
2996555d674dSSerapheim Dimitropoulos 	uint64_t weight = 0;
29978363e80aSGeorge Wilson 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
2998555d674dSSerapheim Dimitropoulos 		ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
2999555d674dSSerapheim Dimitropoulos 		    deferspace_histogram[i]);
3000555d674dSSerapheim Dimitropoulos 		uint64_t count =
3001555d674dSSerapheim Dimitropoulos 		    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
3002555d674dSSerapheim Dimitropoulos 		if (count != 0) {
3003555d674dSSerapheim Dimitropoulos 			WEIGHT_SET_COUNT(weight, count);
3004555d674dSSerapheim Dimitropoulos 			WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
30058363e80aSGeorge Wilson 			WEIGHT_SET_ACTIVE(weight, 0);
30068363e80aSGeorge Wilson 			break;
30078363e80aSGeorge Wilson 		}
30088363e80aSGeorge Wilson 	}
30098363e80aSGeorge Wilson 	return (weight);
30108363e80aSGeorge Wilson }
30118363e80aSGeorge Wilson 
30128363e80aSGeorge Wilson /*
30138363e80aSGeorge Wilson  * Compute a segment-based weight for the specified metaslab. The weight
30148363e80aSGeorge Wilson  * is determined by the highest bucket in the histogram. The information
30158363e80aSGeorge Wilson  * for the highest bucket is encoded into the weight value.
30168363e80aSGeorge Wilson  */
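/*
 * As a hypothetical example of the encoding: if the highest non-empty
 * histogram bucket of a loaded metaslab is the 1MB (2^20) bucket and it
 * holds 12 segments, the segment-based weight encodes index 20 and
 * count 12 via WEIGHT_SET_INDEX() and WEIGHT_SET_COUNT().
 */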
30178363e80aSGeorge Wilson static uint64_t
30188363e80aSGeorge Wilson metaslab_segment_weight(metaslab_t *msp)
30198363e80aSGeorge Wilson {
30208363e80aSGeorge Wilson 	metaslab_group_t *mg = msp->ms_group;
30218363e80aSGeorge Wilson 	uint64_t weight = 0;
30228363e80aSGeorge Wilson 	uint8_t shift = mg->mg_vd->vdev_ashift;
30238363e80aSGeorge Wilson 
30248363e80aSGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
30258363e80aSGeorge Wilson 
30268363e80aSGeorge Wilson 	/*
30278363e80aSGeorge Wilson 	 * The metaslab is completely free.
30288363e80aSGeorge Wilson 	 */
3029555d674dSSerapheim Dimitropoulos 	if (metaslab_allocated_space(msp) == 0) {
30308363e80aSGeorge Wilson 		int idx = highbit64(msp->ms_size) - 1;
30318363e80aSGeorge Wilson 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
30328363e80aSGeorge Wilson 
30338363e80aSGeorge Wilson 		if (idx < max_idx) {
30348363e80aSGeorge Wilson 			WEIGHT_SET_COUNT(weight, 1ULL);
30358363e80aSGeorge Wilson 			WEIGHT_SET_INDEX(weight, idx);
30368363e80aSGeorge Wilson 		} else {
30378363e80aSGeorge Wilson 			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
30388363e80aSGeorge Wilson 			WEIGHT_SET_INDEX(weight, max_idx);
30398363e80aSGeorge Wilson 		}
30408363e80aSGeorge Wilson 		WEIGHT_SET_ACTIVE(weight, 0);
30418363e80aSGeorge Wilson 		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
30428363e80aSGeorge Wilson 		return (weight);
30438363e80aSGeorge Wilson 	}
30448363e80aSGeorge Wilson 
30458363e80aSGeorge Wilson 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
30468363e80aSGeorge Wilson 
30478363e80aSGeorge Wilson 	/*
30488363e80aSGeorge Wilson 	 * If the metaslab is fully allocated then just make the weight 0.
30498363e80aSGeorge Wilson 	 */
3050555d674dSSerapheim Dimitropoulos 	if (metaslab_allocated_space(msp) == msp->ms_size)
30518363e80aSGeorge Wilson 		return (0);
30528363e80aSGeorge Wilson 	/*
30538363e80aSGeorge Wilson 	 * If the metaslab is already loaded, then use the range tree to
30548363e80aSGeorge Wilson 	 * determine the weight. Otherwise, we rely on the space map information
30558363e80aSGeorge Wilson 	 * to generate the weight.
30568363e80aSGeorge Wilson 	 */
30578363e80aSGeorge Wilson 	if (msp->ms_loaded) {
30588363e80aSGeorge Wilson 		weight = metaslab_weight_from_range_tree(msp);
30598363e80aSGeorge Wilson 	} else {
30608363e80aSGeorge Wilson 		weight = metaslab_weight_from_spacemap(msp);
30618363e80aSGeorge Wilson 	}
30628363e80aSGeorge Wilson 
30638363e80aSGeorge Wilson 	/*
30648363e80aSGeorge Wilson 	 * If the metaslab was active the last time we calculated its weight
30658363e80aSGeorge Wilson 	 * then keep it active. We want to consume the entire region that
30668363e80aSGeorge Wilson 	 * is associated with this weight.
30678363e80aSGeorge Wilson 	 */
30688363e80aSGeorge Wilson 	if (msp->ms_activation_weight != 0 && weight != 0)
30698363e80aSGeorge Wilson 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
30708363e80aSGeorge Wilson 	return (weight);
30718363e80aSGeorge Wilson }
30728363e80aSGeorge Wilson 
30738363e80aSGeorge Wilson /*
30748363e80aSGeorge Wilson  * Determine if we should attempt to allocate from this metaslab. If the
3075814dcd43SSerapheim Dimitropoulos  * metaslab is loaded, then we can determine if the desired allocation
3076814dcd43SSerapheim Dimitropoulos  * can be satisfied by looking at the size of the maximum free segment
3077814dcd43SSerapheim Dimitropoulos  * on that metaslab. Otherwise, we make our decision based on the metaslab's
3078814dcd43SSerapheim Dimitropoulos  * weight. For segment-based weighting we can determine the maximum
3079814dcd43SSerapheim Dimitropoulos  * allocation based on the index encoded in its value. For space-based
3080814dcd43SSerapheim Dimitropoulos  * weights we rely on the entire weight (excluding the weight-type bit).
30818363e80aSGeorge Wilson  */
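/*
 * For example (hypothetical weight): a segment-based weight with index 17
 * (128K) admits allocations smaller than 2^18 = 256K, while a space-based
 * weight admits any allocation no larger than the weight value itself
 * (with the weight-type bit masked off).
 */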
30828363e80aSGeorge Wilson boolean_t
3083af1d63abSPaul Dagnelie metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
30848363e80aSGeorge Wilson {
3085af1d63abSPaul Dagnelie 	/*
3086af1d63abSPaul Dagnelie 	 * If the metaslab is loaded, ms_max_size is definitive and we can use
3087af1d63abSPaul Dagnelie 	 * the fast check. If it's not, the ms_max_size is a lower bound (once
3088af1d63abSPaul Dagnelie 	 * set), and we should use the fast check as long as we're not in
3089af1d63abSPaul Dagnelie 	 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
3090af1d63abSPaul Dagnelie 	 * seconds since the metaslab was unloaded.
3091af1d63abSPaul Dagnelie 	 */
3092af1d63abSPaul Dagnelie 	if (msp->ms_loaded ||
3093af1d63abSPaul Dagnelie 	    (msp->ms_max_size != 0 && !try_hard && gethrtime() <
3094af1d63abSPaul Dagnelie 	    msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
30958363e80aSGeorge Wilson 		return (msp->ms_max_size >= asize);
30968363e80aSGeorge Wilson 
3097814dcd43SSerapheim Dimitropoulos 	boolean_t should_allocate;
30988363e80aSGeorge Wilson 	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
30998363e80aSGeorge Wilson 		/*
31008363e80aSGeorge Wilson 		 * The metaslab segment weight indicates segments in the
31018363e80aSGeorge Wilson 		 * range [2^i, 2^(i+1)), where i is the index in the weight.
31028363e80aSGeorge Wilson 		 * Since the asize might be in the middle of the range, we
31038363e80aSGeorge Wilson 		 * should attempt the allocation if asize < 2^(i+1).
31048363e80aSGeorge Wilson 		 */
31058363e80aSGeorge Wilson 		should_allocate = (asize <
31068363e80aSGeorge Wilson 		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
31078363e80aSGeorge Wilson 	} else {
31088363e80aSGeorge Wilson 		should_allocate = (asize <=
31098363e80aSGeorge Wilson 		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
31108363e80aSGeorge Wilson 	}
3111af1d63abSPaul Dagnelie 
31128363e80aSGeorge Wilson 	return (should_allocate);
31138363e80aSGeorge Wilson }
31148363e80aSGeorge Wilson 
31158363e80aSGeorge Wilson static uint64_t
31168363e80aSGeorge Wilson metaslab_weight(metaslab_t *msp)
31178363e80aSGeorge Wilson {
31188363e80aSGeorge Wilson 	vdev_t *vd = msp->ms_group->mg_vd;
31198363e80aSGeorge Wilson 	spa_t *spa = vd->vdev_spa;
31208363e80aSGeorge Wilson 	uint64_t weight;
31218363e80aSGeorge Wilson 
31228363e80aSGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
31238363e80aSGeorge Wilson 
31248363e80aSGeorge Wilson 	metaslab_set_fragmentation(msp);
31258363e80aSGeorge Wilson 
31268363e80aSGeorge Wilson 	/*
3127af1d63abSPaul Dagnelie 	 * Update the maximum size. If the metaslab is loaded, this will
31288363e80aSGeorge Wilson 	 * ensure that we get an accurate maximum size if newly freed space
3129af1d63abSPaul Dagnelie 	 * has been added back into the free tree. If the metaslab is
3130af1d63abSPaul Dagnelie 	 * unloaded, we check if there's a larger free segment in the
3131af1d63abSPaul Dagnelie 	 * unflushed frees. This is a lower bound on the largest allocatable
3132af1d63abSPaul Dagnelie 	 * segment size. Coalescing of adjacent entries may reveal larger
3133af1d63abSPaul Dagnelie 	 * allocatable segments, but we aren't aware of those until loading
3134af1d63abSPaul Dagnelie 	 * the space map into a range tree.
31358363e80aSGeorge Wilson 	 */
3136af1d63abSPaul Dagnelie 	if (msp->ms_loaded) {
3137af1d63abSPaul Dagnelie 		msp->ms_max_size = metaslab_largest_allocatable(msp);
3138af1d63abSPaul Dagnelie 	} else {
3139af1d63abSPaul Dagnelie 		msp->ms_max_size = MAX(msp->ms_max_size,
3140af1d63abSPaul Dagnelie 		    metaslab_largest_unflushed_free(msp));
3141af1d63abSPaul Dagnelie 	}
31428363e80aSGeorge Wilson 
31438363e80aSGeorge Wilson 	/*
31448363e80aSGeorge Wilson 	 * Segment-based weighting requires space map histogram support.
31458363e80aSGeorge Wilson 	 */
31468363e80aSGeorge Wilson 	if (zfs_metaslab_segment_weight_enabled &&
31478363e80aSGeorge Wilson 	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
31488363e80aSGeorge Wilson 	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
31498363e80aSGeorge Wilson 	    sizeof (space_map_phys_t))) {
31508363e80aSGeorge Wilson 		weight = metaslab_segment_weight(msp);
31518363e80aSGeorge Wilson 	} else {
31528363e80aSGeorge Wilson 		weight = metaslab_space_weight(msp);
31538363e80aSGeorge Wilson 	}
31540713e232SGeorge Wilson 	return (weight);
3155ecc2d604Sbonwick }
3156ecc2d604Sbonwick 
3157555d674dSSerapheim Dimitropoulos void
3158555d674dSSerapheim Dimitropoulos metaslab_recalculate_weight_and_sort(metaslab_t *msp)
3159555d674dSSerapheim Dimitropoulos {
3160af1d63abSPaul Dagnelie 	ASSERT(MUTEX_HELD(&msp->ms_lock));
3161af1d63abSPaul Dagnelie 
3162555d674dSSerapheim Dimitropoulos 	/* note: we preserve the mask (e.g. indication of primary, etc..) */
3163555d674dSSerapheim Dimitropoulos 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
3164555d674dSSerapheim Dimitropoulos 	metaslab_group_sort(msp->ms_group, msp,
3165555d674dSSerapheim Dimitropoulos 	    metaslab_weight(msp) | was_active);
3166555d674dSSerapheim Dimitropoulos }
3167555d674dSSerapheim Dimitropoulos 
3168ecc2d604Sbonwick static int
3169f78cdc34SPaul Dagnelie metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3170f78cdc34SPaul Dagnelie     int allocator, uint64_t activation_weight)
3171f78cdc34SPaul Dagnelie {
3172af1d63abSPaul Dagnelie 	ASSERT(MUTEX_HELD(&msp->ms_lock));
3173af1d63abSPaul Dagnelie 
3174f78cdc34SPaul Dagnelie 	/*
3175f78cdc34SPaul Dagnelie 	 * If we're activating for the claim code, we don't want to actually
3176f78cdc34SPaul Dagnelie 	 * set the metaslab up for a specific allocator.
3177f78cdc34SPaul Dagnelie 	 */
3178af1d63abSPaul Dagnelie 	if (activation_weight == METASLAB_WEIGHT_CLAIM) {
3179af1d63abSPaul Dagnelie 		ASSERT0(msp->ms_activation_weight);
3180af1d63abSPaul Dagnelie 		msp->ms_activation_weight = msp->ms_weight;
3181af1d63abSPaul Dagnelie 		metaslab_group_sort(mg, msp, msp->ms_weight |
3182af1d63abSPaul Dagnelie 		    activation_weight);
3183f78cdc34SPaul Dagnelie 		return (0);
3184af1d63abSPaul Dagnelie 	}
3185af1d63abSPaul Dagnelie 
3186f78cdc34SPaul Dagnelie 	metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
3187f78cdc34SPaul Dagnelie 	    mg->mg_primaries : mg->mg_secondaries);
3188f78cdc34SPaul Dagnelie 
3189f78cdc34SPaul Dagnelie 	mutex_enter(&mg->mg_lock);
3190f78cdc34SPaul Dagnelie 	if (arr[allocator] != NULL) {
3191f78cdc34SPaul Dagnelie 		mutex_exit(&mg->mg_lock);
3192f78cdc34SPaul Dagnelie 		return (EEXIST);
3193f78cdc34SPaul Dagnelie 	}
3194f78cdc34SPaul Dagnelie 
3195f78cdc34SPaul Dagnelie 	arr[allocator] = msp;
3196f78cdc34SPaul Dagnelie 	ASSERT3S(msp->ms_allocator, ==, -1);
3197f78cdc34SPaul Dagnelie 	msp->ms_allocator = allocator;
3198f78cdc34SPaul Dagnelie 	msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
3199af1d63abSPaul Dagnelie 
3200af1d63abSPaul Dagnelie 	ASSERT0(msp->ms_activation_weight);
3201af1d63abSPaul Dagnelie 	msp->ms_activation_weight = msp->ms_weight;
3202af1d63abSPaul Dagnelie 	metaslab_group_sort_impl(mg, msp,
3203af1d63abSPaul Dagnelie 	    msp->ms_weight | activation_weight);
3204af1d63abSPaul Dagnelie 
3205f78cdc34SPaul Dagnelie 	mutex_exit(&mg->mg_lock);
3206f78cdc34SPaul Dagnelie 
3207f78cdc34SPaul Dagnelie 	return (0);
3208f78cdc34SPaul Dagnelie }
3209f78cdc34SPaul Dagnelie 
3210f78cdc34SPaul Dagnelie static int
3211f78cdc34SPaul Dagnelie metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
3212ecc2d604Sbonwick {
3213ecc2d604Sbonwick 	ASSERT(MUTEX_HELD(&msp->ms_lock));
3214ecc2d604Sbonwick 
3215af1d63abSPaul Dagnelie 	/*
3216af1d63abSPaul Dagnelie 	 * The current metaslab is already activated for us so there
3217af1d63abSPaul Dagnelie 	 * is nothing to do. Being active, though, doesn't mean that
3218af1d63abSPaul Dagnelie 	 * this metaslab is activated for our allocator or for our
3219af1d63abSPaul Dagnelie 	 * requested activation weight. The metaslab could have started
3220af1d63abSPaul Dagnelie 	 * as an active one for our allocator but changed allocators
3221af1d63abSPaul Dagnelie 	 * while we were waiting to grab its ms_lock or we stole it
3222af1d63abSPaul Dagnelie 	 * [see find_valid_metaslab()]. This means that this thread may
3223af1d63abSPaul Dagnelie 	 * end up passivating a metaslab that belongs to another allocator
3224af1d63abSPaul Dagnelie 	 * or one that was activated with a different activation mask.
3225af1d63abSPaul Dagnelie 	 */
3226af1d63abSPaul Dagnelie 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3227af1d63abSPaul Dagnelie 		ASSERT(msp->ms_loaded);
3228af1d63abSPaul Dagnelie 		return (0);
3229af1d63abSPaul Dagnelie 	}
3230af1d63abSPaul Dagnelie 
3231af1d63abSPaul Dagnelie 	int error = metaslab_load(msp);
3232af1d63abSPaul Dagnelie 	if (error != 0) {
3233af1d63abSPaul Dagnelie 		metaslab_group_sort(msp->ms_group, msp, 0);
3234af1d63abSPaul Dagnelie 		return (error);
3235af1d63abSPaul Dagnelie 	}
3236af1d63abSPaul Dagnelie 
3237af1d63abSPaul Dagnelie 	/*
3238af1d63abSPaul Dagnelie 	 * When entering metaslab_load() we may have dropped the
3239af1d63abSPaul Dagnelie 	 * ms_lock because we were loading this metaslab, or we
3240af1d63abSPaul Dagnelie 	 * were waiting for another thread to load it for us. In
3241af1d63abSPaul Dagnelie 	 * that scenario, we recheck the weight of the metaslab
3242af1d63abSPaul Dagnelie 	 * to see if it was activated by another thread.
3243af1d63abSPaul Dagnelie 	 *
3244af1d63abSPaul Dagnelie 	 * If the metaslab was activated for another allocator or
3245af1d63abSPaul Dagnelie 	 * it was activated with a different activation weight (e.g.
3246af1d63abSPaul Dagnelie 	 * we wanted to make it a primary but it was activated as
3247af1d63abSPaul Dagnelie 	 * secondary) we return error (EBUSY).
3248af1d63abSPaul Dagnelie 	 *
3249af1d63abSPaul Dagnelie 	 * If the metaslab was activated for the same allocator
3250af1d63abSPaul Dagnelie 	 * and requested activation mask, skip activating it.
3251af1d63abSPaul Dagnelie 	 */
3252af1d63abSPaul Dagnelie 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3253af1d63abSPaul Dagnelie 		if (msp->ms_allocator != allocator)
3254f78cdc34SPaul Dagnelie 			return (EBUSY);
3255d6e555bdSGeorge Wilson 
3256af1d63abSPaul Dagnelie 		if ((msp->ms_weight & activation_weight) == 0)
3257af1d63abSPaul Dagnelie 			return (EBUSY);
3258af1d63abSPaul Dagnelie 
3259af1d63abSPaul Dagnelie 		EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
3260af1d63abSPaul Dagnelie 		    msp->ms_primary);
3261af1d63abSPaul Dagnelie 		return (0);
3262af1d63abSPaul Dagnelie 	}
3263af1d63abSPaul Dagnelie 
3264af1d63abSPaul Dagnelie 	/*
3265af1d63abSPaul Dagnelie 	 * If the metaslab has literally 0 space, it will have weight 0. In
3266af1d63abSPaul Dagnelie 	 * that case, don't bother activating it. This can happen if the
3267af1d63abSPaul Dagnelie 	 * metaslab had space during find_valid_metaslab, but another thread
3268af1d63abSPaul Dagnelie 	 * loaded it and used all that space while we were waiting to grab the
3269af1d63abSPaul Dagnelie 	 * lock.
3270af1d63abSPaul Dagnelie 	 */
3271af1d63abSPaul Dagnelie 	if (msp->ms_weight == 0) {
3272af1d63abSPaul Dagnelie 		ASSERT0(range_tree_space(msp->ms_allocatable));
3273af1d63abSPaul Dagnelie 		return (SET_ERROR(ENOSPC));
3274ecc2d604Sbonwick 	}
3275af1d63abSPaul Dagnelie 
3276af1d63abSPaul Dagnelie 	if ((error = metaslab_activate_allocator(msp->ms_group, msp,
3277af1d63abSPaul Dagnelie 	    allocator, activation_weight)) != 0) {
3278af1d63abSPaul Dagnelie 		return (error);
3279af1d63abSPaul Dagnelie 	}
3280af1d63abSPaul Dagnelie 
32810713e232SGeorge Wilson 	ASSERT(msp->ms_loaded);
328244cd46caSbillm 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
3283ecc2d604Sbonwick 
3284ecc2d604Sbonwick 	return (0);
3285ecc2d604Sbonwick }
3286ecc2d604Sbonwick 
3287f78cdc34SPaul Dagnelie static void
3288f78cdc34SPaul Dagnelie metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3289f78cdc34SPaul Dagnelie     uint64_t weight)
3290f78cdc34SPaul Dagnelie {
3291f78cdc34SPaul Dagnelie 	ASSERT(MUTEX_HELD(&msp->ms_lock));
3292af1d63abSPaul Dagnelie 	ASSERT(msp->ms_loaded);
3293af1d63abSPaul Dagnelie 
3294f78cdc34SPaul Dagnelie 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3295f78cdc34SPaul Dagnelie 		metaslab_group_sort(mg, msp, weight);
3296f78cdc34SPaul Dagnelie 		return;
3297f78cdc34SPaul Dagnelie 	}
3298f78cdc34SPaul Dagnelie 
3299f78cdc34SPaul Dagnelie 	mutex_enter(&mg->mg_lock);
3300f78cdc34SPaul Dagnelie 	ASSERT3P(msp->ms_group, ==, mg);
3301af1d63abSPaul Dagnelie 	ASSERT3S(0, <=, msp->ms_allocator);
3302af1d63abSPaul Dagnelie 	ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3303af1d63abSPaul Dagnelie 
3304f78cdc34SPaul Dagnelie 	if (msp->ms_primary) {
3305f78cdc34SPaul Dagnelie 		ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
3306f78cdc34SPaul Dagnelie 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3307f78cdc34SPaul Dagnelie 		mg->mg_primaries[msp->ms_allocator] = NULL;
3308f78cdc34SPaul Dagnelie 	} else {
3309f78cdc34SPaul Dagnelie 		ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
3310af1d63abSPaul Dagnelie 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
3311f78cdc34SPaul Dagnelie 		mg->mg_secondaries[msp->ms_allocator] = NULL;
3312f78cdc34SPaul Dagnelie 	}
3313f78cdc34SPaul Dagnelie 	msp->ms_allocator = -1;
3314f78cdc34SPaul Dagnelie 	metaslab_group_sort_impl(mg, msp, weight);
3315f78cdc34SPaul Dagnelie 	mutex_exit(&mg->mg_lock);
3316f78cdc34SPaul Dagnelie }
3317f78cdc34SPaul Dagnelie 
3318ecc2d604Sbonwick static void
33198363e80aSGeorge Wilson metaslab_passivate(metaslab_t *msp, uint64_t weight)
3320ecc2d604Sbonwick {
33218363e80aSGeorge Wilson 	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
33228363e80aSGeorge Wilson 
33235f5f7a6fSahrens 	/*
33245f5f7a6fSahrens 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
33255f5f7a6fSahrens 	 * this metaslab again.  In that case, it had better be empty,
33265f5f7a6fSahrens 	 * or we would be leaving space on the table.
33275f5f7a6fSahrens 	 */
33288363e80aSGeorge Wilson 	ASSERT(size >= SPA_MINBLOCKSIZE ||
332986714001SSerapheim Dimitropoulos 	    range_tree_is_empty(msp->ms_allocatable));
33308363e80aSGeorge Wilson 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
33318363e80aSGeorge Wilson 
3332af1d63abSPaul Dagnelie 	ASSERT(msp->ms_activation_weight != 0);
33338363e80aSGeorge Wilson 	msp->ms_activation_weight = 0;
3334f78cdc34SPaul Dagnelie 	metaslab_passivate_allocator(msp->ms_group, msp, weight);
3335af1d63abSPaul Dagnelie 	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
3336ecc2d604Sbonwick }
3337ecc2d604Sbonwick 
33388363e80aSGeorge Wilson /*
33398363e80aSGeorge Wilson  * Segment-based metaslabs are activated once and remain active until
33408363e80aSGeorge Wilson  * we either fail an allocation attempt (similar to space-based metaslabs)
33418363e80aSGeorge Wilson  * or have exhausted the free space in zfs_metaslab_switch_threshold
33428363e80aSGeorge Wilson  * buckets since the metaslab was activated. This function checks to see
33438363e80aSGeorge Wilson  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
33448363e80aSGeorge Wilson  * metaslab and passivates it proactively. This will allow us to select a
33458363e80aSGeorge Wilson  * metaslab with a larger contiguous region, if any remains within this
33468363e80aSGeorge Wilson  * metaslab group. If we're in sync pass > 1, then we continue using this
33478363e80aSGeorge Wilson  * metaslab so that we don't dirty more blocks and cause more sync passes.
33488363e80aSGeorge Wilson  */
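/*
 * For instance, assuming a zfs_metaslab_switch_threshold of 2: a metaslab
 * activated when its largest free segments were in the 8MB (2^23) bucket
 * is passivated here once the in-core histogram's highest bucket drops to
 * 2MB (2^21) or below.
 */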
33498363e80aSGeorge Wilson void
33508363e80aSGeorge Wilson metaslab_segment_may_passivate(metaslab_t *msp)
33518363e80aSGeorge Wilson {
33528363e80aSGeorge Wilson 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
33538363e80aSGeorge Wilson 
33548363e80aSGeorge Wilson 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
33558363e80aSGeorge Wilson 		return;
33568363e80aSGeorge Wilson 
33578363e80aSGeorge Wilson 	/*
33588363e80aSGeorge Wilson 	 * Since we are in the middle of a sync pass, the most accurate
33598363e80aSGeorge Wilson 	 * information that is accessible to us is the in-core range tree
33608363e80aSGeorge Wilson 	 * histogram; calculate the new weight based on that information.
33618363e80aSGeorge Wilson 	 */
33628363e80aSGeorge Wilson 	uint64_t weight = metaslab_weight_from_range_tree(msp);
33638363e80aSGeorge Wilson 	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
33648363e80aSGeorge Wilson 	int current_idx = WEIGHT_GET_INDEX(weight);
33658363e80aSGeorge Wilson 
33668363e80aSGeorge Wilson 	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
33678363e80aSGeorge Wilson 		metaslab_passivate(msp, weight);
33688363e80aSGeorge Wilson }
33698363e80aSGeorge Wilson 
33700713e232SGeorge Wilson static void
33710713e232SGeorge Wilson metaslab_preload(void *arg)
33720713e232SGeorge Wilson {
33730713e232SGeorge Wilson 	metaslab_t *msp = arg;
3374af1d63abSPaul Dagnelie 	metaslab_class_t *mc = msp->ms_group->mg_class;
3375af1d63abSPaul Dagnelie 	spa_t *spa = mc->mc_spa;
33760713e232SGeorge Wilson 
337730beaff4SGeorge Wilson 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
337830beaff4SGeorge Wilson 
33790713e232SGeorge Wilson 	mutex_enter(&msp->ms_lock);
3380a0b03b16SSerapheim Dimitropoulos 	(void) metaslab_load(msp);
3381af1d63abSPaul Dagnelie 	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
33820713e232SGeorge Wilson 	mutex_exit(&msp->ms_lock);
33830713e232SGeorge Wilson }
33840713e232SGeorge Wilson 
33850713e232SGeorge Wilson static void
33860713e232SGeorge Wilson metaslab_group_preload(metaslab_group_t *mg)
33870713e232SGeorge Wilson {
33880713e232SGeorge Wilson 	spa_t *spa = mg->mg_vd->vdev_spa;
33890713e232SGeorge Wilson 	metaslab_t *msp;
33900713e232SGeorge Wilson 	avl_tree_t *t = &mg->mg_metaslab_tree;
33910713e232SGeorge Wilson 	int m = 0;
33920713e232SGeorge Wilson 
33930713e232SGeorge Wilson 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
33940713e232SGeorge Wilson 		taskq_wait(mg->mg_taskq);
33950713e232SGeorge Wilson 		return;
33960713e232SGeorge Wilson 	}
33970713e232SGeorge Wilson 
339830beaff4SGeorge Wilson 	mutex_enter(&mg->mg_lock);
33995cabbc6bSPrashanth Sreenivasa 
34000713e232SGeorge Wilson 	/*
340130beaff4SGeorge Wilson 	 * Load the next potential metaslabs
34020713e232SGeorge Wilson 	 */
34038363e80aSGeorge Wilson 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
34045cabbc6bSPrashanth Sreenivasa 		ASSERT3P(msp->ms_group, ==, mg);
34055cabbc6bSPrashanth Sreenivasa 
34062e4c9986SGeorge Wilson 		/*
34072e4c9986SGeorge Wilson 		 * We preload only the maximum number of metaslabs specified
34082e4c9986SGeorge Wilson 		 * by metaslab_preload_limit. If a metaslab is being forced
34092e4c9986SGeorge Wilson 		 * to condense then we preload it too. This will ensure
34102e4c9986SGeorge Wilson 		 * that force condensing happens in the next txg.
34112e4c9986SGeorge Wilson 		 */
34122e4c9986SGeorge Wilson 		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
34132e4c9986SGeorge Wilson 			continue;
34142e4c9986SGeorge Wilson 		}
34150713e232SGeorge Wilson 
34160713e232SGeorge Wilson 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
3417fc8ae2ecSToomas Soome 		    msp, TQ_SLEEP) != TASKQID_INVALID);
34180713e232SGeorge Wilson 	}
34190713e232SGeorge Wilson 	mutex_exit(&mg->mg_lock);
34200713e232SGeorge Wilson }
34210713e232SGeorge Wilson 
342216a4a807SGeorge Wilson /*
3423814dcd43SSerapheim Dimitropoulos  * Determine if the space map's on-disk footprint is past our tolerance for
3424814dcd43SSerapheim Dimitropoulos  * inefficiency. We would like to use the following criteria to make our
3425814dcd43SSerapheim Dimitropoulos  * decision:
342616a4a807SGeorge Wilson  *
3427814dcd43SSerapheim Dimitropoulos  * 1. Do not condense if the size of the space map object would dramatically
3428814dcd43SSerapheim Dimitropoulos  *    increase as a result of writing out the free space range tree.
342916a4a807SGeorge Wilson  *
3430814dcd43SSerapheim Dimitropoulos  * 2. Condense if the on-disk space map representation is at least
3431814dcd43SSerapheim Dimitropoulos  *    zfs_condense_pct/100 times the size of the optimal representation
3432814dcd43SSerapheim Dimitropoulos  *    (e.g. zfs_condense_pct = 110: condense when on-disk >= 1.1x optimal).
343316a4a807SGeorge Wilson  *
3434814dcd43SSerapheim Dimitropoulos  * 3. Do not condense if the on-disk size of the space map does not actually
3435814dcd43SSerapheim Dimitropoulos  *    decrease.
34362a104a52SAlex Reece  *
34372a104a52SAlex Reece  * Unfortunately, we cannot compute the on-disk size of the space map in this
34382a104a52SAlex Reece  * context because we cannot accurately compute the effects of compression, etc.
34392a104a52SAlex Reece  * Instead, we apply the heuristic described in the block comment for
34402a104a52SAlex Reece  * zfs_metaslab_condense_block_threshold - we only condense if the space used
34412a104a52SAlex Reece  * is greater than a threshold number of blocks.
344216a4a807SGeorge Wilson  */
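/*
 * Illustrative arithmetic (parameter values are examples, not necessarily
 * the tunable defaults): with zfs_condense_pct = 110,
 * zfs_metaslab_condense_block_threshold = 4, and a record size of 4K (the
 * larger of the space map block size and the vdev block size), a space map
 * whose on-disk length is 1MB is condensed only if its estimated optimal
 * size is at most 1MB * 100 / 110 (about 931KB) and 1MB exceeds
 * 4 * 4K = 16KB.
 */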
344316a4a807SGeorge Wilson static boolean_t
344416a4a807SGeorge Wilson metaslab_should_condense(metaslab_t *msp)
344516a4a807SGeorge Wilson {
34460713e232SGeorge Wilson 	space_map_t *sm = msp->ms_sm;
344786714001SSerapheim Dimitropoulos 	vdev_t *vd = msp->ms_group->mg_vd;
344886714001SSerapheim Dimitropoulos 	uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
344916a4a807SGeorge Wilson 
345016a4a807SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
34510713e232SGeorge Wilson 	ASSERT(msp->ms_loaded);
3452814dcd43SSerapheim Dimitropoulos 	ASSERT(sm != NULL);
3453814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
345486714001SSerapheim Dimitropoulos 
345586714001SSerapheim Dimitropoulos 	/*
345617f11284SSerapheim Dimitropoulos 	 * We always condense metaslabs that are empty and metaslabs for
345717f11284SSerapheim Dimitropoulos 	 * which a condense request has been made.
345816a4a807SGeorge Wilson 	 */
34594d7988d6SPaul Dagnelie 	if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
346017f11284SSerapheim Dimitropoulos 	    msp->ms_condense_wanted)
346116a4a807SGeorge Wilson 		return (B_TRUE);
346216a4a807SGeorge Wilson 
3463814dcd43SSerapheim Dimitropoulos 	uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3464814dcd43SSerapheim Dimitropoulos 	uint64_t object_size = space_map_length(sm);
346517f11284SSerapheim Dimitropoulos 	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
346617f11284SSerapheim Dimitropoulos 	    msp->ms_allocatable, SM_NO_VDEVID);
34672a104a52SAlex Reece 
346817f11284SSerapheim Dimitropoulos 	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
34692a104a52SAlex Reece 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
347016a4a807SGeorge Wilson }
347116a4a807SGeorge Wilson 
347216a4a807SGeorge Wilson /*
347316a4a807SGeorge Wilson  * Condense the on-disk space map representation to its minimized form.
3474814dcd43SSerapheim Dimitropoulos  * The minimized form consists of a small number of allocations followed
3475814dcd43SSerapheim Dimitropoulos  * by the entries of the free range tree (ms_allocatable). The condensed
3476814dcd43SSerapheim Dimitropoulos  * spacemap contains all the entries of previous TXGs (including those in
3477814dcd43SSerapheim Dimitropoulos  * the pool-wide log spacemaps; thus this is effectively a superset of
3478814dcd43SSerapheim Dimitropoulos  * metaslab_flush()), but this TXG's entries still need to be written.
347916a4a807SGeorge Wilson  */
348016a4a807SGeorge Wilson static void
3481814dcd43SSerapheim Dimitropoulos metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
348216a4a807SGeorge Wilson {
34830713e232SGeorge Wilson 	range_tree_t *condense_tree;
34840713e232SGeorge Wilson 	space_map_t *sm = msp->ms_sm;
3485814dcd43SSerapheim Dimitropoulos 	uint64_t txg = dmu_tx_get_txg(tx);
3486814dcd43SSerapheim Dimitropoulos 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
348716a4a807SGeorge Wilson 
348816a4a807SGeorge Wilson 	ASSERT(MUTEX_HELD(&msp->ms_lock));
34890713e232SGeorge Wilson 	ASSERT(msp->ms_loaded);
3490814dcd43SSerapheim Dimitropoulos 	ASSERT(msp->ms_sm != NULL);
3491814dcd43SSerapheim Dimitropoulos 
3492814dcd43SSerapheim Dimitropoulos 	/*
3493814dcd43SSerapheim Dimitropoulos 	 * In order to condense the space map, we need to change it so it
3494814dcd43SSerapheim Dimitropoulos 	 * only describes which segments are currently allocated and free.
3495814dcd43SSerapheim Dimitropoulos 	 *
3496814dcd43SSerapheim Dimitropoulos 	 * All the current free space resides in the ms_allocatable, all
3497814dcd43SSerapheim Dimitropoulos 	 * the ms_defer trees, and all the ms_allocating trees. We ignore
3498814dcd43SSerapheim Dimitropoulos 	 * ms_freed because it is empty, since we're in sync pass 1. We
3499814dcd43SSerapheim Dimitropoulos 	 * ignore ms_freeing because these changes are not yet reflected
3500814dcd43SSerapheim Dimitropoulos 	 * in the spacemap (they will be written later this txg).
3501814dcd43SSerapheim Dimitropoulos 	 *
3502814dcd43SSerapheim Dimitropoulos 	 * So to truncate the space map to represent all the entries of
3503814dcd43SSerapheim Dimitropoulos 	 * previous TXGs we do the following:
3504814dcd43SSerapheim Dimitropoulos 	 *
35054d7988d6SPaul Dagnelie 	 * 1] We create a range tree (condense tree) that is 100% empty.
35064d7988d6SPaul Dagnelie 	 * 2] We add to it all segments found in the ms_defer trees
3507814dcd43SSerapheim Dimitropoulos 	 *    as those segments are marked as free in the original space
3508814dcd43SSerapheim Dimitropoulos 	 *    map. We do the same with the ms_allocating trees for the same
35094d7988d6SPaul Dagnelie 	 *    reason. Adding these segments should be a relatively
3510814dcd43SSerapheim Dimitropoulos 	 *    inexpensive operation since we expect these trees to have a
3511814dcd43SSerapheim Dimitropoulos 	 *    small number of nodes.
35124d7988d6SPaul Dagnelie 	 * 3] We vacate any unflushed allocs, since they are not frees we
35134d7988d6SPaul Dagnelie 	 *    need to add to the condense tree. Then we vacate any
35144d7988d6SPaul Dagnelie 	 *    unflushed frees as they should already be part of ms_allocatable.
35154d7988d6SPaul Dagnelie 	 * 4] At this point, we would ideally like to add all segments
3516814dcd43SSerapheim Dimitropoulos 	 *    in the ms_allocatable tree to the condense tree. This way
3517814dcd43SSerapheim Dimitropoulos 	 *    we would write all the entries of the condense tree as the
35184d7988d6SPaul Dagnelie 	 *    condensed space map, which would only contain freed
35194d7988d6SPaul Dagnelie 	 *    segments with everything else assumed to be allocated.
3520814dcd43SSerapheim Dimitropoulos 	 *
3521814dcd43SSerapheim Dimitropoulos 	 *    Doing so can be prohibitively expensive as ms_allocatable can
35224d7988d6SPaul Dagnelie 	 *    be large, and therefore computationally expensive to add to
35234d7988d6SPaul Dagnelie 	 *    the condense_tree. Instead we first sync out an entry marking
35244d7988d6SPaul Dagnelie 	 *    everything as allocated, then the condense_tree and then the
35254d7988d6SPaul Dagnelie 	 *    ms_allocatable, in the condensed space map. While this is not
35264d7988d6SPaul Dagnelie 	 *    optimal, it is typically close to optimal and more importantly
35274d7988d6SPaul Dagnelie 	 *    much cheaper to compute.
3528814dcd43SSerapheim Dimitropoulos 	 *
3529814dcd43SSerapheim Dimitropoulos 	 * 5] Finally, as both of the unflushed trees were written to our
3530814dcd43SSerapheim Dimitropoulos 	 *    new and condensed metaslab space map, we basically flushed
3531814dcd43SSerapheim Dimitropoulos 	 *    all the unflushed changes to disk, thus we call
3532814dcd43SSerapheim Dimitropoulos 	 *    metaslab_flush_update().
3533814dcd43SSerapheim Dimitropoulos 	 */
3534814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa_sync_pass(spa), ==, 1);
3535814dcd43SSerapheim Dimitropoulos 	ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
353616a4a807SGeorge Wilson 
353786714001SSerapheim Dimitropoulos 	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
3538daec38ecSJoe Stein 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
3539daec38ecSJoe Stein 	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
3540814dcd43SSerapheim Dimitropoulos 	    spa->spa_name, space_map_length(msp->ms_sm),
35414d7988d6SPaul Dagnelie 	    range_tree_numsegs(msp->ms_allocatable),
35422e4c9986SGeorge Wilson 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
35432e4c9986SGeorge Wilson 
35442e4c9986SGeorge Wilson 	msp->ms_condense_wanted = B_FALSE;
354516a4a807SGeorge Wilson 
35464d7988d6SPaul Dagnelie 	range_seg_type_t type;
35474d7988d6SPaul Dagnelie 	uint64_t shift, start;
35484d7988d6SPaul Dagnelie 	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
35494d7988d6SPaul Dagnelie 	    &start, &shift);
35504d7988d6SPaul Dagnelie 
35514d7988d6SPaul Dagnelie 	condense_tree = range_tree_create(NULL, type, NULL, start, shift);
355216a4a807SGeorge Wilson 
35530713e232SGeorge Wilson 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
355486714001SSerapheim Dimitropoulos 		range_tree_walk(msp->ms_defer[t],
35554d7988d6SPaul Dagnelie 		    range_tree_add, condense_tree);
35560713e232SGeorge Wilson 	}
355716a4a807SGeorge Wilson 
3558814dcd43SSerapheim Dimitropoulos 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
355986714001SSerapheim Dimitropoulos 		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
35604d7988d6SPaul Dagnelie 		    range_tree_add, condense_tree);
35610713e232SGeorge Wilson 	}
356216a4a807SGeorge Wilson 
3563814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3564814dcd43SSerapheim Dimitropoulos 	    metaslab_unflushed_changes_memused(msp));
3565814dcd43SSerapheim Dimitropoulos 	spa->spa_unflushed_stats.sus_memused -=
3566814dcd43SSerapheim Dimitropoulos 	    metaslab_unflushed_changes_memused(msp);
3567814dcd43SSerapheim Dimitropoulos 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3568814dcd43SSerapheim Dimitropoulos 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3569814dcd43SSerapheim Dimitropoulos 
357016a4a807SGeorge Wilson 	/*
3571814dcd43SSerapheim Dimitropoulos 	 * We're about to drop the metaslab's lock thus allowing other
3572814dcd43SSerapheim Dimitropoulos 	 * consumers to change its content. Set the metaslab's ms_condensing
3573814dcd43SSerapheim Dimitropoulos 	 * flag to ensure that allocations on this metaslab do not occur
3574814dcd43SSerapheim Dimitropoulos 	 * while we're in the middle of committing it to disk. This is only
3575814dcd43SSerapheim Dimitropoulos 	 * critical for ms_allocatable as all other range trees use per TXG
357616a4a807SGeorge Wilson 	 * views of their content.
357716a4a807SGeorge Wilson 	 */
35780713e232SGeorge Wilson 	msp->ms_condensing = B_TRUE;
357916a4a807SGeorge Wilson 
358016a4a807SGeorge Wilson 	mutex_exit(&msp->ms_lock);
3581814dcd43SSerapheim Dimitropoulos 	uint64_t object = space_map_object(msp->ms_sm);
3582814dcd43SSerapheim Dimitropoulos 	space_map_truncate(sm,
3583814dcd43SSerapheim Dimitropoulos 	    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3584814dcd43SSerapheim Dimitropoulos 	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
358516a4a807SGeorge Wilson 
358616a4a807SGeorge Wilson 	/*
3587814dcd43SSerapheim Dimitropoulos 	 * space_map_truncate() may have reallocated the spacemap object.
3588814dcd43SSerapheim Dimitropoulos 	 * If so, update the vdev_ms_array.
3589814dcd43SSerapheim Dimitropoulos 	 */
3590814dcd43SSerapheim Dimitropoulos 	if (space_map_object(msp->ms_sm) != object) {
3591814dcd43SSerapheim Dimitropoulos 		object = space_map_object(msp->ms_sm);
3592814dcd43SSerapheim Dimitropoulos 		dmu_write(spa->spa_meta_objset,
3593814dcd43SSerapheim Dimitropoulos 		    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
3594814dcd43SSerapheim Dimitropoulos 		    msp->ms_id, sizeof (uint64_t), &object, tx);
3595814dcd43SSerapheim Dimitropoulos 	}
3596814dcd43SSerapheim Dimitropoulos 
3597814dcd43SSerapheim Dimitropoulos 	/*
3598814dcd43SSerapheim Dimitropoulos 	 * Note:
3599814dcd43SSerapheim Dimitropoulos 	 * When the log space map feature is enabled, each space map will
3600814dcd43SSerapheim Dimitropoulos 	 * always have ALLOCS followed by FREES for each sync pass. This is
3601814dcd43SSerapheim Dimitropoulos 	 * typically true even when the log space map feature is disabled,
3602814dcd43SSerapheim Dimitropoulos 	 * except from the case where a metaslab goes through metaslab_sync()
3603814dcd43SSerapheim Dimitropoulos 	 * and gets condensed. In that case the metaslab's space map will have
3604814dcd43SSerapheim Dimitropoulos 	 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
3605814dcd43SSerapheim Dimitropoulos 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
3606814dcd43SSerapheim Dimitropoulos 	 * sync pass 1.
360716a4a807SGeorge Wilson 	 */
36084d7988d6SPaul Dagnelie 	range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
36094d7988d6SPaul Dagnelie 	    shift);
36104d7988d6SPaul Dagnelie 	range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
36114d7988d6SPaul Dagnelie 	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
3612814dcd43SSerapheim Dimitropoulos 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
36134d7988d6SPaul Dagnelie 	space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
3614814dcd43SSerapheim Dimitropoulos 
36150713e232SGeorge Wilson 	range_tree_vacate(condense_tree, NULL, NULL);
36160713e232SGeorge Wilson 	range_tree_destroy(condense_tree);
36174d7988d6SPaul Dagnelie 	range_tree_vacate(tmp_tree, NULL, NULL);
36184d7988d6SPaul Dagnelie 	range_tree_destroy(tmp_tree);
36195cabbc6bSPrashanth Sreenivasa 	mutex_enter(&msp->ms_lock);
3620814dcd43SSerapheim Dimitropoulos 
36210713e232SGeorge Wilson 	msp->ms_condensing = B_FALSE;
3622814dcd43SSerapheim Dimitropoulos 	metaslab_flush_update(msp, tx);
3623814dcd43SSerapheim Dimitropoulos }
3624814dcd43SSerapheim Dimitropoulos 
3625814dcd43SSerapheim Dimitropoulos /*
3626814dcd43SSerapheim Dimitropoulos  * Called when the metaslab has been flushed (its own spacemap now reflects
3627814dcd43SSerapheim Dimitropoulos  * all the contents of the pool-wide spacemap log). Updates the metaslab's
3628814dcd43SSerapheim Dimitropoulos  * metadata and any pool-wide related log space map data (e.g. summary,
3629814dcd43SSerapheim Dimitropoulos  * obsolete logs, etc.) to reflect that.
3630814dcd43SSerapheim Dimitropoulos  */
3631814dcd43SSerapheim Dimitropoulos static void
3632814dcd43SSerapheim Dimitropoulos metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
3633814dcd43SSerapheim Dimitropoulos {
3634814dcd43SSerapheim Dimitropoulos 	metaslab_group_t *mg = msp->ms_group;
3635814dcd43SSerapheim Dimitropoulos 	spa_t *spa = mg->mg_vd->vdev_spa;
3636814dcd43SSerapheim Dimitropoulos 
3637814dcd43SSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
3638814dcd43SSerapheim Dimitropoulos 
3639814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa_sync_pass(spa), ==, 1);
3640814dcd43SSerapheim Dimitropoulos 	ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3641814dcd43SSerapheim Dimitropoulos 	ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3642814dcd43SSerapheim Dimitropoulos 
3643814dcd43SSerapheim Dimitropoulos 	/*
3644814dcd43SSerapheim Dimitropoulos 	 * Just because a metaslab got flushed, that doesn't mean that
3645814dcd43SSerapheim Dimitropoulos 	 * it will pass through metaslab_sync_done(). Thus, make sure to
3646814dcd43SSerapheim Dimitropoulos 	 * update ms_synced_length here in case it doesn't.
3647814dcd43SSerapheim Dimitropoulos 	 */
3648814dcd43SSerapheim Dimitropoulos 	msp->ms_synced_length = space_map_length(msp->ms_sm);
3649814dcd43SSerapheim Dimitropoulos 
3650814dcd43SSerapheim Dimitropoulos 	/*
3651814dcd43SSerapheim Dimitropoulos 	 * We may end up here from metaslab_condense() without the
3652814dcd43SSerapheim Dimitropoulos 	 * feature being active. In that case this is a no-op.
3653814dcd43SSerapheim Dimitropoulos 	 */
3654814dcd43SSerapheim Dimitropoulos 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
3655814dcd43SSerapheim Dimitropoulos 		return;
3656814dcd43SSerapheim Dimitropoulos 
3657814dcd43SSerapheim Dimitropoulos 	ASSERT(spa_syncing_log_sm(spa) != NULL);
3658814dcd43SSerapheim Dimitropoulos 	ASSERT(msp->ms_sm != NULL);
3659814dcd43SSerapheim Dimitropoulos 	ASSERT(metaslab_unflushed_txg(msp) != 0);
3660814dcd43SSerapheim Dimitropoulos 	ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3661814dcd43SSerapheim Dimitropoulos 
3662814dcd43SSerapheim Dimitropoulos 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3663814dcd43SSerapheim Dimitropoulos 
3664814dcd43SSerapheim Dimitropoulos 	/* update metaslab's position in our flushing tree */
3665814dcd43SSerapheim Dimitropoulos 	uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
3666814dcd43SSerapheim Dimitropoulos 	mutex_enter(&spa->spa_flushed_ms_lock);
3667814dcd43SSerapheim Dimitropoulos 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3668814dcd43SSerapheim Dimitropoulos 	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3669814dcd43SSerapheim Dimitropoulos 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
3670814dcd43SSerapheim Dimitropoulos 	mutex_exit(&spa->spa_flushed_ms_lock);
3671814dcd43SSerapheim Dimitropoulos 
3672814dcd43SSerapheim Dimitropoulos 	/* update metaslab counts of spa_log_sm_t nodes */
3673814dcd43SSerapheim Dimitropoulos 	spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
3674814dcd43SSerapheim Dimitropoulos 	spa_log_sm_increment_current_mscount(spa);
3675814dcd43SSerapheim Dimitropoulos 
3676814dcd43SSerapheim Dimitropoulos 	/* cleanup obsolete logs if any */
3677814dcd43SSerapheim Dimitropoulos 	uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
3678814dcd43SSerapheim Dimitropoulos 	spa_cleanup_old_sm_logs(spa, tx);
3679814dcd43SSerapheim Dimitropoulos 	uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
3680814dcd43SSerapheim Dimitropoulos 	VERIFY3U(log_blocks_after, <=, log_blocks_before);
3681814dcd43SSerapheim Dimitropoulos 
3682814dcd43SSerapheim Dimitropoulos 	/* update log space map summary */
3683814dcd43SSerapheim Dimitropoulos 	uint64_t blocks_gone = log_blocks_before - log_blocks_after;
3684814dcd43SSerapheim Dimitropoulos 	spa_log_summary_add_flushed_metaslab(spa);
3685814dcd43SSerapheim Dimitropoulos 	spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
3686814dcd43SSerapheim Dimitropoulos 	spa_log_summary_decrement_blkcount(spa, blocks_gone);
3687814dcd43SSerapheim Dimitropoulos }
3688814dcd43SSerapheim Dimitropoulos 
3689814dcd43SSerapheim Dimitropoulos boolean_t
3690814dcd43SSerapheim Dimitropoulos metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
3691814dcd43SSerapheim Dimitropoulos {
3692814dcd43SSerapheim Dimitropoulos 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3693814dcd43SSerapheim Dimitropoulos 
3694814dcd43SSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
3695814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa_sync_pass(spa), ==, 1);
3696814dcd43SSerapheim Dimitropoulos 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3697814dcd43SSerapheim Dimitropoulos 
3698814dcd43SSerapheim Dimitropoulos 	ASSERT(msp->ms_sm != NULL);
3699814dcd43SSerapheim Dimitropoulos 	ASSERT(metaslab_unflushed_txg(msp) != 0);
3700814dcd43SSerapheim Dimitropoulos 	ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
3701814dcd43SSerapheim Dimitropoulos 
3702814dcd43SSerapheim Dimitropoulos 	/*
3703814dcd43SSerapheim Dimitropoulos 	 * There is nothing wrong with flushing the same metaslab twice, as
3704814dcd43SSerapheim Dimitropoulos 	 * this codepath should work in that case. However, the current
3705814dcd43SSerapheim Dimitropoulos 	 * flushing scheme makes sure to avoid this situation as we would be
3706814dcd43SSerapheim Dimitropoulos 	 * making all these calls without having anything meaningful to write
3707814dcd43SSerapheim Dimitropoulos 	 * to disk. We assert this behavior here.
3708814dcd43SSerapheim Dimitropoulos 	 */
3709814dcd43SSerapheim Dimitropoulos 	ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
3710814dcd43SSerapheim Dimitropoulos 
3711814dcd43SSerapheim Dimitropoulos 	/*
3712814dcd43SSerapheim Dimitropoulos 	 * We cannot flush while loading, because then we would
3713814dcd43SSerapheim Dimitropoulos 	 * not load the ms_unflushed_{allocs,frees}.
3714814dcd43SSerapheim Dimitropoulos 	 */
3715814dcd43SSerapheim Dimitropoulos 	if (msp->ms_loading)
3716814dcd43SSerapheim Dimitropoulos 		return (B_FALSE);
3717814dcd43SSerapheim Dimitropoulos 
3718814dcd43SSerapheim Dimitropoulos 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3719814dcd43SSerapheim Dimitropoulos 	metaslab_verify_weight_and_frag(msp);
3720814dcd43SSerapheim Dimitropoulos 
3721814dcd43SSerapheim Dimitropoulos 	/*
3722814dcd43SSerapheim Dimitropoulos 	 * Metaslab condensing is effectively flushing. Therefore if the
3723814dcd43SSerapheim Dimitropoulos 	 * metaslab can be condensed we can just condense it instead of
3724814dcd43SSerapheim Dimitropoulos 	 * flushing it.
3725814dcd43SSerapheim Dimitropoulos 	 *
3726814dcd43SSerapheim Dimitropoulos 	 * Note that metaslab_condense() does call metaslab_flush_update()
3727814dcd43SSerapheim Dimitropoulos 	 * so we can just return immediately after condensing. We also
3728814dcd43SSerapheim Dimitropoulos 	 * don't need to care about setting ms_flushing or broadcasting
3729814dcd43SSerapheim Dimitropoulos 	 * ms_flush_cv, even if we temporarily drop the ms_lock in
3730814dcd43SSerapheim Dimitropoulos 	 * metaslab_condense(), as the metaslab is already loaded.
3731814dcd43SSerapheim Dimitropoulos 	 */
3732814dcd43SSerapheim Dimitropoulos 	if (msp->ms_loaded && metaslab_should_condense(msp)) {
3733814dcd43SSerapheim Dimitropoulos 		metaslab_group_t *mg = msp->ms_group;
3734814dcd43SSerapheim Dimitropoulos 
3735814dcd43SSerapheim Dimitropoulos 		/*
3736814dcd43SSerapheim Dimitropoulos 		 * For all histogram operations below refer to the
3737814dcd43SSerapheim Dimitropoulos 		 * comments of metaslab_sync() where we follow a
3738814dcd43SSerapheim Dimitropoulos 		 * similar procedure.
3739814dcd43SSerapheim Dimitropoulos 		 */
3740814dcd43SSerapheim Dimitropoulos 		metaslab_group_histogram_verify(mg);
3741814dcd43SSerapheim Dimitropoulos 		metaslab_class_histogram_verify(mg->mg_class);
3742814dcd43SSerapheim Dimitropoulos 		metaslab_group_histogram_remove(mg, msp);
3743814dcd43SSerapheim Dimitropoulos 
3744814dcd43SSerapheim Dimitropoulos 		metaslab_condense(msp, tx);
3745814dcd43SSerapheim Dimitropoulos 
3746814dcd43SSerapheim Dimitropoulos 		space_map_histogram_clear(msp->ms_sm);
3747814dcd43SSerapheim Dimitropoulos 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3748814dcd43SSerapheim Dimitropoulos 		ASSERT(range_tree_is_empty(msp->ms_freed));
3749814dcd43SSerapheim Dimitropoulos 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3750814dcd43SSerapheim Dimitropoulos 			space_map_histogram_add(msp->ms_sm,
3751814dcd43SSerapheim Dimitropoulos 			    msp->ms_defer[t], tx);
3752814dcd43SSerapheim Dimitropoulos 		}
3753814dcd43SSerapheim Dimitropoulos 		metaslab_aux_histograms_update(msp);
3754814dcd43SSerapheim Dimitropoulos 
3755814dcd43SSerapheim Dimitropoulos 		metaslab_group_histogram_add(mg, msp);
3756814dcd43SSerapheim Dimitropoulos 		metaslab_group_histogram_verify(mg);
3757814dcd43SSerapheim Dimitropoulos 		metaslab_class_histogram_verify(mg->mg_class);
3758814dcd43SSerapheim Dimitropoulos 
3759814dcd43SSerapheim Dimitropoulos 		metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3760814dcd43SSerapheim Dimitropoulos 
3761814dcd43SSerapheim Dimitropoulos 		/*
3762814dcd43SSerapheim Dimitropoulos 		 * Since we recreated the histogram (and potentially
3763814dcd43SSerapheim Dimitropoulos 		 * the ms_sm too while condensing) ensure that the
3764814dcd43SSerapheim Dimitropoulos 		 * weight is updated too because we are not guaranteed
3765814dcd43SSerapheim Dimitropoulos 		 * that this metaslab is dirty and will go through
3766814dcd43SSerapheim Dimitropoulos 		 * metaslab_sync_done().
3767814dcd43SSerapheim Dimitropoulos 		 */
3768814dcd43SSerapheim Dimitropoulos 		metaslab_recalculate_weight_and_sort(msp);
3769814dcd43SSerapheim Dimitropoulos 		return (B_TRUE);
3770814dcd43SSerapheim Dimitropoulos 	}
3771814dcd43SSerapheim Dimitropoulos 
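	/*
	 * Write out everything accumulated in the unflushed trees: append
	 * ms_unflushed_allocs and ms_unflushed_frees to this metaslab's
	 * space map (dropping ms_lock around the DMU calls), then vacate
	 * both trees and adjust the pool-wide memory accounting.
	 */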
3772814dcd43SSerapheim Dimitropoulos 	msp->ms_flushing = B_TRUE;
3773814dcd43SSerapheim Dimitropoulos 	uint64_t sm_len_before = space_map_length(msp->ms_sm);
3774814dcd43SSerapheim Dimitropoulos 
3775814dcd43SSerapheim Dimitropoulos 	mutex_exit(&msp->ms_lock);
3776814dcd43SSerapheim Dimitropoulos 	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
3777814dcd43SSerapheim Dimitropoulos 	    SM_NO_VDEVID, tx);
3778814dcd43SSerapheim Dimitropoulos 	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
3779814dcd43SSerapheim Dimitropoulos 	    SM_NO_VDEVID, tx);
3780814dcd43SSerapheim Dimitropoulos 	mutex_enter(&msp->ms_lock);
3781814dcd43SSerapheim Dimitropoulos 
3782814dcd43SSerapheim Dimitropoulos 	uint64_t sm_len_after = space_map_length(msp->ms_sm);
3783814dcd43SSerapheim Dimitropoulos 	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
3784814dcd43SSerapheim Dimitropoulos 		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
3785814dcd43SSerapheim Dimitropoulos 		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
3786814dcd43SSerapheim Dimitropoulos 		    "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
3787814dcd43SSerapheim Dimitropoulos 		    msp->ms_group->mg_vd->vdev_id, msp->ms_id,
3788814dcd43SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_unflushed_allocs),
3789814dcd43SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_unflushed_frees),
3790814dcd43SSerapheim Dimitropoulos 		    (sm_len_after - sm_len_before));
3791814dcd43SSerapheim Dimitropoulos 	}
3792814dcd43SSerapheim Dimitropoulos 
3793814dcd43SSerapheim Dimitropoulos 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3794814dcd43SSerapheim Dimitropoulos 	    metaslab_unflushed_changes_memused(msp));
3795814dcd43SSerapheim Dimitropoulos 	spa->spa_unflushed_stats.sus_memused -=
3796814dcd43SSerapheim Dimitropoulos 	    metaslab_unflushed_changes_memused(msp);
3797814dcd43SSerapheim Dimitropoulos 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3798814dcd43SSerapheim Dimitropoulos 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3799814dcd43SSerapheim Dimitropoulos 
3800814dcd43SSerapheim Dimitropoulos 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3801814dcd43SSerapheim Dimitropoulos 	metaslab_verify_weight_and_frag(msp);
3802814dcd43SSerapheim Dimitropoulos 
3803814dcd43SSerapheim Dimitropoulos 	metaslab_flush_update(msp, tx);
3804814dcd43SSerapheim Dimitropoulos 
3805814dcd43SSerapheim Dimitropoulos 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3806814dcd43SSerapheim Dimitropoulos 	metaslab_verify_weight_and_frag(msp);
3807814dcd43SSerapheim Dimitropoulos 
3808814dcd43SSerapheim Dimitropoulos 	msp->ms_flushing = B_FALSE;
3809814dcd43SSerapheim Dimitropoulos 	cv_broadcast(&msp->ms_flush_cv);
3810814dcd43SSerapheim Dimitropoulos 	return (B_TRUE);
381116a4a807SGeorge Wilson }
381216a4a807SGeorge Wilson 
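/*
 * Illustrative sketch (not part of the build): a syncing-context caller
 * that walks spa_metaslabs_by_flushed would typically invoke
 * metaslab_flush() under the metaslab's lock during sync pass 1, e.g.:
 *
 *	mutex_enter(&msp->ms_lock);
 *	if (metaslab_flush(msp, tx))
 *		flushed++;
 *	mutex_exit(&msp->ms_lock);
 *
 * The real caller and its selection policy live in the log space map
 * code; the fragment above only shows the locking and sync-pass
 * expectations asserted at the top of metaslab_flush().
 */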
3813fa9e4066Sahrens /*
3814fa9e4066Sahrens  * Write a metaslab to disk in the context of the specified transaction group.
3815fa9e4066Sahrens  */
3816fa9e4066Sahrens void
3817fa9e4066Sahrens metaslab_sync(metaslab_t *msp, uint64_t txg)
3818fa9e4066Sahrens {
38190713e232SGeorge Wilson 	metaslab_group_t *mg = msp->ms_group;
38200713e232SGeorge Wilson 	vdev_t *vd = mg->mg_vd;
3821fa9e4066Sahrens 	spa_t *spa = vd->vdev_spa;
3822b24ab676SJeff Bonwick 	objset_t *mos = spa_meta_objset(spa);
382386714001SSerapheim Dimitropoulos 	range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
3824fa9e4066Sahrens 	dmu_tx_t *tx;
3825fa9e4066Sahrens 
382688ecc943SGeorge Wilson 	ASSERT(!vd->vdev_ishole);
382788ecc943SGeorge Wilson 
382816a4a807SGeorge Wilson 	/*
382916a4a807SGeorge Wilson 	 * This metaslab has just been added so there's no work to do now.
383016a4a807SGeorge Wilson 	 */
383186714001SSerapheim Dimitropoulos 	if (msp->ms_freeing == NULL) {
38320713e232SGeorge Wilson 		ASSERT3P(alloctree, ==, NULL);
383316a4a807SGeorge Wilson 		return;
383416a4a807SGeorge Wilson 	}
383516a4a807SGeorge Wilson 
38360713e232SGeorge Wilson 	ASSERT3P(alloctree, !=, NULL);
383786714001SSerapheim Dimitropoulos 	ASSERT3P(msp->ms_freeing, !=, NULL);
383886714001SSerapheim Dimitropoulos 	ASSERT3P(msp->ms_freed, !=, NULL);
383986714001SSerapheim Dimitropoulos 	ASSERT3P(msp->ms_checkpointing, !=, NULL);
3840084fd14fSBrian Behlendorf 	ASSERT3P(msp->ms_trim, !=, NULL);
384116a4a807SGeorge Wilson 
38422e4c9986SGeorge Wilson 	/*
384386714001SSerapheim Dimitropoulos 	 * Normally, we don't want to process a metaslab if there are no
384486714001SSerapheim Dimitropoulos 	 * allocations or frees to perform. However, if the metaslab is being
3845af1d63abSPaul Dagnelie 	 * forced to condense, it's loaded, and we're not beyond the final
3846af1d63abSPaul Dagnelie 	 * dirty txg, we need to let it through. Not condensing beyond the
3847af1d63abSPaul Dagnelie 	 * final dirty txg prevents an issue where metaslabs that need to be
3848af1d63abSPaul Dagnelie 	 * condensed but were loaded for other reasons could cause a panic
3849af1d63abSPaul Dagnelie 	 * here. By only checking the txg in that branch of the conditional,
3850af1d63abSPaul Dagnelie 	 * we preserve the utility of the VERIFY statements in all other
3851af1d63abSPaul Dagnelie 	 * cases.
38522e4c9986SGeorge Wilson 	 */
385386714001SSerapheim Dimitropoulos 	if (range_tree_is_empty(alloctree) &&
385486714001SSerapheim Dimitropoulos 	    range_tree_is_empty(msp->ms_freeing) &&
385586714001SSerapheim Dimitropoulos 	    range_tree_is_empty(msp->ms_checkpointing) &&
3856af1d63abSPaul Dagnelie 	    !(msp->ms_loaded && msp->ms_condense_wanted &&
3857af1d63abSPaul Dagnelie 	    txg <= spa_final_dirty_txg(spa)))
3858468c413aSTim Haley 		return;
3859fa9e4066Sahrens 
38603991b535SGeorge Wilson 
38614d7988d6SPaul Dagnelie 	VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
38623991b535SGeorge Wilson 
3863ecc2d604Sbonwick 	/*
3864555d674dSSerapheim Dimitropoulos 	 * The only state that can actually be changing concurrently
3865555d674dSSerapheim Dimitropoulos 	 * with metaslab_sync() is the metaslab's ms_allocatable. No
3866555d674dSSerapheim Dimitropoulos 	 * other thread can be modifying this txg's alloc, freeing,
386786714001SSerapheim Dimitropoulos 	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
3868555d674dSSerapheim Dimitropoulos 	 * could call into the DMU, because the DMU can call down to
3869555d674dSSerapheim Dimitropoulos 	 * us (e.g. via zio_free()) at any time.
38705cabbc6bSPrashanth Sreenivasa 	 *
38715cabbc6bSPrashanth Sreenivasa 	 * The spa_vdev_remove_thread() can be reading metaslab state
3872555d674dSSerapheim Dimitropoulos 	 * concurrently, and it is locked out by the ms_sync_lock.
3873555d674dSSerapheim Dimitropoulos 	 * Note that the ms_lock is insufficient for this, because it
3874555d674dSSerapheim Dimitropoulos 	 * is dropped by space_map_write().
3875ecc2d604Sbonwick 	 */
3876468c413aSTim Haley 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3877fa9e4066Sahrens 
3878814dcd43SSerapheim Dimitropoulos 	/*
3879814dcd43SSerapheim Dimitropoulos 	 * Generate a log space map if one doesn't exist already.
3880814dcd43SSerapheim Dimitropoulos 	 */
3881814dcd43SSerapheim Dimitropoulos 	spa_generate_syncing_log_sm(spa, tx);
38820713e232SGeorge Wilson 
3883814dcd43SSerapheim Dimitropoulos 	if (msp->ms_sm == NULL) {
3884814dcd43SSerapheim Dimitropoulos 		uint64_t new_object = space_map_alloc(mos,
3885814dcd43SSerapheim Dimitropoulos 		    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3886814dcd43SSerapheim Dimitropoulos 		    zfs_metaslab_sm_blksz_with_log :
3887814dcd43SSerapheim Dimitropoulos 		    zfs_metaslab_sm_blksz_no_log, tx);
38880713e232SGeorge Wilson 		VERIFY3U(new_object, !=, 0);
38890713e232SGeorge Wilson 
3890814dcd43SSerapheim Dimitropoulos 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
3891814dcd43SSerapheim Dimitropoulos 		    msp->ms_id, sizeof (uint64_t), &new_object, tx);
3892814dcd43SSerapheim Dimitropoulos 
38930713e232SGeorge Wilson 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
38945cabbc6bSPrashanth Sreenivasa 		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
38950713e232SGeorge Wilson 		ASSERT(msp->ms_sm != NULL);
3896814dcd43SSerapheim Dimitropoulos 
3897814dcd43SSerapheim Dimitropoulos 		ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3898814dcd43SSerapheim Dimitropoulos 		ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3899555d674dSSerapheim Dimitropoulos 		ASSERT0(metaslab_allocated_space(msp));
3900ecc2d604Sbonwick 	}
3901fa9e4066Sahrens 
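	/*
	 * A metaslab that has never been flushed is registered with the
	 * log space map machinery on its first sync while the feature is
	 * active: record the syncing txg as its unflushed txg, bump the
	 * current log's metaslab count and summary, and insert it into
	 * the spa_metaslabs_by_flushed tree.
	 */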
3902814dcd43SSerapheim Dimitropoulos 	if (metaslab_unflushed_txg(msp) == 0 &&
3903814dcd43SSerapheim Dimitropoulos 	    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
3904814dcd43SSerapheim Dimitropoulos 		ASSERT(spa_syncing_log_sm(spa) != NULL);
3905814dcd43SSerapheim Dimitropoulos 
3906814dcd43SSerapheim Dimitropoulos 		metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3907814dcd43SSerapheim Dimitropoulos 		spa_log_sm_increment_current_mscount(spa);
3908814dcd43SSerapheim Dimitropoulos 		spa_log_summary_add_flushed_metaslab(spa);
3909814dcd43SSerapheim Dimitropoulos 
3910814dcd43SSerapheim Dimitropoulos 		ASSERT(msp->ms_sm != NULL);
3911814dcd43SSerapheim Dimitropoulos 		mutex_enter(&spa->spa_flushed_ms_lock);
3912814dcd43SSerapheim Dimitropoulos 		avl_add(&spa->spa_metaslabs_by_flushed, msp);
3913814dcd43SSerapheim Dimitropoulos 		mutex_exit(&spa->spa_flushed_ms_lock);
3914814dcd43SSerapheim Dimitropoulos 
3915814dcd43SSerapheim Dimitropoulos 		ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3916814dcd43SSerapheim Dimitropoulos 		ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3917814dcd43SSerapheim Dimitropoulos 	}
3918814dcd43SSerapheim Dimitropoulos 
391986714001SSerapheim Dimitropoulos 	if (!range_tree_is_empty(msp->ms_checkpointing) &&
392086714001SSerapheim Dimitropoulos 	    vd->vdev_checkpoint_sm == NULL) {
392186714001SSerapheim Dimitropoulos 		ASSERT(spa_has_checkpoint(spa));
392286714001SSerapheim Dimitropoulos 
392386714001SSerapheim Dimitropoulos 		uint64_t new_object = space_map_alloc(mos,
3924814dcd43SSerapheim Dimitropoulos 		    zfs_vdev_standard_sm_blksz, tx);
392586714001SSerapheim Dimitropoulos 		VERIFY3U(new_object, !=, 0);
392686714001SSerapheim Dimitropoulos 
392786714001SSerapheim Dimitropoulos 		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
392886714001SSerapheim Dimitropoulos 		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
392986714001SSerapheim Dimitropoulos 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
393086714001SSerapheim Dimitropoulos 
393186714001SSerapheim Dimitropoulos 		/*
393286714001SSerapheim Dimitropoulos 		 * We save the space map object as an entry in vdev_top_zap
393386714001SSerapheim Dimitropoulos 		 * so it can be retrieved when the pool is reopened after an
393486714001SSerapheim Dimitropoulos 		 * export or through zdb.
393586714001SSerapheim Dimitropoulos 		 */
393686714001SSerapheim Dimitropoulos 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
393786714001SSerapheim Dimitropoulos 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
393886714001SSerapheim Dimitropoulos 		    sizeof (new_object), 1, &new_object, tx));
393986714001SSerapheim Dimitropoulos 	}
394086714001SSerapheim Dimitropoulos 
39415cabbc6bSPrashanth Sreenivasa 	mutex_enter(&msp->ms_sync_lock);
3942468c413aSTim Haley 	mutex_enter(&msp->ms_lock);
3943468c413aSTim Haley 
3944b1be2892SMatthew Ahrens 	/*
39458363e80aSGeorge Wilson 	 * Note: metaslab_condense() clears the space map's histogram.
3946b1be2892SMatthew Ahrens 	 * Therefore we must verify and remove this histogram before
3947b1be2892SMatthew Ahrens 	 * condensing.
3948b1be2892SMatthew Ahrens 	 */
3949b1be2892SMatthew Ahrens 	metaslab_group_histogram_verify(mg);
3950b1be2892SMatthew Ahrens 	metaslab_class_histogram_verify(mg->mg_class);
3951b1be2892SMatthew Ahrens 	metaslab_group_histogram_remove(mg, msp);
3952b1be2892SMatthew Ahrens 
3953814dcd43SSerapheim Dimitropoulos 	if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
3954814dcd43SSerapheim Dimitropoulos 	    metaslab_should_condense(msp))
3955814dcd43SSerapheim Dimitropoulos 		metaslab_condense(msp, tx);
3956814dcd43SSerapheim Dimitropoulos 
3957814dcd43SSerapheim Dimitropoulos 	/*
3958814dcd43SSerapheim Dimitropoulos 	 * We'll be going to disk to sync our space accounting, thus we
3959814dcd43SSerapheim Dimitropoulos 	 * drop the ms_lock during that time so allocations coming from
3960814dcd43SSerapheim Dimitropoulos 	 * open-context (ZIL) for future TXGs do not block.
3961814dcd43SSerapheim Dimitropoulos 	 */
3962814dcd43SSerapheim Dimitropoulos 	mutex_exit(&msp->ms_lock);
3963814dcd43SSerapheim Dimitropoulos 	space_map_t *log_sm = spa_syncing_log_sm(spa);
3964814dcd43SSerapheim Dimitropoulos 	if (log_sm != NULL) {
3965814dcd43SSerapheim Dimitropoulos 		ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3966814dcd43SSerapheim Dimitropoulos 
3967814dcd43SSerapheim Dimitropoulos 		space_map_write(log_sm, alloctree, SM_ALLOC,
3968814dcd43SSerapheim Dimitropoulos 		    vd->vdev_id, tx);
3969814dcd43SSerapheim Dimitropoulos 		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
3970814dcd43SSerapheim Dimitropoulos 		    vd->vdev_id, tx);
3971814dcd43SSerapheim Dimitropoulos 		mutex_enter(&msp->ms_lock);
3972814dcd43SSerapheim Dimitropoulos 
3973814dcd43SSerapheim Dimitropoulos 		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3974814dcd43SSerapheim Dimitropoulos 		    metaslab_unflushed_changes_memused(msp));
3975814dcd43SSerapheim Dimitropoulos 		spa->spa_unflushed_stats.sus_memused -=
3976814dcd43SSerapheim Dimitropoulos 		    metaslab_unflushed_changes_memused(msp);
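		/*
		 * Fold this txg's changes into the unflushed trees: a new
		 * allocation cancels any pending unflushed free of the same
		 * range (and vice versa), and whatever does not cancel out
		 * is recorded as an unflushed alloc or free respectively.
		 */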
3977814dcd43SSerapheim Dimitropoulos 		range_tree_remove_xor_add(alloctree,
3978814dcd43SSerapheim Dimitropoulos 		    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
3979814dcd43SSerapheim Dimitropoulos 		range_tree_remove_xor_add(msp->ms_freeing,
3980814dcd43SSerapheim Dimitropoulos 		    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
3981814dcd43SSerapheim Dimitropoulos 		spa->spa_unflushed_stats.sus_memused +=
3982814dcd43SSerapheim Dimitropoulos 		    metaslab_unflushed_changes_memused(msp);
398316a4a807SGeorge Wilson 	} else {
3984814dcd43SSerapheim Dimitropoulos 		ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3985814dcd43SSerapheim Dimitropoulos 
398617f11284SSerapheim Dimitropoulos 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
398717f11284SSerapheim Dimitropoulos 		    SM_NO_VDEVID, tx);
398817f11284SSerapheim Dimitropoulos 		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
398917f11284SSerapheim Dimitropoulos 		    SM_NO_VDEVID, tx);
39905cabbc6bSPrashanth Sreenivasa 		mutex_enter(&msp->ms_lock);
399116a4a807SGeorge Wilson 	}
3992468c413aSTim Haley 
3993555d674dSSerapheim Dimitropoulos 	msp->ms_allocated_space += range_tree_space(alloctree);
3994555d674dSSerapheim Dimitropoulos 	ASSERT3U(msp->ms_allocated_space, >=,
3995555d674dSSerapheim Dimitropoulos 	    range_tree_space(msp->ms_freeing));
3996555d674dSSerapheim Dimitropoulos 	msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
3997555d674dSSerapheim Dimitropoulos 
399886714001SSerapheim Dimitropoulos 	if (!range_tree_is_empty(msp->ms_checkpointing)) {
399986714001SSerapheim Dimitropoulos 		ASSERT(spa_has_checkpoint(spa));
400086714001SSerapheim Dimitropoulos 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
400186714001SSerapheim Dimitropoulos 
400286714001SSerapheim Dimitropoulos 		/*
400386714001SSerapheim Dimitropoulos 		 * Since we are doing writes to disk and the ms_checkpointing
400486714001SSerapheim Dimitropoulos 		 * tree won't be changing during that time, we drop the
4005814dcd43SSerapheim Dimitropoulos 		 * ms_lock while writing to the checkpoint space map, for the
4006814dcd43SSerapheim Dimitropoulos 		 * same reason mentioned above.
400786714001SSerapheim Dimitropoulos 		 */
400886714001SSerapheim Dimitropoulos 		mutex_exit(&msp->ms_lock);
400986714001SSerapheim Dimitropoulos 		space_map_write(vd->vdev_checkpoint_sm,
401017f11284SSerapheim Dimitropoulos 		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
401186714001SSerapheim Dimitropoulos 		mutex_enter(&msp->ms_lock);
401286714001SSerapheim Dimitropoulos 
401386714001SSerapheim Dimitropoulos 		spa->spa_checkpoint_info.sci_dspace +=
401486714001SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_checkpointing);
401586714001SSerapheim Dimitropoulos 		vd->vdev_stat.vs_checkpoint_space +=
401686714001SSerapheim Dimitropoulos 		    range_tree_space(msp->ms_checkpointing);
401786714001SSerapheim Dimitropoulos 		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
4018555d674dSSerapheim Dimitropoulos 		    -space_map_allocated(vd->vdev_checkpoint_sm));
401986714001SSerapheim Dimitropoulos 
402086714001SSerapheim Dimitropoulos 		range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
402186714001SSerapheim Dimitropoulos 	}
402286714001SSerapheim Dimitropoulos 
40230713e232SGeorge Wilson 	if (msp->ms_loaded) {
40240713e232SGeorge Wilson 		/*
40255cabbc6bSPrashanth Sreenivasa 		 * When the space map is loaded, we have an accurate
40260713e232SGeorge Wilson 		 * histogram in the range tree. This gives us an opportunity
40270713e232SGeorge Wilson 		 * to bring the space map's histogram up-to-date so we clear
40280713e232SGeorge Wilson 		 * it first before updating it.
40290713e232SGeorge Wilson 		 */
40300713e232SGeorge Wilson 		space_map_histogram_clear(msp->ms_sm);
403186714001SSerapheim Dimitropoulos 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
40328363e80aSGeorge Wilson 
40330713e232SGeorge Wilson 		/*
40348363e80aSGeorge Wilson 		 * Since we've cleared the histogram we need to add back
40358363e80aSGeorge Wilson 		 * any free space that has already been processed, plus
40368363e80aSGeorge Wilson 		 * any deferred space. This allows the on-disk histogram
40378363e80aSGeorge Wilson 		 * to accurately reflect all free space even if some space
40388363e80aSGeorge Wilson 		 * is not yet available for allocation (i.e. deferred).
40390713e232SGeorge Wilson 		 */
404086714001SSerapheim Dimitropoulos 		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
40418363e80aSGeorge Wilson 
40428363e80aSGeorge Wilson 		/*
40438363e80aSGeorge Wilson 		 * Add back any deferred free space that has not been
40448363e80aSGeorge Wilson 		 * added back into the in-core free tree yet. This will
40458363e80aSGeorge Wilson 		 * ensure that we don't end up with a space map histogram
40468363e80aSGeorge Wilson 		 * that is completely empty unless the metaslab is fully
40478363e80aSGeorge Wilson 		 * allocated.
40488363e80aSGeorge Wilson 		 */
40498363e80aSGeorge Wilson 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
40508363e80aSGeorge Wilson 			space_map_histogram_add(msp->ms_sm,
405186714001SSerapheim Dimitropoulos 			    msp->ms_defer[t], tx);
40528363e80aSGeorge Wilson 		}
40530713e232SGeorge Wilson 	}
40548363e80aSGeorge Wilson 
40558363e80aSGeorge Wilson 	/*
40568363e80aSGeorge Wilson 	 * Always add the free space from this sync pass to the space
40578363e80aSGeorge Wilson 	 * map histogram. We want to make sure that the on-disk histogram
40588363e80aSGeorge Wilson 	 * accounts for all free space. If the space map is not loaded,
40598363e80aSGeorge Wilson 	 * then we will lose some accuracy but will correct it the next
40608363e80aSGeorge Wilson 	 * time we load the space map.
40618363e80aSGeorge Wilson 	 */
406286714001SSerapheim Dimitropoulos 	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
4063555d674dSSerapheim Dimitropoulos 	metaslab_aux_histograms_update(msp);
40648363e80aSGeorge Wilson 
40652e4c9986SGeorge Wilson 	metaslab_group_histogram_add(mg, msp);
40662e4c9986SGeorge Wilson 	metaslab_group_histogram_verify(mg);
40672e4c9986SGeorge Wilson 	metaslab_class_histogram_verify(mg->mg_class);
4068fa9e4066Sahrens 
406916a4a807SGeorge Wilson 	/*
40700713e232SGeorge Wilson 	 * For sync pass 1, we avoid traversing this txg's free range tree
4071555d674dSSerapheim Dimitropoulos 	 * and instead will just swap the pointers for freeing and freed.
4072555d674dSSerapheim Dimitropoulos 	 * We can safely do this since the freed_tree is guaranteed to be
4073555d674dSSerapheim Dimitropoulos 	 * empty on the initial pass.
4074814dcd43SSerapheim Dimitropoulos 	 *
4075814dcd43SSerapheim Dimitropoulos 	 * Keep in mind that even if we are currently using a log spacemap
4076814dcd43SSerapheim Dimitropoulos 	 * we want current frees to end up in the ms_allocatable (but not
4077814dcd43SSerapheim Dimitropoulos 	 * get appended to the ms_sm) so their ranges can be reused as usual.
407816a4a807SGeorge Wilson 	 */
407916a4a807SGeorge Wilson 	if (spa_sync_pass(spa) == 1) {
408086714001SSerapheim Dimitropoulos 		range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
4081555d674dSSerapheim Dimitropoulos 		ASSERT0(msp->ms_allocated_this_txg);
408216a4a807SGeorge Wilson 	} else {
408386714001SSerapheim Dimitropoulos 		range_tree_vacate(msp->ms_freeing,
408486714001SSerapheim Dimitropoulos 		    range_tree_add, msp->ms_freed);
4085fa9e4066Sahrens 	}
4086555d674dSSerapheim Dimitropoulos 	msp->ms_allocated_this_txg += range_tree_space(alloctree);
40872e4c9986SGeorge Wilson 	range_tree_vacate(alloctree, NULL, NULL);
4088fa9e4066Sahrens 
408986714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
409086714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
409186714001SSerapheim Dimitropoulos 	    & TXG_MASK]));
409286714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_freeing));
409386714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_checkpointing));
4094fa9e4066Sahrens 
4095fa9e4066Sahrens 	mutex_exit(&msp->ms_lock);
4096fa9e4066Sahrens 
4097814dcd43SSerapheim Dimitropoulos 	/*
4098814dcd43SSerapheim Dimitropoulos 	 * Verify that the space map object ID has been recorded in the
4099814dcd43SSerapheim Dimitropoulos 	 * vdev_ms_array.
4100814dcd43SSerapheim Dimitropoulos 	 */
4101814dcd43SSerapheim Dimitropoulos 	uint64_t object;
4102814dcd43SSerapheim Dimitropoulos 	VERIFY0(dmu_read(mos, vd->vdev_ms_array,
4103814dcd43SSerapheim Dimitropoulos 	    msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
4104814dcd43SSerapheim Dimitropoulos 	VERIFY3U(object, ==, space_map_object(msp->ms_sm));
4105814dcd43SSerapheim Dimitropoulos 
41065cabbc6bSPrashanth Sreenivasa 	mutex_exit(&msp->ms_sync_lock);
4107ecc2d604Sbonwick 	dmu_tx_commit(tx);
4108fa9e4066Sahrens }
4109fa9e4066Sahrens 
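/*
 * Evict an idle metaslab: verify that no allocations are pending in any
 * in-flight txg, passivate the metaslab if it is still active for an
 * allocator, and unload its range trees unless metaslab_debug_unload is
 * set.
 */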
4110af1d63abSPaul Dagnelie static void
4111af1d63abSPaul Dagnelie metaslab_evict(metaslab_t *msp, uint64_t txg)
4112af1d63abSPaul Dagnelie {
4113af1d63abSPaul Dagnelie 	if (!msp->ms_loaded || msp->ms_disabled != 0)
4114af1d63abSPaul Dagnelie 		return;
4115af1d63abSPaul Dagnelie 
4116af1d63abSPaul Dagnelie 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
4117af1d63abSPaul Dagnelie 		VERIFY0(range_tree_space(
4118af1d63abSPaul Dagnelie 		    msp->ms_allocating[(txg + t) & TXG_MASK]));
4119af1d63abSPaul Dagnelie 	}
4120af1d63abSPaul Dagnelie 	if (msp->ms_allocator != -1)
4121af1d63abSPaul Dagnelie 		metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
4122af1d63abSPaul Dagnelie 
4123af1d63abSPaul Dagnelie 	if (!metaslab_debug_unload)
4124af1d63abSPaul Dagnelie 		metaslab_unload(msp);
4125af1d63abSPaul Dagnelie }
4126af1d63abSPaul Dagnelie 
4127fa9e4066Sahrens /*
4128fa9e4066Sahrens  * Called after a transaction group has completely synced to mark
4129fa9e4066Sahrens  * all of the metaslab's free space as usable.
4130fa9e4066Sahrens  */
4131fa9e4066Sahrens void
4132fa9e4066Sahrens metaslab_sync_done(metaslab_t *msp, uint64_t txg)
4133fa9e4066Sahrens {
4134ecc2d604Sbonwick 	metaslab_group_t *mg = msp->ms_group;
4135ecc2d604Sbonwick 	vdev_t *vd = mg->mg_vd;
41368363e80aSGeorge Wilson 	spa_t *spa = vd->vdev_spa;
41370713e232SGeorge Wilson 	range_tree_t **defer_tree;
4138468c413aSTim Haley 	int64_t alloc_delta, defer_delta;
41398363e80aSGeorge Wilson 	boolean_t defer_allowed = B_TRUE;
4140fa9e4066Sahrens 
414188ecc943SGeorge Wilson 	ASSERT(!vd->vdev_ishole);
414288ecc943SGeorge Wilson 
4143fa9e4066Sahrens 	mutex_enter(&msp->ms_lock);
4144fa9e4066Sahrens 
4145ecc2d604Sbonwick 	/*
4146ecc2d604Sbonwick 	 * If this metaslab is just becoming available, initialize its
41475f145778SMatthew Ahrens 	 * range trees and add its capacity to the vdev.
4148ecc2d604Sbonwick 	 */
414986714001SSerapheim Dimitropoulos 	if (msp->ms_freed == NULL) {
41504d7988d6SPaul Dagnelie 		range_seg_type_t type;
41514d7988d6SPaul Dagnelie 		uint64_t shift, start;
41524d7988d6SPaul Dagnelie 		type = metaslab_calculate_range_tree_type(vd, msp, &start,
41534d7988d6SPaul Dagnelie 		    &shift);
41544d7988d6SPaul Dagnelie 
4155468c413aSTim Haley 		for (int t = 0; t < TXG_SIZE; t++) {
415686714001SSerapheim Dimitropoulos 			ASSERT(msp->ms_allocating[t] == NULL);
41570713e232SGeorge Wilson 
41584d7988d6SPaul Dagnelie 			msp->ms_allocating[t] = range_tree_create(NULL, type,
41594d7988d6SPaul Dagnelie 			    NULL, start, shift);
4160fa9e4066Sahrens 		}
4161468c413aSTim Haley 
416286714001SSerapheim Dimitropoulos 		ASSERT3P(msp->ms_freeing, ==, NULL);
41634d7988d6SPaul Dagnelie 		msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
41644d7988d6SPaul Dagnelie 		    shift);
41655f145778SMatthew Ahrens 
416686714001SSerapheim Dimitropoulos 		ASSERT3P(msp->ms_freed, ==, NULL);
41674d7988d6SPaul Dagnelie 		msp->ms_freed = range_tree_create(NULL, type, NULL, start,
41684d7988d6SPaul Dagnelie 		    shift);
41695f145778SMatthew Ahrens 
417016a4a807SGeorge Wilson 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
4171814dcd43SSerapheim Dimitropoulos 			ASSERT3P(msp->ms_defer[t], ==, NULL);
41724d7988d6SPaul Dagnelie 			msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
41734d7988d6SPaul Dagnelie 			    start, shift);
41740713e232SGeorge Wilson 		}
4175468c413aSTim Haley 
417686714001SSerapheim Dimitropoulos 		ASSERT3P(msp->ms_checkpointing, ==, NULL);
41774d7988d6SPaul Dagnelie 		msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
41784d7988d6SPaul Dagnelie 		    start, shift);
417986714001SSerapheim Dimitropoulos 
4180814dcd43SSerapheim Dimitropoulos 		ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
41814d7988d6SPaul Dagnelie 		msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
41824d7988d6SPaul Dagnelie 		    start, shift);
41834d7988d6SPaul Dagnelie 
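		/*
		 * The unflushed frees tree is created with metaslab_rt_ops
		 * so that a size-sorted btree (ms_unflushed_frees_by_size)
		 * is maintained alongside it, allowing segments to be
		 * looked up by length.
		 */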
41844d7988d6SPaul Dagnelie 		metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
41854d7988d6SPaul Dagnelie 		mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
41864d7988d6SPaul Dagnelie 		mrap->mra_floor_shift = metaslab_by_size_min_shift;
4187814dcd43SSerapheim Dimitropoulos 		ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
41884d7988d6SPaul Dagnelie 		msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
41894d7988d6SPaul Dagnelie 		    type, mrap, start, shift);
4190814dcd43SSerapheim Dimitropoulos 
4191663207adSDon Brady 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
4192fa9e4066Sahrens 	}
419386714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_freeing));
419486714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_checkpointing));
4195fa9e4066Sahrens 
419686714001SSerapheim Dimitropoulos 	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
41970713e232SGeorge Wilson 
41988363e80aSGeorge Wilson 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
41998363e80aSGeorge Wilson 	    metaslab_class_get_alloc(spa_normal_class(spa));
42005cabbc6bSPrashanth Sreenivasa 	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
42018363e80aSGeorge Wilson 		defer_allowed = B_FALSE;
42028363e80aSGeorge Wilson 	}
42038363e80aSGeorge Wilson 
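	/*
	 * alloc_delta is the net change in allocated space this txg
	 * (allocations minus frees). defer_delta captures how the deferred
	 * free bucket changes: when deferral is allowed this txg's frees
	 * replace the bucket we are about to return to ms_allocatable;
	 * otherwise the bucket simply drains.
	 */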
42048363e80aSGeorge Wilson 	defer_delta = 0;
4205555d674dSSerapheim Dimitropoulos 	alloc_delta = msp->ms_allocated_this_txg -
4206555d674dSSerapheim Dimitropoulos 	    range_tree_space(msp->ms_freed);
4207814dcd43SSerapheim Dimitropoulos 
42088363e80aSGeorge Wilson 	if (defer_allowed) {
420986714001SSerapheim Dimitropoulos 		defer_delta = range_tree_space(msp->ms_freed) -
42108363e80aSGeorge Wilson 		    range_tree_space(*defer_tree);
42118363e80aSGeorge Wilson 	} else {
42128363e80aSGeorge Wilson 		defer_delta -= range_tree_space(*defer_tree);
42138363e80aSGeorge Wilson 	}
4214663207adSDon Brady 	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
4215663207adSDon Brady 	    defer_delta, 0);
4216fa9e4066Sahrens 
4217814dcd43SSerapheim Dimitropoulos 	if (spa_syncing_log_sm(spa) == NULL) {
4218814dcd43SSerapheim Dimitropoulos 		/*
4219814dcd43SSerapheim Dimitropoulos 		 * If there's a metaslab_load() in progress and we don't have
4220814dcd43SSerapheim Dimitropoulos 		 * a log space map, it means that we probably wrote to the
4221814dcd43SSerapheim Dimitropoulos 		 * metaslab's space map. If this is the case, we need to
4222814dcd43SSerapheim Dimitropoulos 		 * make sure that we wait for the load to complete so that we
4223814dcd43SSerapheim Dimitropoulos 		 * have a consistent view of the in-core side of the metaslab.
4224814dcd43SSerapheim Dimitropoulos 		 */
4225814dcd43SSerapheim Dimitropoulos 		metaslab_load_wait(msp);
4226814dcd43SSerapheim Dimitropoulos 	} else {
4227814dcd43SSerapheim Dimitropoulos 		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
4228814dcd43SSerapheim Dimitropoulos 	}
42299eb57f7fSGeorge Wilson 
4230084fd14fSBrian Behlendorf 	/*
4231084fd14fSBrian Behlendorf 	 * When auto-trimming is enabled, free ranges which are added to
4232084fd14fSBrian Behlendorf 	 * ms_allocatable are also added to ms_trim.  The ms_trim tree is
4233084fd14fSBrian Behlendorf 	 * periodically consumed by the vdev_autotrim_thread() which issues
4234084fd14fSBrian Behlendorf 	 * trims for all ranges and then vacates the tree.  The ms_trim tree
4235084fd14fSBrian Behlendorf 	 * can be discarded at any time with the sole consequence of recent
4236084fd14fSBrian Behlendorf 	 * frees not being trimmed.
4237084fd14fSBrian Behlendorf 	 */
4238084fd14fSBrian Behlendorf 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
4239084fd14fSBrian Behlendorf 		range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
4240084fd14fSBrian Behlendorf 		if (!defer_allowed) {
4241084fd14fSBrian Behlendorf 			range_tree_walk(msp->ms_freed, range_tree_add,
4242084fd14fSBrian Behlendorf 			    msp->ms_trim);
4243084fd14fSBrian Behlendorf 		}
4244084fd14fSBrian Behlendorf 	} else {
4245084fd14fSBrian Behlendorf 		range_tree_vacate(msp->ms_trim, NULL, NULL);
4246084fd14fSBrian Behlendorf 	}
4247084fd14fSBrian Behlendorf 
42489eb57f7fSGeorge Wilson 	/*
42490713e232SGeorge Wilson 	 * Move the frees from the defer_tree back to the free
425086714001SSerapheim Dimitropoulos 	 * range tree (if it's loaded). Swap the freed_tree and
425186714001SSerapheim Dimitropoulos 	 * the defer_tree -- this is safe to do because we've
425286714001SSerapheim Dimitropoulos 	 * just emptied out the defer_tree.
42539eb57f7fSGeorge Wilson 	 */
42540713e232SGeorge Wilson 	range_tree_vacate(*defer_tree,
425586714001SSerapheim Dimitropoulos 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
42568363e80aSGeorge Wilson 	if (defer_allowed) {
425786714001SSerapheim Dimitropoulos 		range_tree_swap(&msp->ms_freed, defer_tree);
42588363e80aSGeorge Wilson 	} else {
425986714001SSerapheim Dimitropoulos 		range_tree_vacate(msp->ms_freed,
426086714001SSerapheim Dimitropoulos 		    msp->ms_loaded ? range_tree_add : NULL,
426186714001SSerapheim Dimitropoulos 		    msp->ms_allocatable);
42628363e80aSGeorge Wilson 	}
4263555d674dSSerapheim Dimitropoulos 
4264555d674dSSerapheim Dimitropoulos 	msp->ms_synced_length = space_map_length(msp->ms_sm);
4265fa9e4066Sahrens 
4266468c413aSTim Haley 	msp->ms_deferspace += defer_delta;
4267468c413aSTim Haley 	ASSERT3S(msp->ms_deferspace, >=, 0);
42680713e232SGeorge Wilson 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
4269468c413aSTim Haley 	if (msp->ms_deferspace != 0) {
4270468c413aSTim Haley 		/*
4271468c413aSTim Haley 		 * Keep syncing this metaslab until all deferred frees
4272468c413aSTim Haley 		 * are back in circulation.
4273468c413aSTim Haley 		 */
4274468c413aSTim Haley 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
4275468c413aSTim Haley 	}
4276555d674dSSerapheim Dimitropoulos 	metaslab_aux_histograms_update_done(msp, defer_allowed);
4277468c413aSTim Haley 
4278f78cdc34SPaul Dagnelie 	if (msp->ms_new) {
4279f78cdc34SPaul Dagnelie 		msp->ms_new = B_FALSE;
4280f78cdc34SPaul Dagnelie 		mutex_enter(&mg->mg_lock);
4281f78cdc34SPaul Dagnelie 		mg->mg_ms_ready++;
4282f78cdc34SPaul Dagnelie 		mutex_exit(&mg->mg_lock);
4283f78cdc34SPaul Dagnelie 	}
4284555d674dSSerapheim Dimitropoulos 
42858363e80aSGeorge Wilson 	/*
4286555d674dSSerapheim Dimitropoulos 	 * Re-sort metaslab within its group now that we've adjusted
4287555d674dSSerapheim Dimitropoulos 	 * its allocatable space.
42888363e80aSGeorge Wilson 	 */
4289555d674dSSerapheim Dimitropoulos 	metaslab_recalculate_weight_and_sort(msp);
42908363e80aSGeorge Wilson 
42918363e80aSGeorge Wilson 	/*
42928363e80aSGeorge Wilson 	 * If the metaslab is loaded and we've not tried to load or allocate
42938363e80aSGeorge Wilson 	 * from it in 'metaslab_unload_delay' txgs, then unload it.
42948363e80aSGeorge Wilson 	 */
42958363e80aSGeorge Wilson 	if (msp->ms_loaded &&
4296084fd14fSBrian Behlendorf 	    msp->ms_disabled == 0 &&
42978363e80aSGeorge Wilson 	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
4298084fd14fSBrian Behlendorf 
42990713e232SGeorge Wilson 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
43000713e232SGeorge Wilson 			VERIFY0(range_tree_space(
430186714001SSerapheim Dimitropoulos 			    msp->ms_allocating[(txg + t) & TXG_MASK]));
43020713e232SGeorge Wilson 		}
4303f78cdc34SPaul Dagnelie 		if (msp->ms_allocator != -1) {
4304f78cdc34SPaul Dagnelie 			metaslab_passivate(msp, msp->ms_weight &
4305f78cdc34SPaul Dagnelie 			    ~METASLAB_ACTIVE_MASK);
4306f78cdc34SPaul Dagnelie 		}
4307fa9e4066Sahrens 
43080713e232SGeorge Wilson 		if (!metaslab_debug_unload)
43090713e232SGeorge Wilson 			metaslab_unload(msp);
4310fa9e4066Sahrens 	}
4311fa9e4066Sahrens 
431286714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
431386714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_freeing));
431486714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_freed));
431586714001SSerapheim Dimitropoulos 	ASSERT0(range_tree_space(msp->ms_checkpointing));
4316af1d63abSPaul Dagnelie 	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
4317555d674dSSerapheim Dimitropoulos 	msp->ms_allocated_this_txg = 0;
4318ecc2d604Sbonwick 	mutex_exit(&msp->ms_lock);
4319fa9e4066Sahrens }
4320fa9e4066Sahrens 
432180eb36f2SGeorge Wilson void
432280eb36f2SGeorge Wilson metaslab_sync_reassess(metaslab_group_t *mg)
432380eb36f2SGeorge Wilson {
43245cabbc6bSPrashanth Sreenivasa 	spa_t *spa = mg->mg_class->mc_spa;
43255cabbc6bSPrashanth Sreenivasa 
43265cabbc6bSPrashanth Sreenivasa 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
432722e30981SGeorge Wilson 	metaslab_group_alloc_update(mg);
43282e4c9986SGeorge Wilson 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
432909c9d376SGeorge Wilson 
433080eb36f2SGeorge Wilson 	/*
43315cabbc6bSPrashanth Sreenivasa 	 * Preload the next potential metaslabs but only on active
43325cabbc6bSPrashanth Sreenivasa 	 * metaslab groups. We can get into a state where the metaslab
43335cabbc6bSPrashanth Sreenivasa 	 * is no longer active since we dirty metaslabs as we remove a
43345cabbc6bSPrashanth Sreenivasa 	 * device, thus potentially making the metaslab group eligible
43355cabbc6bSPrashanth Sreenivasa 	 * for preloading.
433680eb36f2SGeorge Wilson 	 */
43375cabbc6bSPrashanth Sreenivasa 	if (mg->mg_activation_count > 0) {
43385cabbc6bSPrashanth Sreenivasa 		metaslab_group_preload(mg);
43395cabbc6bSPrashanth Sreenivasa 	}
43405cabbc6bSPrashanth Sreenivasa 	spa_config_exit(spa, SCL_ALLOC, FTAG);
434180eb36f2SGeorge Wilson }
434280eb36f2SGeorge Wilson 
4343663207adSDon Brady /*
4344663207adSDon Brady  * When writing a ditto block (i.e. more than one DVA for a given BP) on
4345663207adSDon Brady  * the same vdev as an existing DVA of this BP, then try to allocate it
4346663207adSDon Brady  * on a different metaslab than existing DVAs (i.e. a unique metaslab).
4347663207adSDon Brady  */
4348663207adSDon Brady static boolean_t
4349663207adSDon Brady metaslab_is_unique(metaslab_t *msp, dva_t *dva)
435044cd46caSbillm {
4351663207adSDon Brady 	uint64_t dva_ms_id;
4352663207adSDon Brady 
4353663207adSDon Brady 	if (DVA_GET_ASIZE(dva) == 0)
4354663207adSDon Brady 		return (B_TRUE);
435544cd46caSbillm 
435644cd46caSbillm 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
4357663207adSDon Brady 		return (B_TRUE);
435844cd46caSbillm 
4359663207adSDon Brady 	dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4360663207adSDon Brady 
4361663207adSDon Brady 	return (msp->ms_id != dva_ms_id);
436244cd46caSbillm }
436344cd46caSbillm 
43648363e80aSGeorge Wilson /*
43658363e80aSGeorge Wilson  * ==========================================================================
43668363e80aSGeorge Wilson  * Metaslab allocation tracing facility
43678363e80aSGeorge Wilson  * ==========================================================================
43688363e80aSGeorge Wilson  */
43698363e80aSGeorge Wilson 
43708363e80aSGeorge Wilson /*
43718363e80aSGeorge Wilson  * Add an allocation trace element to the allocation tracing list.
43728363e80aSGeorge Wilson  */
43738363e80aSGeorge Wilson static void
43748363e80aSGeorge Wilson metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
4375f78cdc34SPaul Dagnelie     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
4376f78cdc34SPaul Dagnelie     int allocator)
43778363e80aSGeorge Wilson {
43788363e80aSGeorge Wilson 	if (!metaslab_trace_enabled)
43798363e80aSGeorge Wilson 		return;
43808363e80aSGeorge Wilson 
43818363e80aSGeorge Wilson 	/*
43828363e80aSGeorge Wilson 	 * When the tracing list reaches its maximum we remove
43838363e80aSGeorge Wilson 	 * the second element in the list before adding a new one.
43848363e80aSGeorge Wilson 	 * By removing the second element we preserve the original
43858363e80aSGeorge Wilson 	 * entry as a clue to what allocation steps have already been
43868363e80aSGeorge Wilson 	 * performed.
43878363e80aSGeorge Wilson 	 */
43888363e80aSGeorge Wilson 	if (zal->zal_size == metaslab_trace_max_entries) {
43898363e80aSGeorge Wilson 		metaslab_alloc_trace_t *mat_next;
43908363e80aSGeorge Wilson #ifdef DEBUG
43918363e80aSGeorge Wilson 		panic("too many entries in allocation list");
43928363e80aSGeorge Wilson #endif
43934d7988d6SPaul Dagnelie 		METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
43948363e80aSGeorge Wilson 		zal->zal_size--;
43958363e80aSGeorge Wilson 		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
43968363e80aSGeorge Wilson 		list_remove(&zal->zal_list, mat_next);
43978363e80aSGeorge Wilson 		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
43988363e80aSGeorge Wilson 	}
43998363e80aSGeorge Wilson 
44008363e80aSGeorge Wilson 	metaslab_alloc_trace_t *mat =
44018363e80aSGeorge Wilson 	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
44028363e80aSGeorge Wilson 	list_link_init(&mat->mat_list_node);
44038363e80aSGeorge Wilson 	mat->mat_mg = mg;
44048363e80aSGeorge Wilson 	mat->mat_msp = msp;
44058363e80aSGeorge Wilson 	mat->mat_size = psize;
44068363e80aSGeorge Wilson 	mat->mat_dva_id = dva_id;
44078363e80aSGeorge Wilson 	mat->mat_offset = offset;
44088363e80aSGeorge Wilson 	mat->mat_weight = 0;
4409f78cdc34SPaul Dagnelie 	mat->mat_allocator = allocator;
44108363e80aSGeorge Wilson 
44118363e80aSGeorge Wilson 	if (msp != NULL)
44128363e80aSGeorge Wilson 		mat->mat_weight = msp->ms_weight;
44138363e80aSGeorge Wilson 
44148363e80aSGeorge Wilson 	/*
44158363e80aSGeorge Wilson 	 * The list is part of the zio so locking is not required. Only
44168363e80aSGeorge Wilson 	 * a single thread will perform allocations for a given zio.
44178363e80aSGeorge Wilson 	 */
44188363e80aSGeorge Wilson 	list_insert_tail(&zal->zal_list, mat);
44198363e80aSGeorge Wilson 	zal->zal_size++;
44208363e80aSGeorge Wilson 
44218363e80aSGeorge Wilson 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
44228363e80aSGeorge Wilson }
44238363e80aSGeorge Wilson 
44248363e80aSGeorge Wilson void
44258363e80aSGeorge Wilson metaslab_trace_init(zio_alloc_list_t *zal)
44268363e80aSGeorge Wilson {
44278363e80aSGeorge Wilson 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
44288363e80aSGeorge Wilson 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
44298363e80aSGeorge Wilson 	zal->zal_size = 0;
44308363e80aSGeorge Wilson }
44318363e80aSGeorge Wilson 
44328363e80aSGeorge Wilson void
44338363e80aSGeorge Wilson metaslab_trace_fini(zio_alloc_list_t *zal)
44348363e80aSGeorge Wilson {
44358363e80aSGeorge Wilson 	metaslab_alloc_trace_t *mat;
44368363e80aSGeorge Wilson 
44378363e80aSGeorge Wilson 	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
44388363e80aSGeorge Wilson 		kmem_cache_free(metaslab_alloc_trace_cache, mat);
44398363e80aSGeorge Wilson 	list_destroy(&zal->zal_list);
44408363e80aSGeorge Wilson 	zal->zal_size = 0;
44418363e80aSGeorge Wilson }
44428363e80aSGeorge Wilson 
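/*
 * Illustrative sketch (not part of the build): a zio that performs
 * allocations owns a zio_alloc_list_t with the following lifecycle, e.g.:
 *
 *	zio_alloc_list_t zal;
 *
 *	metaslab_trace_init(&zal);
 *	...allocation attempts record entries via metaslab_trace_add()...
 *	metaslab_trace_fini(&zal);
 *
 * Since only a single thread performs allocations for a given zio, the
 * list needs no locking of its own (see metaslab_trace_add() above).
 */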
44430f7643c7SGeorge Wilson /*
44440f7643c7SGeorge Wilson  * ==========================================================================
44450f7643c7SGeorge Wilson  * Metaslab block operations
44460f7643c7SGeorge Wilson  * ==========================================================================
44470f7643c7SGeorge Wilson  */
44480f7643c7SGeorge Wilson 
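/*
 * Account for a pending async allocation against this metaslab group's
 * per-allocator queue depth. This is a no-op unless the allocation is
 * asynchronous, throttling is not explicitly disabled for it, and the
 * metaslab class has allocation throttling enabled.
 */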
44490f7643c7SGeorge Wilson static void
4450f78cdc34SPaul Dagnelie metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
4451f78cdc34SPaul Dagnelie     int allocator)
44520f7643c7SGeorge Wilson {
44530f7643c7SGeorge Wilson 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
4454f78cdc34SPaul Dagnelie 	    (flags & METASLAB_DONT_THROTTLE))
44550f7643c7SGeorge Wilson 		return;
44560f7643c7SGeorge Wilson 
44570f7643c7SGeorge Wilson 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
44580f7643c7SGeorge Wilson 	if (!mg->mg_class->mc_alloc_throttle_enabled)
44590f7643c7SGeorge Wilson 		return;
44600f7643c7SGeorge Wilson 
4461e914ace2STim Schumacher 	(void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
4462f78cdc34SPaul Dagnelie }
4463f78cdc34SPaul Dagnelie 
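/*
 * Raise this allocator's effective queue depth limit toward
 * mg_max_alloc_queue_depth using a lock-free compare-and-swap loop; each
 * successful step also raises the class-wide maximum slot count for this
 * allocator.
 */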
4464f78cdc34SPaul Dagnelie static void
4465f78cdc34SPaul Dagnelie metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
4466f78cdc34SPaul Dagnelie {
4467f78cdc34SPaul Dagnelie 	uint64_t max = mg->mg_max_alloc_queue_depth;
4468f78cdc34SPaul Dagnelie 	uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4469f78cdc34SPaul Dagnelie 	while (cur < max) {
4470f78cdc34SPaul Dagnelie 		if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
4471f78cdc34SPaul Dagnelie 		    cur, cur + 1) == cur) {
4472f78cdc34SPaul Dagnelie 			atomic_inc_64(
4473f78cdc34SPaul Dagnelie 			    &mg->mg_class->mc_alloc_max_slots[allocator]);
4474f78cdc34SPaul Dagnelie 			return;
4475f78cdc34SPaul Dagnelie 		}
4476f78cdc34SPaul Dagnelie 		cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4477f78cdc34SPaul Dagnelie 	}
44780f7643c7SGeorge Wilson }
44790f7643c7SGeorge Wilson 
44800f7643c7SGeorge Wilson void
4481f78cdc34SPaul Dagnelie metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
4482f78cdc34SPaul Dagnelie     int allocator, boolean_t io_complete)
44830f7643c7SGeorge Wilson {
44840f7643c7SGeorge Wilson 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
4485f78cdc34SPaul Dagnelie 	    (flags & METASLAB_DONT_THROTTLE))
44860f7643c7SGeorge Wilson 		return;
44870f7643c7SGeorge Wilson 
44880f7643c7SGeorge Wilson 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
44890f7643c7SGeorge Wilson 	if (!mg->mg_class->mc_alloc_throttle_enabled)
44900f7643c7SGeorge Wilson 		return;
44910f7643c7SGeorge Wilson 
4492e914ace2STim Schumacher 	(void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
4493f78cdc34SPaul Dagnelie 	if (io_complete)
4494f78cdc34SPaul Dagnelie 		metaslab_group_increment_qdepth(mg, allocator);
44950f7643c7SGeorge Wilson }
44960f7643c7SGeorge Wilson 
44970f7643c7SGeorge Wilson void
4498f78cdc34SPaul Dagnelie metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
4499f78cdc34SPaul Dagnelie     int allocator)
45000f7643c7SGeorge Wilson {
45010f7643c7SGeorge Wilson #ifdef ZFS_DEBUG
45020f7643c7SGeorge Wilson 	const dva_t *dva = bp->blk_dva;
45030f7643c7SGeorge Wilson 	int ndvas = BP_GET_NDVAS(bp);
45040f7643c7SGeorge Wilson 
45050f7643c7SGeorge Wilson 	for (int d = 0; d < ndvas; d++) {
45060f7643c7SGeorge Wilson 		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
45070f7643c7SGeorge Wilson 		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4508e914ace2STim Schumacher 		VERIFY(zfs_refcount_not_held(
4509e914ace2STim Schumacher 		    &mg->mg_alloc_queue_depth[allocator], tag));
45100f7643c7SGeorge Wilson 	}
45110f7643c7SGeorge Wilson #endif
45120f7643c7SGeorge Wilson }
45130f7643c7SGeorge Wilson 
451444cd46caSbillm static uint64_t
45158363e80aSGeorge Wilson metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
45168363e80aSGeorge Wilson {
45178363e80aSGeorge Wilson 	uint64_t start;
451886714001SSerapheim Dimitropoulos 	range_tree_t *rt = msp->ms_allocatable;
45198363e80aSGeorge Wilson 	metaslab_class_t *mc = msp->ms_group->mg_class;
45208363e80aSGeorge Wilson 
4521814dcd43SSerapheim Dimitropoulos 	ASSERT(MUTEX_HELD(&msp->ms_lock));
45228363e80aSGeorge Wilson 	VERIFY(!msp->ms_condensing);
4523084fd14fSBrian Behlendorf 	VERIFY0(msp->ms_disabled);
45248363e80aSGeorge Wilson 
45258363e80aSGeorge Wilson 	start = mc->mc_ops->msop_alloc(msp, size);
45268363e80aSGeorge Wilson 	if (start != -1ULL) {
45278363e80aSGeorge Wilson 		metaslab_group_t *mg = msp->ms_group;
45288363e80aSGeorge Wilson 		vdev_t *vd = mg->mg_vd;
45298363e80aSGeorge Wilson 
45308363e80aSGeorge Wilson 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
45318363e80aSGeorge Wilson 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
45328363e80aSGeorge Wilson 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
45338363e80aSGeorge Wilson 		range_tree_remove(rt, start, size);
4534084fd14fSBrian Behlendorf 		range_tree_clear(msp->ms_trim, start, size);
45358363e80aSGeorge Wilson 
453686714001SSerapheim Dimitropoulos 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
45378363e80aSGeorge Wilson 			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
45388363e80aSGeorge Wilson 
453986714001SSerapheim Dimitropoulos 		range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4540af1d63abSPaul Dagnelie 		msp->ms_allocating_total += size;
45418363e80aSGeorge Wilson 
45428363e80aSGeorge Wilson 		/* Track the last successful allocation */
45438363e80aSGeorge Wilson 		msp->ms_alloc_txg = txg;
45448363e80aSGeorge Wilson 		metaslab_verify_space(msp, txg);
45458363e80aSGeorge Wilson 	}
45468363e80aSGeorge Wilson 
45478363e80aSGeorge Wilson 	/*
45488363e80aSGeorge Wilson 	 * Now that we've attempted the allocation we need to update the
45498363e80aSGeorge Wilson 	 * metaslab's maximum block size since it may have changed.
45508363e80aSGeorge Wilson 	 */
4551af1d63abSPaul Dagnelie 	msp->ms_max_size = metaslab_largest_allocatable(msp);
45528363e80aSGeorge Wilson 	return (start);
45538363e80aSGeorge Wilson }
45548363e80aSGeorge Wilson 
4555f78cdc34SPaul Dagnelie /*
4556f78cdc34SPaul Dagnelie  * Find the metaslab with the highest weight that is less than what we've
4557f78cdc34SPaul Dagnelie  * already tried.  In the common case, this means that we will examine each
4558f78cdc34SPaul Dagnelie  * metaslab at most once. Note that concurrent callers could reorder metaslabs
4559f78cdc34SPaul Dagnelie  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4560f78cdc34SPaul Dagnelie  * activated by another thread, and we fail to allocate from the metaslab we
4561f78cdc34SPaul Dagnelie  * have selected, we may not try the newly-activated metaslab, and instead
4562f78cdc34SPaul Dagnelie  * activate another metaslab.  This is not optimal, but generally does not cause
4563f78cdc34SPaul Dagnelie  * any problems (a possible exception being if every metaslab is completely full
4564f78cdc34SPaul Dagnelie  * except for the newly-activated metaslab which we fail to examine).
4565f78cdc34SPaul Dagnelie  */
4566f78cdc34SPaul Dagnelie static metaslab_t *
4567f78cdc34SPaul Dagnelie find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
4568663207adSDon Brady     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
4569af1d63abSPaul Dagnelie     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4570af1d63abSPaul Dagnelie     boolean_t *was_active)
4571f78cdc34SPaul Dagnelie {
4572f78cdc34SPaul Dagnelie 	avl_index_t idx;
4573f78cdc34SPaul Dagnelie 	avl_tree_t *t = &mg->mg_metaslab_tree;
4574f78cdc34SPaul Dagnelie 	metaslab_t *msp = avl_find(t, search, &idx);
4575f78cdc34SPaul Dagnelie 	if (msp == NULL)
4576f78cdc34SPaul Dagnelie 		msp = avl_nearest(t, idx, AVL_AFTER);
4577f78cdc34SPaul Dagnelie 
4578f78cdc34SPaul Dagnelie 	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4579f78cdc34SPaul Dagnelie 		int i;
4580af1d63abSPaul Dagnelie 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
4581f78cdc34SPaul Dagnelie 			metaslab_trace_add(zal, mg, msp, asize, d,
4582f78cdc34SPaul Dagnelie 			    TRACE_TOO_SMALL, allocator);
4583f78cdc34SPaul Dagnelie 			continue;
4584f78cdc34SPaul Dagnelie 		}
4585f78cdc34SPaul Dagnelie 
4586f78cdc34SPaul Dagnelie 		/*
4587084fd14fSBrian Behlendorf 		 * If the selected metaslab is condensing or disabled,
4588084fd14fSBrian Behlendorf 		 * skip it.
4589f78cdc34SPaul Dagnelie 		 */
4590084fd14fSBrian Behlendorf 		if (msp->ms_condensing || msp->ms_disabled > 0)
4591f78cdc34SPaul Dagnelie 			continue;
4592f78cdc34SPaul Dagnelie 
4593f78cdc34SPaul Dagnelie 		*was_active = msp->ms_allocator != -1;
4594f78cdc34SPaul Dagnelie 		/*
4595f78cdc34SPaul Dagnelie 		 * If we're activating as primary, this is our first allocation
4596f78cdc34SPaul Dagnelie 		 * from this disk, so we don't need to check how close we are.
4597f78cdc34SPaul Dagnelie 		 * If the metaslab under consideration was already active,
4598f78cdc34SPaul Dagnelie 		 * we're getting desperate enough to steal another allocator's
4599f78cdc34SPaul Dagnelie 		 * metaslab, so we still don't care about distances.
4600f78cdc34SPaul Dagnelie 		 */
4601f78cdc34SPaul Dagnelie 		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
4602f78cdc34SPaul Dagnelie 			break;
4603f78cdc34SPaul Dagnelie 
4604f78cdc34SPaul Dagnelie 		for (i = 0; i < d; i++) {
4605663207adSDon Brady 			if (want_unique &&
4606663207adSDon Brady 			    !metaslab_is_unique(msp, &dva[i]))
4607663207adSDon Brady 				break;  /* try another metaslab */
4608f78cdc34SPaul Dagnelie 		}
4609f78cdc34SPaul Dagnelie 		if (i == d)
4610f78cdc34SPaul Dagnelie 			break;
4611f78cdc34SPaul Dagnelie 	}
4612f78cdc34SPaul Dagnelie 
4613f78cdc34SPaul Dagnelie 	if (msp != NULL) {
4614f78cdc34SPaul Dagnelie 		search->ms_weight = msp->ms_weight;
4615f78cdc34SPaul Dagnelie 		search->ms_start = msp->ms_start + 1;
4616f78cdc34SPaul Dagnelie 		search->ms_allocator = msp->ms_allocator;
4617f78cdc34SPaul Dagnelie 		search->ms_primary = msp->ms_primary;
4618f78cdc34SPaul Dagnelie 	}
4619f78cdc34SPaul Dagnelie 	return (msp);
4620f78cdc34SPaul Dagnelie }
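/*
 * Note on the sentinel update above: search->ms_start is set to
 * msp->ms_start + 1 so that, under the tree's ordering (presumably weight
 * first, then start), the next avl_find()/avl_nearest() lands strictly
 * after the metaslab just returned rather than on it again.
 */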
4621f78cdc34SPaul Dagnelie 
4622af1d63abSPaul Dagnelie void
4623af1d63abSPaul Dagnelie metaslab_active_mask_verify(metaslab_t *msp)
4624af1d63abSPaul Dagnelie {
4625af1d63abSPaul Dagnelie 	ASSERT(MUTEX_HELD(&msp->ms_lock));
4626af1d63abSPaul Dagnelie 
4627af1d63abSPaul Dagnelie 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
4628af1d63abSPaul Dagnelie 		return;
4629af1d63abSPaul Dagnelie 
4630af1d63abSPaul Dagnelie 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4631af1d63abSPaul Dagnelie 		return;
4632af1d63abSPaul Dagnelie 
4633af1d63abSPaul Dagnelie 	if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4634af1d63abSPaul Dagnelie 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4635af1d63abSPaul Dagnelie 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4636af1d63abSPaul Dagnelie 		VERIFY3S(msp->ms_allocator, !=, -1);
4637af1d63abSPaul Dagnelie 		VERIFY(msp->ms_primary);
4638af1d63abSPaul Dagnelie 		return;
4639af1d63abSPaul Dagnelie 	}
4640af1d63abSPaul Dagnelie 
4641af1d63abSPaul Dagnelie 	if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4642af1d63abSPaul Dagnelie 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4643af1d63abSPaul Dagnelie 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4644af1d63abSPaul Dagnelie 		VERIFY3S(msp->ms_allocator, !=, -1);
4645af1d63abSPaul Dagnelie 		VERIFY(!msp->ms_primary);
4646af1d63abSPaul Dagnelie 		return;
4647af1d63abSPaul Dagnelie 	}
4648af1d63abSPaul Dagnelie 
4649af1d63abSPaul Dagnelie 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4650af1d63abSPaul Dagnelie 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4651af1d63abSPaul Dagnelie 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4652af1d63abSPaul Dagnelie 		VERIFY3S(msp->ms_allocator, ==, -1);
4653af1d63abSPaul Dagnelie 		return;
4654af1d63abSPaul Dagnelie 	}
4655af1d63abSPaul Dagnelie }
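/*
 * Restating the invariants verified above in table form (the three weight
 * flags are mutually exclusive):
 *
 *	weight flag			ms_allocator	ms_primary
 *	METASLAB_WEIGHT_PRIMARY		!= -1		B_TRUE
 *	METASLAB_WEIGHT_SECONDARY	!= -1		B_FALSE
 *	METASLAB_WEIGHT_CLAIM		== -1		(not checked)
 */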
4656af1d63abSPaul Dagnelie 
4657f78cdc34SPaul Dagnelie /* ARGSUSED */
46588363e80aSGeorge Wilson static uint64_t
46598363e80aSGeorge Wilson metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
4660af1d63abSPaul Dagnelie     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4661af1d63abSPaul Dagnelie     int allocator, boolean_t try_hard)
4662fa9e4066Sahrens {
4663ecc2d604Sbonwick 	metaslab_t *msp = NULL;
4664ecc2d604Sbonwick 	uint64_t offset = -1ULL;
466544cd46caSbillm 
4666af1d63abSPaul Dagnelie 	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
4667f78cdc34SPaul Dagnelie 	for (int i = 0; i < d; i++) {
4668f78cdc34SPaul Dagnelie 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4669f78cdc34SPaul Dagnelie 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
467044cd46caSbillm 			activation_weight = METASLAB_WEIGHT_SECONDARY;
4671f78cdc34SPaul Dagnelie 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4672f78cdc34SPaul Dagnelie 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4673b86e7e3fSAlexander Motin 			activation_weight = METASLAB_WEIGHT_CLAIM;
4674d6e555bdSGeorge Wilson 			break;
4675d6e555bdSGeorge Wilson 		}
4676d6e555bdSGeorge Wilson 	}
4677fa9e4066Sahrens 
4678f78cdc34SPaul Dagnelie 	/*
4679f78cdc34SPaul Dagnelie 	 * If we don't have enough metaslabs active to fill the entire array, we
4680f78cdc34SPaul Dagnelie 	 * just use the 0th slot.
4681f78cdc34SPaul Dagnelie 	 */
4682b86e7e3fSAlexander Motin 	if (mg->mg_ms_ready < mg->mg_allocators * 3)
4683f78cdc34SPaul Dagnelie 		allocator = 0;
4684f78cdc34SPaul Dagnelie 
4685f78cdc34SPaul Dagnelie 	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
4686f78cdc34SPaul Dagnelie 
46878363e80aSGeorge Wilson 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
46888363e80aSGeorge Wilson 	search->ms_weight = UINT64_MAX;
46898363e80aSGeorge Wilson 	search->ms_start = 0;
4690f78cdc34SPaul Dagnelie 	/*
4691f78cdc34SPaul Dagnelie 	 * At the end of the metaslab tree are the already-active metaslabs,
4692f78cdc34SPaul Dagnelie 	 * first the primaries, then the secondaries. When we resume searching
4693f78cdc34SPaul Dagnelie 	 * through the tree, we need to consider ms_allocator and ms_primary so
4694f78cdc34SPaul Dagnelie 	 * we start in the location right after where we left off, and don't
4695f78cdc34SPaul Dagnelie 	 * accidentally loop forever considering the same metaslabs.
4696f78cdc34SPaul Dagnelie 	 */
4697f78cdc34SPaul Dagnelie 	search->ms_allocator = -1;
4698f78cdc34SPaul Dagnelie 	search->ms_primary = B_TRUE;
4699ecc2d604Sbonwick 	for (;;) {
4700f78cdc34SPaul Dagnelie 		boolean_t was_active = B_FALSE;
4701d6e555bdSGeorge Wilson 
4702ecc2d604Sbonwick 		mutex_enter(&mg->mg_lock);
47038363e80aSGeorge Wilson 
4704f78cdc34SPaul Dagnelie 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4705f78cdc34SPaul Dagnelie 		    mg->mg_primaries[allocator] != NULL) {
4706f78cdc34SPaul Dagnelie 			msp = mg->mg_primaries[allocator];
4707af1d63abSPaul Dagnelie 
4708af1d63abSPaul Dagnelie 			/*
4709af1d63abSPaul Dagnelie 			 * Even though we don't hold the ms_lock for the
4710af1d63abSPaul Dagnelie 			 * primary metaslab, those fields should not
4711af1d63abSPaul Dagnelie 			 * change while we hold the mg_lock. Thus it is
4712af1d63abSPaul Dagnelie 			 * safe to make assertions on them.
4713af1d63abSPaul Dagnelie 			 */
4714af1d63abSPaul Dagnelie 			ASSERT(msp->ms_primary);
4715af1d63abSPaul Dagnelie 			ASSERT3S(msp->ms_allocator, ==, allocator);
4716af1d63abSPaul Dagnelie 			ASSERT(msp->ms_loaded);
4717af1d63abSPaul Dagnelie 
4718f78cdc34SPaul Dagnelie 			was_active = B_TRUE;
4719af1d63abSPaul Dagnelie 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4720f78cdc34SPaul Dagnelie 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4721b86e7e3fSAlexander Motin 		    mg->mg_secondaries[allocator] != NULL) {
4722f78cdc34SPaul Dagnelie 			msp = mg->mg_secondaries[allocator];
4723af1d63abSPaul Dagnelie 
4724af1d63abSPaul Dagnelie 			/*
4725af1d63abSPaul Dagnelie 			 * See comment above about the similar assertions
4726af1d63abSPaul Dagnelie 			 * for the primary metaslab.
4727af1d63abSPaul Dagnelie 			 */
4728af1d63abSPaul Dagnelie 			ASSERT(!msp->ms_primary);
4729af1d63abSPaul Dagnelie 			ASSERT3S(msp->ms_allocator, ==, allocator);
4730af1d63abSPaul Dagnelie 			ASSERT(msp->ms_loaded);
4731af1d63abSPaul Dagnelie 
4732f78cdc34SPaul Dagnelie 			was_active = B_TRUE;
4733af1d63abSPaul Dagnelie 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4734f78cdc34SPaul Dagnelie 		} else {
4735f78cdc34SPaul Dagnelie 			msp = find_valid_metaslab(mg, activation_weight, dva, d,
4736af1d63abSPaul Dagnelie 			    want_unique, asize, allocator, try_hard, zal,
4737af1d63abSPaul Dagnelie 			    search, &was_active);
4738ecc2d604Sbonwick 		}
4739f78cdc34SPaul Dagnelie 
4740ecc2d604Sbonwick 		mutex_exit(&mg->mg_lock);
47418363e80aSGeorge Wilson 		if (msp == NULL) {
47428363e80aSGeorge Wilson 			kmem_free(search, sizeof (*search));
474344cd46caSbillm 			return (-1ULL);
47448363e80aSGeorge Wilson 		}
474522e30981SGeorge Wilson 		mutex_enter(&msp->ms_lock);
4746af1d63abSPaul Dagnelie 
4747af1d63abSPaul Dagnelie 		metaslab_active_mask_verify(msp);
4748af1d63abSPaul Dagnelie 
4749af1d63abSPaul Dagnelie 		/*
4750af1d63abSPaul Dagnelie 		 * This code is disabled because of issues with
4751af1d63abSPaul Dagnelie 		 * tracepoints in non-GPL kernel modules.
4752af1d63abSPaul Dagnelie 		 */
4753af1d63abSPaul Dagnelie #if 0
4754af1d63abSPaul Dagnelie 		DTRACE_PROBE3(ms__activation__attempt,
4755af1d63abSPaul Dagnelie 		    metaslab_t *, msp, uint64_t, activation_weight,
4756af1d63abSPaul Dagnelie 		    boolean_t, was_active);
4757af1d63abSPaul Dagnelie #endif
4758af1d63abSPaul Dagnelie 
4759aeb1c1b6Sgw 		/*
4760aeb1c1b6Sgw 		 * Ensure that the metaslab we have selected is still
4761aeb1c1b6Sgw 		 * capable of handling our request. It's possible that
4762aeb1c1b6Sgw 		 * another thread may have changed the weight while we
47638363e80aSGeorge Wilson 		 * were blocked on the metaslab lock. We check the
4764af1d63abSPaul Dagnelie 		 * active status first to see if we need to select
47658363e80aSGeorge Wilson 		 * a new metaslab.
4766aeb1c1b6Sgw 		 */
47678363e80aSGeorge Wilson 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
4768af1d63abSPaul Dagnelie 			ASSERT3S(msp->ms_allocator, ==, -1);
4769aeb1c1b6Sgw 			mutex_exit(&msp->ms_lock);
4770aeb1c1b6Sgw 			continue;
4771aeb1c1b6Sgw 		}
4772aeb1c1b6Sgw 
4773f78cdc34SPaul Dagnelie 		/*
4774af1d63abSPaul Dagnelie 		 * If the metaslab was activated for another allocator
4775af1d63abSPaul Dagnelie 		 * while we were waiting on the ms_lock above, or it's
4776af1d63abSPaul Dagnelie 		 * a primary and we're seeking a secondary (or vice versa),
4777af1d63abSPaul Dagnelie 		 * we go back and select a new metaslab.
4778f78cdc34SPaul Dagnelie 		 */
4779f78cdc34SPaul Dagnelie 		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
4780f78cdc34SPaul Dagnelie 		    (msp->ms_allocator != -1) &&
4781f78cdc34SPaul Dagnelie 		    (msp->ms_allocator != allocator || ((activation_weight ==
4782f78cdc34SPaul Dagnelie 		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
4783af1d63abSPaul Dagnelie 			ASSERT(msp->ms_loaded);
4784af1d63abSPaul Dagnelie 			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
4785af1d63abSPaul Dagnelie 			    msp->ms_allocator != -1);
4786f78cdc34SPaul Dagnelie 			mutex_exit(&msp->ms_lock);
4787f78cdc34SPaul Dagnelie 			continue;
4788f78cdc34SPaul Dagnelie 		}
4789f78cdc34SPaul Dagnelie 
4790af1d63abSPaul Dagnelie 		/*
4791af1d63abSPaul Dagnelie 		 * This metaslab was used for claiming regions allocated
4792af1d63abSPaul Dagnelie 		 * by the ZIL during pool import. Once these regions are
4793af1d63abSPaul Dagnelie 		 * claimed we don't need to keep the CLAIM bit set
4794af1d63abSPaul Dagnelie 		 * anymore. Passivate this metaslab to zero its activation
4795af1d63abSPaul Dagnelie 		 * mask.
4796af1d63abSPaul Dagnelie 		 */
4797b86e7e3fSAlexander Motin 		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
4798b86e7e3fSAlexander Motin 		    activation_weight != METASLAB_WEIGHT_CLAIM) {
4799af1d63abSPaul Dagnelie 			ASSERT(msp->ms_loaded);
4800af1d63abSPaul Dagnelie 			ASSERT3S(msp->ms_allocator, ==, -1);
4801f78cdc34SPaul Dagnelie 			metaslab_passivate(msp, msp->ms_weight &
4802f78cdc34SPaul Dagnelie 			    ~METASLAB_WEIGHT_CLAIM);
480344cd46caSbillm 			mutex_exit(&msp->ms_lock);
480444cd46caSbillm 			continue;
480544cd46caSbillm 		}
480644cd46caSbillm 
4807af1d63abSPaul Dagnelie 		metaslab_set_selected_txg(msp, txg);
4808af1d63abSPaul Dagnelie 
4809af1d63abSPaul Dagnelie 		int activation_error =
4810af1d63abSPaul Dagnelie 		    metaslab_activate(msp, allocator, activation_weight);
4811af1d63abSPaul Dagnelie 		metaslab_active_mask_verify(msp);
4812af1d63abSPaul Dagnelie 
4813af1d63abSPaul Dagnelie 		/*
4814af1d63abSPaul Dagnelie 		 * If the metaslab was activated by another thread for
4815af1d63abSPaul Dagnelie 		 * another allocator or activation_weight (EBUSY), or it
4816af1d63abSPaul Dagnelie 		 * failed because another metaslab was assigned as primary
4817af1d63abSPaul Dagnelie 		 * for this allocator (EEXIST), we continue using this
4818af1d63abSPaul Dagnelie 		 * metaslab for our allocation, rather than going on to a
4819af1d63abSPaul Dagnelie 		 * worse metaslab (we waited for that metaslab to be loaded
4820af1d63abSPaul Dagnelie 		 * after all).
4821af1d63abSPaul Dagnelie 		 *
4822af1d63abSPaul Dagnelie 		 * If the activation failed due to an I/O error or ENOSPC we
4823af1d63abSPaul Dagnelie 		 * skip to the next metaslab.
4824af1d63abSPaul Dagnelie 		 */
4825af1d63abSPaul Dagnelie 		boolean_t activated;
4826af1d63abSPaul Dagnelie 		if (activation_error == 0) {
4827af1d63abSPaul Dagnelie 			activated = B_TRUE;
4828af1d63abSPaul Dagnelie 		} else if (activation_error == EBUSY ||
4829af1d63abSPaul Dagnelie 		    activation_error == EEXIST) {
4830af1d63abSPaul Dagnelie 			activated = B_FALSE;
4831af1d63abSPaul Dagnelie 		} else {
4832fa9e4066Sahrens 			mutex_exit(&msp->ms_lock);
4833fa9e4066Sahrens 			continue;
4834fa9e4066Sahrens 		}
4835af1d63abSPaul Dagnelie 		ASSERT(msp->ms_loaded);
48368363e80aSGeorge Wilson 
48378363e80aSGeorge Wilson 		/*
48388363e80aSGeorge Wilson 		 * Now that we have the lock, recheck to see if we should
48398363e80aSGeorge Wilson 		 * continue to use this metaslab for this allocation. The
4840af1d63abSPaul Dagnelie 		 * metaslab is now loaded so metaslab_should_allocate()
4841af1d63abSPaul Dagnelie 		 * can accurately determine if the allocation attempt should
48428363e80aSGeorge Wilson 		 * proceed.
48438363e80aSGeorge Wilson 		 */
4844af1d63abSPaul Dagnelie 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
48458363e80aSGeorge Wilson 			/* Passivate this metaslab and select a new one. */
48468363e80aSGeorge Wilson 			metaslab_trace_add(zal, mg, msp, asize, d,
4847f78cdc34SPaul Dagnelie 			    TRACE_TOO_SMALL, allocator);
48488363e80aSGeorge Wilson 			goto next;
48498363e80aSGeorge Wilson 		}
4850ecc2d604Sbonwick 
485103f8c366SGeorge Wilson 		/*
4852af1d63abSPaul Dagnelie 		 * If this metaslab is currently condensing then pick again
4853af1d63abSPaul Dagnelie 		 * as we can't manipulate this metaslab until it's committed
4854094e47e9SGeorge Wilson 		 * to disk. If this metaslab is being initialized, we shouldn't
4855094e47e9SGeorge Wilson 		 * allocate from it since the allocated region might be
4856094e47e9SGeorge Wilson 		 * overwritten after allocation.
485703f8c366SGeorge Wilson 		 */
48580713e232SGeorge Wilson 		if (msp->ms_condensing) {
48598363e80aSGeorge Wilson 			metaslab_trace_add(zal, mg, msp, asize, d,
4860f78cdc34SPaul Dagnelie 			    TRACE_CONDENSING, allocator);
4861af1d63abSPaul Dagnelie 			if (activated) {
4862af1d63abSPaul Dagnelie 				metaslab_passivate(msp, msp->ms_weight &
4863af1d63abSPaul Dagnelie 				    ~METASLAB_ACTIVE_MASK);
4864af1d63abSPaul Dagnelie 			}
486503f8c366SGeorge Wilson 			mutex_exit(&msp->ms_lock);
486603f8c366SGeorge Wilson 			continue;
4867084fd14fSBrian Behlendorf 		} else if (msp->ms_disabled > 0) {
4868094e47e9SGeorge Wilson 			metaslab_trace_add(zal, mg, msp, asize, d,
4869084fd14fSBrian Behlendorf 			    TRACE_DISABLED, allocator);
4870af1d63abSPaul Dagnelie 			if (activated) {
4871af1d63abSPaul Dagnelie 				metaslab_passivate(msp, msp->ms_weight &
4872af1d63abSPaul Dagnelie 				    ~METASLAB_ACTIVE_MASK);
4873af1d63abSPaul Dagnelie 			}
4874094e47e9SGeorge Wilson 			mutex_exit(&msp->ms_lock);
4875094e47e9SGeorge Wilson 			continue;
487603f8c366SGeorge Wilson 		}
487703f8c366SGeorge Wilson 
48788363e80aSGeorge Wilson 		offset = metaslab_block_alloc(msp, asize, txg);
4879f78cdc34SPaul Dagnelie 		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
48808363e80aSGeorge Wilson 
48818363e80aSGeorge Wilson 		if (offset != -1ULL) {
48828363e80aSGeorge Wilson 			/* Proactively passivate the metaslab, if needed */
4883af1d63abSPaul Dagnelie 			if (activated)
4884af1d63abSPaul Dagnelie 				metaslab_segment_may_passivate(msp);
4885ecc2d604Sbonwick 			break;
48868363e80aSGeorge Wilson 		}
48878363e80aSGeorge Wilson next:
48888363e80aSGeorge Wilson 		ASSERT(msp->ms_loaded);
48898363e80aSGeorge Wilson 
4890af1d63abSPaul Dagnelie 		/*
4891af1d63abSPaul Dagnelie 		 * This code is disabled because of issues with
4892af1d63abSPaul Dagnelie 		 * tracepoints in non-GPL kernel modules.
4893af1d63abSPaul Dagnelie 		 */
4894af1d63abSPaul Dagnelie #if 0
4895af1d63abSPaul Dagnelie 		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
4896af1d63abSPaul Dagnelie 		    uint64_t, asize);
4897af1d63abSPaul Dagnelie #endif
4898af1d63abSPaul Dagnelie 
48998363e80aSGeorge Wilson 		/*
49008363e80aSGeorge Wilson 		 * We were unable to allocate from this metaslab so determine
49018363e80aSGeorge Wilson 		 * a new weight for this metaslab. Now that we have loaded
49028363e80aSGeorge Wilson 		 * the metaslab we can provide a better hint to the metaslab
49038363e80aSGeorge Wilson 		 * selector.
49048363e80aSGeorge Wilson 		 *
49058363e80aSGeorge Wilson 		 * For space-based metaslabs, we use the maximum block size.
49068363e80aSGeorge Wilson 		 * This information is only available when the metaslab
49078363e80aSGeorge Wilson 		 * is loaded and is more accurate than the generic free
49088363e80aSGeorge Wilson 		 * space weight that was calculated by metaslab_weight().
49098363e80aSGeorge Wilson 		 * This information allows us to quickly compare the maximum
49108363e80aSGeorge Wilson 		 * available allocation in the metaslab to the allocation
49118363e80aSGeorge Wilson 		 * size being requested.
49128363e80aSGeorge Wilson 		 *
49138363e80aSGeorge Wilson 		 * For segment-based metaslabs, determine the new weight
49148363e80aSGeorge Wilson 		 * based on the highest bucket in the range tree. We
49158363e80aSGeorge Wilson 		 * explicitly use the loaded segment weight (i.e. the range
49168363e80aSGeorge Wilson 		 * tree histogram) since it contains the space that is
49178363e80aSGeorge Wilson 		 * currently available for allocation and is accurate
49188363e80aSGeorge Wilson 		 * even within a sync pass.
49198363e80aSGeorge Wilson 		 */
4920af1d63abSPaul Dagnelie 		uint64_t weight;
49218363e80aSGeorge Wilson 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
4922af1d63abSPaul Dagnelie 			weight = metaslab_largest_allocatable(msp);
49238363e80aSGeorge Wilson 			WEIGHT_SET_SPACEBASED(weight);
4924af1d63abSPaul Dagnelie 		} else {
4925af1d63abSPaul Dagnelie 			weight = metaslab_weight_from_range_tree(msp);
4926af1d63abSPaul Dagnelie 		}
4927af1d63abSPaul Dagnelie 
4928af1d63abSPaul Dagnelie 		if (activated) {
49298363e80aSGeorge Wilson 			metaslab_passivate(msp, weight);
49308363e80aSGeorge Wilson 		} else {
4931af1d63abSPaul Dagnelie 			/*
4932af1d63abSPaul Dagnelie 			 * For the case where we use the metaslab that is
4933af1d63abSPaul Dagnelie 			 * active for another allocator we want to make
4934af1d63abSPaul Dagnelie 			 * sure that we retain the activation mask.
4935af1d63abSPaul Dagnelie 			 *
4936af1d63abSPaul Dagnelie 			 * Note that we could attempt to use something like
4937af1d63abSPaul Dagnelie 			 * metaslab_recalculate_weight_and_sort() that
4938af1d63abSPaul Dagnelie 			 * retains the activation mask here. That function
4939af1d63abSPaul Dagnelie 			 * uses metaslab_weight() to set the weight, though,
4940af1d63abSPaul Dagnelie 			 * which is not as accurate as the calculations
4941af1d63abSPaul Dagnelie 			 * above.
4942af1d63abSPaul Dagnelie 			 */
4943af1d63abSPaul Dagnelie 			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
4944af1d63abSPaul Dagnelie 			metaslab_group_sort(mg, msp, weight);
49458363e80aSGeorge Wilson 		}
4946af1d63abSPaul Dagnelie 		metaslab_active_mask_verify(msp);
4947ecc2d604Sbonwick 
49488363e80aSGeorge Wilson 		/*
49498363e80aSGeorge Wilson 		 * We have just failed an allocation attempt, check
49508363e80aSGeorge Wilson 		 * that metaslab_should_allocate() agrees. Otherwise,
49518363e80aSGeorge Wilson 		 * we may end up in an infinite loop retrying the same
49528363e80aSGeorge Wilson 		 * metaslab.
49538363e80aSGeorge Wilson 		 */
4954af1d63abSPaul Dagnelie 		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
4955663207adSDon Brady 
4956fa9e4066Sahrens 		mutex_exit(&msp->ms_lock);
4957fa9e4066Sahrens 	}
49588363e80aSGeorge Wilson 	mutex_exit(&msp->ms_lock);
49598363e80aSGeorge Wilson 	kmem_free(search, sizeof (*search));
49608363e80aSGeorge Wilson 	return (offset);
49618363e80aSGeorge Wilson }
4962fa9e4066Sahrens 
49638363e80aSGeorge Wilson static uint64_t
49648363e80aSGeorge Wilson metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
4965af1d63abSPaul Dagnelie     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4966af1d63abSPaul Dagnelie     int allocator, boolean_t try_hard)
49678363e80aSGeorge Wilson {
49688363e80aSGeorge Wilson 	uint64_t offset;
49698363e80aSGeorge Wilson 	ASSERT(mg->mg_initialized);
4970ecc2d604Sbonwick 
4971663207adSDon Brady 	offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
4972af1d63abSPaul Dagnelie 	    dva, d, allocator, try_hard);
4973ecc2d604Sbonwick 
49748363e80aSGeorge Wilson 	mutex_enter(&mg->mg_lock);
49758363e80aSGeorge Wilson 	if (offset == -1ULL) {
49768363e80aSGeorge Wilson 		mg->mg_failed_allocations++;
49778363e80aSGeorge Wilson 		metaslab_trace_add(zal, mg, NULL, asize, d,
4978f78cdc34SPaul Dagnelie 		    TRACE_GROUP_FAILURE, allocator);
49798363e80aSGeorge Wilson 		if (asize == SPA_GANGBLOCKSIZE) {
49808363e80aSGeorge Wilson 			/*
49818363e80aSGeorge Wilson 			 * This metaslab group was unable to allocate
49828363e80aSGeorge Wilson 			 * the minimum gang block size so it must be out of
49838363e80aSGeorge Wilson 			 * space. We must notify the allocation throttle
49848363e80aSGeorge Wilson 			 * to start skipping allocation attempts to this
49858363e80aSGeorge Wilson 			 * metaslab group until more space becomes available.
49868363e80aSGeorge Wilson 			 * Note: this failure cannot be caused by the
49878363e80aSGeorge Wilson 			 * allocation throttle since the allocation throttle
49888363e80aSGeorge Wilson 			 * is only responsible for skipping devices and
49898363e80aSGeorge Wilson 			 * not failing block allocations.
49908363e80aSGeorge Wilson 			 */
49918363e80aSGeorge Wilson 			mg->mg_no_free_space = B_TRUE;
49928363e80aSGeorge Wilson 		}
49938363e80aSGeorge Wilson 	}
49948363e80aSGeorge Wilson 	mg->mg_allocations++;
49958363e80aSGeorge Wilson 	mutex_exit(&mg->mg_lock);
499644cd46caSbillm 	return (offset);
4997fa9e4066Sahrens }
4998fa9e4066Sahrens 
4999fa9e4066Sahrens /*
5000fa9e4066Sahrens  * Allocate a block for the specified i/o.
5001fa9e4066Sahrens  */
50025cabbc6bSPrashanth Sreenivasa int
50038654d025Sperrin metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
50048363e80aSGeorge Wilson     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
5005f78cdc34SPaul Dagnelie     zio_alloc_list_t *zal, int allocator)
5006fa9e4066Sahrens {
5007fa9e4066Sahrens 	metaslab_group_t *mg, *rotor;
5008fa9e4066Sahrens 	vdev_t *vd;
50098363e80aSGeorge Wilson 	boolean_t try_hard = B_FALSE;
5010fa9e4066Sahrens 
5011d80c45e0Sbonwick 	ASSERT(!DVA_IS_VALID(&dva[d]));
5012d80c45e0Sbonwick 
5013e05725b1Sbonwick 	/*
5014e05725b1Sbonwick 	 * For testing, make some blocks above a certain size be gang blocks.
5015663207adSDon Brady 	 * This will also test spilling from special to normal.
5016e05725b1Sbonwick 	 */
5017243952c7SMatt Ahrens 	if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
5018f78cdc34SPaul Dagnelie 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
5019f78cdc34SPaul Dagnelie 		    allocator);
5020be6fd75aSMatthew Ahrens 		return (SET_ERROR(ENOSPC));
50218363e80aSGeorge Wilson 	}
5022e05725b1Sbonwick 
5023fa9e4066Sahrens 	/*
5024fa9e4066Sahrens 	 * Start at the rotor and loop through all mgs until we find something.
5025b24ab676SJeff Bonwick 	 * Note that there's no locking on mc_rotor or mc_aliquot because
5026fa9e4066Sahrens 	 * nothing actually breaks if we miss a few updates -- we just won't
5027fa9e4066Sahrens 	 * allocate quite as evenly.  It all balances out over time.
502844cd46caSbillm 	 *
502967bd71c6Sperrin 	 * If we are doing ditto or log blocks, try to spread them across
503067bd71c6Sperrin 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
503167bd71c6Sperrin 	 * allocated all of our ditto blocks, then try and spread them out on
503267bd71c6Sperrin 	 * that vdev as much as possible.  If it turns out to not be possible,
503344cd46caSbillm 	 * gradually lower our standards until anything becomes acceptable.
503444cd46caSbillm 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
503544cd46caSbillm 	 * gives us hope of containing our fault domains to something we're
503644cd46caSbillm 	 * able to reason about.  Otherwise, any two top-level vdev failures
503744cd46caSbillm 	 * will guarantee the loss of data.  With consecutive allocation,
503844cd46caSbillm 	 * only two adjacent top-level vdev failures will result in data loss.
503944cd46caSbillm 	 *
504044cd46caSbillm 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
504144cd46caSbillm 	 * ourselves on the same vdev as our gang block header.  That
504244cd46caSbillm 	 * way, we can hope for locality in vdev_cache, plus it makes our
504344cd46caSbillm 	 * fault domains something tractable.
5044fa9e4066Sahrens 	 */
504544cd46caSbillm 	if (hintdva) {
504644cd46caSbillm 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
504788ecc943SGeorge Wilson 
504888ecc943SGeorge Wilson 		/*
504988ecc943SGeorge Wilson 		 * It's possible the vdev we're using as the hint no
50505cabbc6bSPrashanth Sreenivasa 		 * longer exists or its mg has been closed (e.g. by
50515cabbc6bSPrashanth Sreenivasa 		 * device removal).  Consult the rotor when
505288ecc943SGeorge Wilson 		 * all else fails.
505388ecc943SGeorge Wilson 		 */
50545cabbc6bSPrashanth Sreenivasa 		if (vd != NULL && vd->vdev_mg != NULL) {
505567bd71c6Sperrin 			mg = vd->vdev_mg;
505688ecc943SGeorge Wilson 
505788ecc943SGeorge Wilson 			if (flags & METASLAB_HINTBP_AVOID &&
505888ecc943SGeorge Wilson 			    mg->mg_next != NULL)
505988ecc943SGeorge Wilson 				mg = mg->mg_next;
506088ecc943SGeorge Wilson 		} else {
506188ecc943SGeorge Wilson 			mg = mc->mc_rotor;
506288ecc943SGeorge Wilson 		}
506344cd46caSbillm 	} else if (d != 0) {
506444cd46caSbillm 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
506544cd46caSbillm 		mg = vd->vdev_mg->mg_next;
506644cd46caSbillm 	} else {
5067663207adSDon Brady 		ASSERT(mc->mc_rotor != NULL);
506844cd46caSbillm 		mg = mc->mc_rotor;
506944cd46caSbillm 	}
507044cd46caSbillm 
50718654d025Sperrin 	/*
5072a1521560SJeff Bonwick 	 * If the hint put us into the wrong metaslab class, or into a
5073a1521560SJeff Bonwick 	 * metaslab group that has been passivated, just follow the rotor.
50748654d025Sperrin 	 */
5075a1521560SJeff Bonwick 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
50768654d025Sperrin 		mg = mc->mc_rotor;
50778654d025Sperrin 
50788654d025Sperrin 	rotor = mg;
507944cd46caSbillm top:
5080fa9e4066Sahrens 	do {
50818363e80aSGeorge Wilson 		boolean_t allocatable;
50828363e80aSGeorge Wilson 
5083a1521560SJeff Bonwick 		ASSERT(mg->mg_activation_count == 1);
5084fa9e4066Sahrens 		vd = mg->mg_vd;
50858ad4d6ddSJeff Bonwick 
50860a4e9518Sgw 		/*
5087e14bb325SJeff Bonwick 		 * Don't allocate from faulted devices.
50880a4e9518Sgw 		 */
50898363e80aSGeorge Wilson 		if (try_hard) {
50908ad4d6ddSJeff Bonwick 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
50918ad4d6ddSJeff Bonwick 			allocatable = vdev_allocatable(vd);
50928ad4d6ddSJeff Bonwick 			spa_config_exit(spa, SCL_ZIO, FTAG);
50938ad4d6ddSJeff Bonwick 		} else {
50948ad4d6ddSJeff Bonwick 			allocatable = vdev_allocatable(vd);
50958ad4d6ddSJeff Bonwick 		}
509622e30981SGeorge Wilson 
509722e30981SGeorge Wilson 		/*
509822e30981SGeorge Wilson 		 * Determine if the selected metaslab group is eligible
50990f7643c7SGeorge Wilson 		 * for allocations. If we're ganging then don't allow
51000f7643c7SGeorge Wilson 		 * this metaslab group to skip allocations since that would
51010f7643c7SGeorge Wilson 		 * inadvertently return ENOSPC and suspend the pool
510222e30981SGeorge Wilson 		 * even though space is still available.
510322e30981SGeorge Wilson 		 */
51048363e80aSGeorge Wilson 		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
51050f7643c7SGeorge Wilson 			allocatable = metaslab_group_allocatable(mg, rotor,
5106dbcaafbdSAlexander Motin 			    psize, allocator, d);
51070f7643c7SGeorge Wilson 		}
510822e30981SGeorge Wilson 
51098363e80aSGeorge Wilson 		if (!allocatable) {
51108363e80aSGeorge Wilson 			metaslab_trace_add(zal, mg, NULL, psize, d,
5111f78cdc34SPaul Dagnelie 			    TRACE_NOT_ALLOCATABLE, allocator);
51120a4e9518Sgw 			goto next;
51138363e80aSGeorge Wilson 		}
51148ad4d6ddSJeff Bonwick 
51150f7643c7SGeorge Wilson 		ASSERT(mg->mg_initialized);
51160f7643c7SGeorge Wilson 
51170a4e9518Sgw 		/*
51188363e80aSGeorge Wilson 		 * Avoid writing single-copy data to a failing,
51198363e80aSGeorge Wilson 		 * non-redundant vdev, unless we've already tried all
51208363e80aSGeorge Wilson 		 * other vdevs.
51210a4e9518Sgw 		 */
51220a4e9518Sgw 		if ((vd->vdev_stat.vs_write_errors > 0 ||
51230a4e9518Sgw 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
51248363e80aSGeorge Wilson 		    d == 0 && !try_hard && vd->vdev_children == 0) {
51258363e80aSGeorge Wilson 			metaslab_trace_add(zal, mg, NULL, psize, d,
5126f78cdc34SPaul Dagnelie 			    TRACE_VDEV_ERROR, allocator);
51270a4e9518Sgw 			goto next;
51280a4e9518Sgw 		}
512944cd46caSbillm 
51308654d025Sperrin 		ASSERT(mg->mg_class == mc);
51318654d025Sperrin 
51328363e80aSGeorge Wilson 		uint64_t asize = vdev_psize_to_asize(vd, psize);
5133fa9e4066Sahrens 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
5134fa9e4066Sahrens 
5135663207adSDon Brady 		/*
5136663207adSDon Brady 		 * If we don't need to try hard, then require that the
5137663207adSDon Brady 		 * block be on a different metaslab from any other DVAs
5138663207adSDon Brady 		 * in this BP (unique=true).  If we are trying hard, then
5139663207adSDon Brady 		 * allow any metaslab to be used (unique=false).
5140663207adSDon Brady 		 */
51418363e80aSGeorge Wilson 		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
5142af1d63abSPaul Dagnelie 		    !try_hard, dva, d, allocator, try_hard);
51430f7643c7SGeorge Wilson 
514444cd46caSbillm 		if (offset != -1ULL) {
5145fa9e4066Sahrens 			/*
5146fa9e4066Sahrens 			 * If we've just selected this metaslab group,
5147fa9e4066Sahrens 			 * figure out whether the corresponding vdev is
5148fa9e4066Sahrens 			 * over- or under-used relative to the pool,
5149fa9e4066Sahrens 			 * and set an allocation bias to even it out.
5150fa9e4066Sahrens 			 */
51512e4c9986SGeorge Wilson 			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
5152fa9e4066Sahrens 				vdev_stat_t *vs = &vd->vdev_stat;
5153b24ab676SJeff Bonwick 				int64_t vu, cu;
5154fa9e4066Sahrens 
515509c9d376SGeorge Wilson 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
515609c9d376SGeorge Wilson 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
5157fa9e4066Sahrens 
5158fa9e4066Sahrens 				/*
515909c9d376SGeorge Wilson 				 * Calculate how much more or less we should
516009c9d376SGeorge Wilson 				 * try to allocate from this device during
516109c9d376SGeorge Wilson 				 * this iteration around the rotor.
516209c9d376SGeorge Wilson 				 * For example, if a device is 80% full
516309c9d376SGeorge Wilson 				 * and the pool is 20% full then we should
516409c9d376SGeorge Wilson 				 * reduce allocations by 60% on this device.
516509c9d376SGeorge Wilson 				 *
516609c9d376SGeorge Wilson 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
516709c9d376SGeorge Wilson 				 *
516809c9d376SGeorge Wilson 				 * This reduces allocations by 307K for this
516909c9d376SGeorge Wilson 				 * iteration.
5170fa9e4066Sahrens 				 */
5171b24ab676SJeff Bonwick 				mg->mg_bias = ((cu - vu) *
517209c9d376SGeorge Wilson 				    (int64_t)mg->mg_aliquot) / 100;
51732e4c9986SGeorge Wilson 			} else if (!metaslab_bias_enabled) {
51742e4c9986SGeorge Wilson 				mg->mg_bias = 0;
5175fa9e4066Sahrens 			}
5176fa9e4066Sahrens 
5177b24ab676SJeff Bonwick 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
5178fa9e4066Sahrens 			    mg->mg_aliquot + mg->mg_bias) {
5179fa9e4066Sahrens 				mc->mc_rotor = mg->mg_next;
5180b24ab676SJeff Bonwick 				mc->mc_aliquot = 0;
5181fa9e4066Sahrens 			}
5182fa9e4066Sahrens 
518344cd46caSbillm 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
518444cd46caSbillm 			DVA_SET_OFFSET(&dva[d], offset);
5185e14bb325SJeff Bonwick 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
518644cd46caSbillm 			DVA_SET_ASIZE(&dva[d], asize);
5187fa9e4066Sahrens 
5188fa9e4066Sahrens 			return (0);
5189fa9e4066Sahrens 		}
51900a4e9518Sgw next:
5191fa9e4066Sahrens 		mc->mc_rotor = mg->mg_next;
5192b24ab676SJeff Bonwick 		mc->mc_aliquot = 0;
5193fa9e4066Sahrens 	} while ((mg = mg->mg_next) != rotor);
5194fa9e4066Sahrens 
51958363e80aSGeorge Wilson 	/*
51968363e80aSGeorge Wilson 	 * If we haven't tried hard, do so now.
51978363e80aSGeorge Wilson 	 */
51988363e80aSGeorge Wilson 	if (!try_hard) {
51998363e80aSGeorge Wilson 		try_hard = B_TRUE;
52008ad4d6ddSJeff Bonwick 		goto top;
52018ad4d6ddSJeff Bonwick 	}
52028ad4d6ddSJeff Bonwick 
520344cd46caSbillm 	bzero(&dva[d], sizeof (dva_t));
5204fa9e4066Sahrens 
5205f78cdc34SPaul Dagnelie 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
5206be6fd75aSMatthew Ahrens 	return (SET_ERROR(ENOSPC));
5207fa9e4066Sahrens }
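/*
 * A minimal sketch (illustration only, not part of this file) of the
 * mg_bias arithmetic used above, with the same example values as the
 * comment: a device 80% full in a pool 20% full and a 512K aliquot gives
 * (20 - 80) * (512 << 10) / 100 = -314572 bytes, i.e. roughly -307K.
 */
#if 0
static int64_t
example_mg_bias(int64_t vs_alloc, int64_t vs_space, int64_t mc_alloc,
    int64_t mc_space, int64_t mg_aliquot)
{
	int64_t vu = (vs_alloc * 100) / (vs_space + 1);	/* vdev % used */
	int64_t cu = (mc_alloc * 100) / (mc_space + 1);	/* class % used */

	return (((cu - vu) * mg_aliquot) / 100);
}
#endif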
5208fa9e4066Sahrens 
52095cabbc6bSPrashanth Sreenivasa void
52105cabbc6bSPrashanth Sreenivasa metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
521186714001SSerapheim Dimitropoulos     boolean_t checkpoint)
52125cabbc6bSPrashanth Sreenivasa {
52135cabbc6bSPrashanth Sreenivasa 	metaslab_t *msp;
52145cabbc6bSPrashanth Sreenivasa 	spa_t *spa = vd->vdev_spa;
52155cabbc6bSPrashanth Sreenivasa 
52165cabbc6bSPrashanth Sreenivasa 	ASSERT(vdev_is_concrete(vd));
52175cabbc6bSPrashanth Sreenivasa 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
52185cabbc6bSPrashanth Sreenivasa 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
52195cabbc6bSPrashanth Sreenivasa 
52205cabbc6bSPrashanth Sreenivasa 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
52215cabbc6bSPrashanth Sreenivasa 
52225cabbc6bSPrashanth Sreenivasa 	VERIFY(!msp->ms_condensing);
52235cabbc6bSPrashanth Sreenivasa 	VERIFY3U(offset, >=, msp->ms_start);
52245cabbc6bSPrashanth Sreenivasa 	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
52255cabbc6bSPrashanth Sreenivasa 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
52265cabbc6bSPrashanth Sreenivasa 	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
52275cabbc6bSPrashanth Sreenivasa 
52285cabbc6bSPrashanth Sreenivasa 	metaslab_check_free_impl(vd, offset, asize);
522986714001SSerapheim Dimitropoulos 
52305cabbc6bSPrashanth Sreenivasa 	mutex_enter(&msp->ms_lock);
523186714001SSerapheim Dimitropoulos 	if (range_tree_is_empty(msp->ms_freeing) &&
523286714001SSerapheim Dimitropoulos 	    range_tree_is_empty(msp->ms_checkpointing)) {
523386714001SSerapheim Dimitropoulos 		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
523486714001SSerapheim Dimitropoulos 	}
523586714001SSerapheim Dimitropoulos 
523686714001SSerapheim Dimitropoulos 	if (checkpoint) {
523786714001SSerapheim Dimitropoulos 		ASSERT(spa_has_checkpoint(spa));
523886714001SSerapheim Dimitropoulos 		range_tree_add(msp->ms_checkpointing, offset, asize);
523986714001SSerapheim Dimitropoulos 	} else {
524086714001SSerapheim Dimitropoulos 		range_tree_add(msp->ms_freeing, offset, asize);
52415cabbc6bSPrashanth Sreenivasa 	}
52425cabbc6bSPrashanth Sreenivasa 	mutex_exit(&msp->ms_lock);
52435cabbc6bSPrashanth Sreenivasa }
52445cabbc6bSPrashanth Sreenivasa 
52455cabbc6bSPrashanth Sreenivasa /* ARGSUSED */
52465cabbc6bSPrashanth Sreenivasa void
52475cabbc6bSPrashanth Sreenivasa metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
52485cabbc6bSPrashanth Sreenivasa     uint64_t size, void *arg)
52495cabbc6bSPrashanth Sreenivasa {
525086714001SSerapheim Dimitropoulos 	boolean_t *checkpoint = arg;
525186714001SSerapheim Dimitropoulos 
525286714001SSerapheim Dimitropoulos 	ASSERT3P(checkpoint, !=, NULL);
52535cabbc6bSPrashanth Sreenivasa 
52545cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_ops->vdev_op_remap != NULL)
525586714001SSerapheim Dimitropoulos 		vdev_indirect_mark_obsolete(vd, offset, size);
52565cabbc6bSPrashanth Sreenivasa 	else
525786714001SSerapheim Dimitropoulos 		metaslab_free_impl(vd, offset, size, *checkpoint);
52585cabbc6bSPrashanth Sreenivasa }
52595cabbc6bSPrashanth Sreenivasa 
52605cabbc6bSPrashanth Sreenivasa static void
52615cabbc6bSPrashanth Sreenivasa metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
526286714001SSerapheim Dimitropoulos     boolean_t checkpoint)
52635cabbc6bSPrashanth Sreenivasa {
52645cabbc6bSPrashanth Sreenivasa 	spa_t *spa = vd->vdev_spa;
52655cabbc6bSPrashanth Sreenivasa 
52665cabbc6bSPrashanth Sreenivasa 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
52675cabbc6bSPrashanth Sreenivasa 
526886714001SSerapheim Dimitropoulos 	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
52695cabbc6bSPrashanth Sreenivasa 		return;
52705cabbc6bSPrashanth Sreenivasa 
52715cabbc6bSPrashanth Sreenivasa 	if (spa->spa_vdev_removal != NULL &&
52723a4b1be9SMatthew Ahrens 	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
52735cabbc6bSPrashanth Sreenivasa 	    vdev_is_concrete(vd)) {
52745cabbc6bSPrashanth Sreenivasa 		/*
52755cabbc6bSPrashanth Sreenivasa 		 * Note: we check if the vdev is concrete because when
52765cabbc6bSPrashanth Sreenivasa 		 * we complete the removal, we first change the vdev to be
52775cabbc6bSPrashanth Sreenivasa 		 * an indirect vdev (in open context), and then (in syncing
52785cabbc6bSPrashanth Sreenivasa 		 * context) clear spa_vdev_removal.
52795cabbc6bSPrashanth Sreenivasa 		 */
528086714001SSerapheim Dimitropoulos 		free_from_removing_vdev(vd, offset, size);
52815cabbc6bSPrashanth Sreenivasa 	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
528286714001SSerapheim Dimitropoulos 		vdev_indirect_mark_obsolete(vd, offset, size);
52835cabbc6bSPrashanth Sreenivasa 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
528486714001SSerapheim Dimitropoulos 		    metaslab_free_impl_cb, &checkpoint);
52855cabbc6bSPrashanth Sreenivasa 	} else {
528686714001SSerapheim Dimitropoulos 		metaslab_free_concrete(vd, offset, size, checkpoint);
52875cabbc6bSPrashanth Sreenivasa 	}
52885cabbc6bSPrashanth Sreenivasa }
52895cabbc6bSPrashanth Sreenivasa 
52905cabbc6bSPrashanth Sreenivasa typedef struct remap_blkptr_cb_arg {
52915cabbc6bSPrashanth Sreenivasa 	blkptr_t *rbca_bp;
52925cabbc6bSPrashanth Sreenivasa 	spa_remap_cb_t rbca_cb;
52935cabbc6bSPrashanth Sreenivasa 	vdev_t *rbca_remap_vd;
52945cabbc6bSPrashanth Sreenivasa 	uint64_t rbca_remap_offset;
52955cabbc6bSPrashanth Sreenivasa 	void *rbca_cb_arg;
52965cabbc6bSPrashanth Sreenivasa } remap_blkptr_cb_arg_t;
52975cabbc6bSPrashanth Sreenivasa 
52985cabbc6bSPrashanth Sreenivasa void
52995cabbc6bSPrashanth Sreenivasa remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
53005cabbc6bSPrashanth Sreenivasa     uint64_t size, void *arg)
53015cabbc6bSPrashanth Sreenivasa {
53025cabbc6bSPrashanth Sreenivasa 	remap_blkptr_cb_arg_t *rbca = arg;
53035cabbc6bSPrashanth Sreenivasa 	blkptr_t *bp = rbca->rbca_bp;
53045cabbc6bSPrashanth Sreenivasa 
53055cabbc6bSPrashanth Sreenivasa 	/* We cannot remap split blocks. */
53065cabbc6bSPrashanth Sreenivasa 	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
53075cabbc6bSPrashanth Sreenivasa 		return;
53085cabbc6bSPrashanth Sreenivasa 	ASSERT0(inner_offset);
53095cabbc6bSPrashanth Sreenivasa 
53105cabbc6bSPrashanth Sreenivasa 	if (rbca->rbca_cb != NULL) {
53115cabbc6bSPrashanth Sreenivasa 		/*
53125cabbc6bSPrashanth Sreenivasa 		 * At this point we know that we are not handling split
53135cabbc6bSPrashanth Sreenivasa 		 * blocks and we invoke the callback on the previous
53145cabbc6bSPrashanth Sreenivasa 		 * vdev which must be indirect.
53155cabbc6bSPrashanth Sreenivasa 		 */
53165cabbc6bSPrashanth Sreenivasa 		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
53175cabbc6bSPrashanth Sreenivasa 
53185cabbc6bSPrashanth Sreenivasa 		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
53195cabbc6bSPrashanth Sreenivasa 		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
53205cabbc6bSPrashanth Sreenivasa 
53215cabbc6bSPrashanth Sreenivasa 		/* set up remap_blkptr_cb_arg for the next call */
53225cabbc6bSPrashanth Sreenivasa 		rbca->rbca_remap_vd = vd;
53235cabbc6bSPrashanth Sreenivasa 		rbca->rbca_remap_offset = offset;
53245cabbc6bSPrashanth Sreenivasa 	}
53255cabbc6bSPrashanth Sreenivasa 
53265cabbc6bSPrashanth Sreenivasa 	/*
53275cabbc6bSPrashanth Sreenivasa 	 * The phys birth time is that of dva[0].  This ensures that we know
53285cabbc6bSPrashanth Sreenivasa 	 * when each dva was written, so that resilver can determine which
53295cabbc6bSPrashanth Sreenivasa 	 * blocks need to be scrubbed (i.e. those written during the time
53305cabbc6bSPrashanth Sreenivasa 	 * the vdev was offline).  It also ensures that the key used in
53315cabbc6bSPrashanth Sreenivasa 	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
53325cabbc6bSPrashanth Sreenivasa 	 * we didn't change the phys_birth, a lookup in the ARC for a
53335cabbc6bSPrashanth Sreenivasa 	 * remapped BP could find the data that was previously stored at
53345cabbc6bSPrashanth Sreenivasa 	 * this vdev + offset.
53355cabbc6bSPrashanth Sreenivasa 	 */
53365cabbc6bSPrashanth Sreenivasa 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
53375cabbc6bSPrashanth Sreenivasa 	    DVA_GET_VDEV(&bp->blk_dva[0]));
53385cabbc6bSPrashanth Sreenivasa 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
53395cabbc6bSPrashanth Sreenivasa 	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
53405cabbc6bSPrashanth Sreenivasa 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
53415cabbc6bSPrashanth Sreenivasa 
53425cabbc6bSPrashanth Sreenivasa 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
53435cabbc6bSPrashanth Sreenivasa 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
53445cabbc6bSPrashanth Sreenivasa }
53455cabbc6bSPrashanth Sreenivasa 
5346fa9e4066Sahrens /*
53475cabbc6bSPrashanth Sreenivasa  * If the block pointer contains any indirect DVAs, modify them to refer to
53485cabbc6bSPrashanth Sreenivasa  * concrete DVAs.  Note that this will sometimes not be possible, leaving
53495cabbc6bSPrashanth Sreenivasa  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
53505cabbc6bSPrashanth Sreenivasa  * segments in the mapping (i.e. it is a "split block").
53515cabbc6bSPrashanth Sreenivasa  *
53525cabbc6bSPrashanth Sreenivasa  * If the BP was remapped, calls the callback on the original dva (note the
53535cabbc6bSPrashanth Sreenivasa  * callback can be called multiple times if the original indirect DVA refers
53545cabbc6bSPrashanth Sreenivasa  * to another indirect DVA, etc).
53555cabbc6bSPrashanth Sreenivasa  *
53565cabbc6bSPrashanth Sreenivasa  * Returns TRUE if the BP was remapped.
5357fa9e4066Sahrens  */
53585cabbc6bSPrashanth Sreenivasa boolean_t
53595cabbc6bSPrashanth Sreenivasa spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
5360fa9e4066Sahrens {
53615cabbc6bSPrashanth Sreenivasa 	remap_blkptr_cb_arg_t rbca;
53625cabbc6bSPrashanth Sreenivasa 
53635cabbc6bSPrashanth Sreenivasa 	if (!zfs_remap_blkptr_enable)
53645cabbc6bSPrashanth Sreenivasa 		return (B_FALSE);
53655cabbc6bSPrashanth Sreenivasa 
53665cabbc6bSPrashanth Sreenivasa 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
53675cabbc6bSPrashanth Sreenivasa 		return (B_FALSE);
53685cabbc6bSPrashanth Sreenivasa 
53695cabbc6bSPrashanth Sreenivasa 	/*
53705cabbc6bSPrashanth Sreenivasa 	 * Dedup BPs cannot be remapped, because ddt_phys_select() depends
53715cabbc6bSPrashanth Sreenivasa 	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
53725cabbc6bSPrashanth Sreenivasa 	 */
53735cabbc6bSPrashanth Sreenivasa 	if (BP_GET_DEDUP(bp))
53745cabbc6bSPrashanth Sreenivasa 		return (B_FALSE);
53755cabbc6bSPrashanth Sreenivasa 
53765cabbc6bSPrashanth Sreenivasa 	/*
53775cabbc6bSPrashanth Sreenivasa 	 * Gang blocks cannot be remapped, because
53785cabbc6bSPrashanth Sreenivasa 	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
53795cabbc6bSPrashanth Sreenivasa 	 * the BP used to read the gang block header (GBH) being the same
53805cabbc6bSPrashanth Sreenivasa 	 * as the DVA[0] that we allocated for the GBH.
53815cabbc6bSPrashanth Sreenivasa 	 */
53825cabbc6bSPrashanth Sreenivasa 	if (BP_IS_GANG(bp))
53835cabbc6bSPrashanth Sreenivasa 		return (B_FALSE);
53845cabbc6bSPrashanth Sreenivasa 
53855cabbc6bSPrashanth Sreenivasa 	/*
53865cabbc6bSPrashanth Sreenivasa 	 * Embedded BPs have no DVA to remap.
53875cabbc6bSPrashanth Sreenivasa 	 */
53885cabbc6bSPrashanth Sreenivasa 	if (BP_GET_NDVAS(bp) < 1)
53895cabbc6bSPrashanth Sreenivasa 		return (B_FALSE);
53905cabbc6bSPrashanth Sreenivasa 
53915cabbc6bSPrashanth Sreenivasa 	/*
53925cabbc6bSPrashanth Sreenivasa 	 * Note: we only remap dva[0].  If we remapped other dvas, we
53935cabbc6bSPrashanth Sreenivasa 	 * would no longer know what their phys birth txg is.
53945cabbc6bSPrashanth Sreenivasa 	 */
53955cabbc6bSPrashanth Sreenivasa 	dva_t *dva = &bp->blk_dva[0];
53965cabbc6bSPrashanth Sreenivasa 
5397fa9e4066Sahrens 	uint64_t offset = DVA_GET_OFFSET(dva);
5398fa9e4066Sahrens 	uint64_t size = DVA_GET_ASIZE(dva);
53995cabbc6bSPrashanth Sreenivasa 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
54005cabbc6bSPrashanth Sreenivasa 
54015cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_ops->vdev_op_remap == NULL)
54025cabbc6bSPrashanth Sreenivasa 		return (B_FALSE);
54035cabbc6bSPrashanth Sreenivasa 
54045cabbc6bSPrashanth Sreenivasa 	rbca.rbca_bp = bp;
54055cabbc6bSPrashanth Sreenivasa 	rbca.rbca_cb = callback;
54065cabbc6bSPrashanth Sreenivasa 	rbca.rbca_remap_vd = vd;
54075cabbc6bSPrashanth Sreenivasa 	rbca.rbca_remap_offset = offset;
54085cabbc6bSPrashanth Sreenivasa 	rbca.rbca_cb_arg = arg;
54095cabbc6bSPrashanth Sreenivasa 
54105cabbc6bSPrashanth Sreenivasa 	/*
54115cabbc6bSPrashanth Sreenivasa 	 * remap_blkptr_cb() will be called in order for each level of
54125cabbc6bSPrashanth Sreenivasa 	 * indirection, until a concrete vdev is reached or a split block is
54135cabbc6bSPrashanth Sreenivasa 	 * encountered. old_vd and old_offset are updated within the callback
54145cabbc6bSPrashanth Sreenivasa 	 * as we go from the one indirect vdev to the next one (either concrete
54155cabbc6bSPrashanth Sreenivasa 	 * or indirect again) in that order.
54165cabbc6bSPrashanth Sreenivasa 	 */
54175cabbc6bSPrashanth Sreenivasa 	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
54185cabbc6bSPrashanth Sreenivasa 
54195cabbc6bSPrashanth Sreenivasa 	/* Check if the DVA wasn't remapped because it is a split block */
54205cabbc6bSPrashanth Sreenivasa 	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
54215cabbc6bSPrashanth Sreenivasa 		return (B_FALSE);
54225cabbc6bSPrashanth Sreenivasa 
54235cabbc6bSPrashanth Sreenivasa 	return (B_TRUE);
54245cabbc6bSPrashanth Sreenivasa }
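/*
 * A minimal usage sketch (hypothetical caller, not from this file; the
 * callback signature is inferred from the call site in remap_blkptr_cb()
 * above): remap a BP and log each original indirect DVA it replaced.
 */
#if 0
static void
example_remap_cb(uint64_t vdev, uint64_t offset, uint64_t size, void *arg)
{
	zfs_dbgmsg("remapped from vdev %llu offset %llu size %llu",
	    (u_longlong_t)vdev, (u_longlong_t)offset, (u_longlong_t)size);
}

	/* ... somewhere with 'spa' and 'bp' in scope ... */
	if (spa_remap_blkptr(spa, bp, example_remap_cb, NULL))
		zfs_dbgmsg("bp now points at a concrete vdev");
#endif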
54255cabbc6bSPrashanth Sreenivasa 
54265cabbc6bSPrashanth Sreenivasa /*
54275cabbc6bSPrashanth Sreenivasa  * Undo the allocation of a DVA which happened in the given transaction group.
54285cabbc6bSPrashanth Sreenivasa  */
54295cabbc6bSPrashanth Sreenivasa void
54305cabbc6bSPrashanth Sreenivasa metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
54315cabbc6bSPrashanth Sreenivasa {
5432fa9e4066Sahrens 	metaslab_t *msp;
54335cabbc6bSPrashanth Sreenivasa 	vdev_t *vd;
54345cabbc6bSPrashanth Sreenivasa 	uint64_t vdev = DVA_GET_VDEV(dva);
54355cabbc6bSPrashanth Sreenivasa 	uint64_t offset = DVA_GET_OFFSET(dva);
54365cabbc6bSPrashanth Sreenivasa 	uint64_t size = DVA_GET_ASIZE(dva);
5437fa9e4066Sahrens 
5438d80c45e0Sbonwick 	ASSERT(DVA_IS_VALID(dva));
54395cabbc6bSPrashanth Sreenivasa 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5440d80c45e0Sbonwick 
5441fa9e4066Sahrens 	if (txg > spa_freeze_txg(spa))
5442fa9e4066Sahrens 		return;
5443fa9e4066Sahrens 
5444d80c45e0Sbonwick 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
5445d80c45e0Sbonwick 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
5446d80c45e0Sbonwick 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
5447d80c45e0Sbonwick 		    (u_longlong_t)vdev, (u_longlong_t)offset);
5448fa9e4066Sahrens 		ASSERT(0);
5449fa9e4066Sahrens 		return;
5450fa9e4066Sahrens 	}
5451fa9e4066Sahrens 
54525cabbc6bSPrashanth Sreenivasa 	ASSERT(!vd->vdev_removing);
54535cabbc6bSPrashanth Sreenivasa 	ASSERT(vdev_is_concrete(vd));
54545cabbc6bSPrashanth Sreenivasa 	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
54555cabbc6bSPrashanth Sreenivasa 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
5456fa9e4066Sahrens 
5457fa9e4066Sahrens 	if (DVA_GET_GANG(dva))
5458fa9e4066Sahrens 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5459fa9e4066Sahrens 
54605cabbc6bSPrashanth Sreenivasa 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
54610713e232SGeorge Wilson 
54625cabbc6bSPrashanth Sreenivasa 	mutex_enter(&msp->ms_lock);
546386714001SSerapheim Dimitropoulos 	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
54645cabbc6bSPrashanth Sreenivasa 	    offset, size);
5465af1d63abSPaul Dagnelie 	msp->ms_allocating_total -= size;
5466fa9e4066Sahrens 
54675cabbc6bSPrashanth Sreenivasa 	VERIFY(!msp->ms_condensing);
54685cabbc6bSPrashanth Sreenivasa 	VERIFY3U(offset, >=, msp->ms_start);
54695cabbc6bSPrashanth Sreenivasa 	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
547086714001SSerapheim Dimitropoulos 	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
54715cabbc6bSPrashanth Sreenivasa 	    msp->ms_size);
54725cabbc6bSPrashanth Sreenivasa 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
54735cabbc6bSPrashanth Sreenivasa 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
547486714001SSerapheim Dimitropoulos 	range_tree_add(msp->ms_allocatable, offset, size);
5475fa9e4066Sahrens 	mutex_exit(&msp->ms_lock);
5476fa9e4066Sahrens }
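/*
 * Note the asymmetry with metaslab_free_dva() below: unalloc undoes an
 * allocation from a still-open txg by returning the space directly to
 * ms_allocatable, while a free of already-written data goes through
 * ms_freeing (or ms_checkpointing) and is only processed at sync time.
 */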
5477d80c45e0Sbonwick 
5478d80c45e0Sbonwick /*
547986714001SSerapheim Dimitropoulos  * Free the block represented by the given DVA.
5480d80c45e0Sbonwick  */
54815cabbc6bSPrashanth Sreenivasa void
548286714001SSerapheim Dimitropoulos metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
5483d80c45e0Sbonwick {
5484d80c45e0Sbonwick 	uint64_t vdev = DVA_GET_VDEV(dva);
5485d80c45e0Sbonwick 	uint64_t offset = DVA_GET_OFFSET(dva);
5486d80c45e0Sbonwick 	uint64_t size = DVA_GET_ASIZE(dva);
54875cabbc6bSPrashanth Sreenivasa 	vdev_t *vd = vdev_lookup_top(spa, vdev);
5488d80c45e0Sbonwick 
5489d80c45e0Sbonwick 	ASSERT(DVA_IS_VALID(dva));
54905cabbc6bSPrashanth Sreenivasa 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5491d80c45e0Sbonwick 
54925cabbc6bSPrashanth Sreenivasa 	if (DVA_GET_GANG(dva)) {
5493d80c45e0Sbonwick 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5494d80c45e0Sbonwick 	}
5495d80c45e0Sbonwick 
549686714001SSerapheim Dimitropoulos 	metaslab_free_impl(vd, offset, size, checkpoint);
5497d80c45e0Sbonwick }
5498d80c45e0Sbonwick 
54990f7643c7SGeorge Wilson /*
55000f7643c7SGeorge Wilson  * Reserve some allocation slots. The reservation system must be called
55010f7643c7SGeorge Wilson  * before we call into the allocator. If there aren't any available slots
55020f7643c7SGeorge Wilson  * then the I/O will be throttled until an I/O completes and its slots are
55030f7643c7SGeorge Wilson  * freed up. The function returns true if it was successful in placing
55040f7643c7SGeorge Wilson  * the reservation.
55050f7643c7SGeorge Wilson  */
55060f7643c7SGeorge Wilson boolean_t
5507f78cdc34SPaul Dagnelie metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
5508f78cdc34SPaul Dagnelie     zio_t *zio, int flags)
55090f7643c7SGeorge Wilson {
55100f7643c7SGeorge Wilson 	uint64_t available_slots = 0;
55110f7643c7SGeorge Wilson 	boolean_t slot_reserved = B_FALSE;
5512f78cdc34SPaul Dagnelie 	uint64_t max = mc->mc_alloc_max_slots[allocator];
55130f7643c7SGeorge Wilson 
55140f7643c7SGeorge Wilson 	ASSERT(mc->mc_alloc_throttle_enabled);
55150f7643c7SGeorge Wilson 	mutex_enter(&mc->mc_lock);
55160f7643c7SGeorge Wilson 
5517f78cdc34SPaul Dagnelie 	uint64_t reserved_slots =
5518e914ace2STim Schumacher 	    zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
5519f78cdc34SPaul Dagnelie 	if (reserved_slots < max)
5520f78cdc34SPaul Dagnelie 		available_slots = max - reserved_slots;
55210f7643c7SGeorge Wilson 
5522663207adSDon Brady 	if (slots <= available_slots || GANG_ALLOCATION(flags) ||
5523663207adSDon Brady 	    flags & METASLAB_MUST_RESERVE) {
55240f7643c7SGeorge Wilson 		/*
55250f7643c7SGeorge Wilson 		 * We reserve the slots individually so that we can unreserve
55260f7643c7SGeorge Wilson 		 * them individually when an I/O completes.
55270f7643c7SGeorge Wilson 		 */
5528*ecd18decSAlexander Motin 		zfs_refcount_add_few(&mc->mc_alloc_slots[allocator], slots,
5529*ecd18decSAlexander Motin 		    zio);
55300f7643c7SGeorge Wilson 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
55310f7643c7SGeorge Wilson 		slot_reserved = B_TRUE;
55320f7643c7SGeorge Wilson 	}
55330f7643c7SGeorge Wilson 
55340f7643c7SGeorge Wilson 	mutex_exit(&mc->mc_lock);
55350f7643c7SGeorge Wilson 	return (slot_reserved);
55360f7643c7SGeorge Wilson }
55370f7643c7SGeorge Wilson 
55380f7643c7SGeorge Wilson void
5539f78cdc34SPaul Dagnelie metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
5540f78cdc34SPaul Dagnelie     int allocator, zio_t *zio)
55410f7643c7SGeorge Wilson {
55420f7643c7SGeorge Wilson 	ASSERT(mc->mc_alloc_throttle_enabled);
55430f7643c7SGeorge Wilson 	mutex_enter(&mc->mc_lock);
5544*ecd18decSAlexander Motin 	zfs_refcount_remove_few(&mc->mc_alloc_slots[allocator], slots, zio);
55450f7643c7SGeorge Wilson 	mutex_exit(&mc->mc_lock);
55460f7643c7SGeorge Wilson }
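/*
 * A minimal sketch (hypothetical caller, not from this file) of pairing a
 * reservation with its release: slots reserved before issuing an
 * allocating zio are returned once that zio completes.
 */
#if 0
	if (metaslab_class_throttle_reserve(mc, slots, allocator, zio, flags)) {
		/* ... issue the allocation on behalf of 'zio' ... */
		metaslab_class_throttle_unreserve(mc, slots, allocator, zio);
	} else {
		/* no slots available; the I/O is throttled and retried later */
	}
#endif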
55470f7643c7SGeorge Wilson 
55485cabbc6bSPrashanth Sreenivasa static int
55495cabbc6bSPrashanth Sreenivasa metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
55505cabbc6bSPrashanth Sreenivasa     uint64_t txg)
55515cabbc6bSPrashanth Sreenivasa {
55525cabbc6bSPrashanth Sreenivasa 	metaslab_t *msp;
55535cabbc6bSPrashanth Sreenivasa 	spa_t *spa = vd->vdev_spa;
55545cabbc6bSPrashanth Sreenivasa 	int error = 0;
55555cabbc6bSPrashanth Sreenivasa 
55565cabbc6bSPrashanth Sreenivasa 	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
55575cabbc6bSPrashanth Sreenivasa 		return (ENXIO);
55585cabbc6bSPrashanth Sreenivasa 
55595cabbc6bSPrashanth Sreenivasa 	ASSERT3P(vd->vdev_ms, !=, NULL);
55605cabbc6bSPrashanth Sreenivasa 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
55615cabbc6bSPrashanth Sreenivasa 
55625cabbc6bSPrashanth Sreenivasa 	mutex_enter(&msp->ms_lock);
55635cabbc6bSPrashanth Sreenivasa 
55645cabbc6bSPrashanth Sreenivasa 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
5565f78cdc34SPaul Dagnelie 		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
5566f78cdc34SPaul Dagnelie 	/*
5567f78cdc34SPaul Dagnelie 	 * No need to fail in that case; someone else has activated the
5568f78cdc34SPaul Dagnelie 	 * metaslab, but that doesn't preclude us from using it.
5569f78cdc34SPaul Dagnelie 	 */
5570f78cdc34SPaul Dagnelie 	if (error == EBUSY)
5571f78cdc34SPaul Dagnelie 		error = 0;
55725cabbc6bSPrashanth Sreenivasa 
557386714001SSerapheim Dimitropoulos 	if (error == 0 &&
557486714001SSerapheim Dimitropoulos 	    !range_tree_contains(msp->ms_allocatable, offset, size))
55755cabbc6bSPrashanth Sreenivasa 		error = SET_ERROR(ENOENT);
55765cabbc6bSPrashanth Sreenivasa 
55775cabbc6bSPrashanth Sreenivasa 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
55785cabbc6bSPrashanth Sreenivasa 		mutex_exit(&msp->ms_lock);
55795cabbc6bSPrashanth Sreenivasa 		return (error);
55805cabbc6bSPrashanth Sreenivasa 	}
55815cabbc6bSPrashanth Sreenivasa 
55825cabbc6bSPrashanth Sreenivasa 	VERIFY(!msp->ms_condensing);
55835cabbc6bSPrashanth Sreenivasa 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
55845cabbc6bSPrashanth Sreenivasa 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
558586714001SSerapheim Dimitropoulos 	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
558686714001SSerapheim Dimitropoulos 	    msp->ms_size);
558786714001SSerapheim Dimitropoulos 	range_tree_remove(msp->ms_allocatable, offset, size);
5588084fd14fSBrian Behlendorf 	range_tree_clear(msp->ms_trim, offset, size);
55895cabbc6bSPrashanth Sreenivasa 
5590bbf21555SRichard Lowe 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(8) */
5591af1d63abSPaul Dagnelie 		metaslab_class_t *mc = msp->ms_group->mg_class;
5592af1d63abSPaul Dagnelie 		multilist_sublist_t *mls =
5593af1d63abSPaul Dagnelie 		    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
5594af1d63abSPaul Dagnelie 		if (!multilist_link_active(&msp->ms_class_txg_node)) {
5595af1d63abSPaul Dagnelie 			msp->ms_selected_txg = txg;
5596af1d63abSPaul Dagnelie 			multilist_sublist_insert_head(mls, msp);
5597af1d63abSPaul Dagnelie 		}
5598af1d63abSPaul Dagnelie 		multilist_sublist_unlock(mls);
5599af1d63abSPaul Dagnelie 
560086714001SSerapheim Dimitropoulos 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
56015cabbc6bSPrashanth Sreenivasa 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
560286714001SSerapheim Dimitropoulos 		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
560386714001SSerapheim Dimitropoulos 		    offset, size);
5604af1d63abSPaul Dagnelie 		msp->ms_allocating_total += size;
56055cabbc6bSPrashanth Sreenivasa 	}
56065cabbc6bSPrashanth Sreenivasa 
56075cabbc6bSPrashanth Sreenivasa 	mutex_exit(&msp->ms_lock);
56085cabbc6bSPrashanth Sreenivasa 
56095cabbc6bSPrashanth Sreenivasa 	return (0);
56105cabbc6bSPrashanth Sreenivasa }
56115cabbc6bSPrashanth Sreenivasa 
56125cabbc6bSPrashanth Sreenivasa typedef struct metaslab_claim_cb_arg_t {
56135cabbc6bSPrashanth Sreenivasa 	uint64_t	mcca_txg;
56145cabbc6bSPrashanth Sreenivasa 	int		mcca_error;
56155cabbc6bSPrashanth Sreenivasa } metaslab_claim_cb_arg_t;
56165cabbc6bSPrashanth Sreenivasa 
56175cabbc6bSPrashanth Sreenivasa /* ARGSUSED */
56185cabbc6bSPrashanth Sreenivasa static void
56195cabbc6bSPrashanth Sreenivasa metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
56205cabbc6bSPrashanth Sreenivasa     uint64_t size, void *arg)
56215cabbc6bSPrashanth Sreenivasa {
56225cabbc6bSPrashanth Sreenivasa 	metaslab_claim_cb_arg_t *mcca_arg = arg;
56235cabbc6bSPrashanth Sreenivasa 
56245cabbc6bSPrashanth Sreenivasa 	if (mcca_arg->mcca_error == 0) {
56255cabbc6bSPrashanth Sreenivasa 		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
56265cabbc6bSPrashanth Sreenivasa 		    size, mcca_arg->mcca_txg);
56275cabbc6bSPrashanth Sreenivasa 	}
56285cabbc6bSPrashanth Sreenivasa }
56295cabbc6bSPrashanth Sreenivasa 
56305cabbc6bSPrashanth Sreenivasa int
56315cabbc6bSPrashanth Sreenivasa metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
56325cabbc6bSPrashanth Sreenivasa {
56335cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_ops->vdev_op_remap != NULL) {
56345cabbc6bSPrashanth Sreenivasa 		metaslab_claim_cb_arg_t arg;
56355cabbc6bSPrashanth Sreenivasa 
56365cabbc6bSPrashanth Sreenivasa 		/*
5637bbf21555SRichard Lowe 		 * Only zdb(8) can claim on indirect vdevs.  This is used
56385cabbc6bSPrashanth Sreenivasa 		 * to detect leaks of mapped space (that are not accounted
56395cabbc6bSPrashanth Sreenivasa 		 * for in the obsolete counts, spacemap, or bpobj).
56405cabbc6bSPrashanth Sreenivasa 		 */
56415cabbc6bSPrashanth Sreenivasa 		ASSERT(!spa_writeable(vd->vdev_spa));
56425cabbc6bSPrashanth Sreenivasa 		arg.mcca_error = 0;
56435cabbc6bSPrashanth Sreenivasa 		arg.mcca_txg = txg;
56445cabbc6bSPrashanth Sreenivasa 
56455cabbc6bSPrashanth Sreenivasa 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
56465cabbc6bSPrashanth Sreenivasa 		    metaslab_claim_impl_cb, &arg);
56475cabbc6bSPrashanth Sreenivasa 
56485cabbc6bSPrashanth Sreenivasa 		if (arg.mcca_error == 0) {
56495cabbc6bSPrashanth Sreenivasa 			arg.mcca_error = metaslab_claim_concrete(vd,
56505cabbc6bSPrashanth Sreenivasa 			    offset, size, txg);
56515cabbc6bSPrashanth Sreenivasa 		}
56525cabbc6bSPrashanth Sreenivasa 		return (arg.mcca_error);
56535cabbc6bSPrashanth Sreenivasa 	} else {
56545cabbc6bSPrashanth Sreenivasa 		return (metaslab_claim_concrete(vd, offset, size, txg));
56555cabbc6bSPrashanth Sreenivasa 	}
56565cabbc6bSPrashanth Sreenivasa }
56575cabbc6bSPrashanth Sreenivasa 
56585cabbc6bSPrashanth Sreenivasa /*
56595cabbc6bSPrashanth Sreenivasa  * Intent log support: upon opening the pool after a crash, notify the SPA
56605cabbc6bSPrashanth Sreenivasa  * of blocks that the intent log has allocated for immediate write, but
56615cabbc6bSPrashanth Sreenivasa  * which are still considered free by the SPA because the last transaction
56625cabbc6bSPrashanth Sreenivasa  * group didn't commit yet.
56635cabbc6bSPrashanth Sreenivasa  */
56645cabbc6bSPrashanth Sreenivasa static int
56655cabbc6bSPrashanth Sreenivasa metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
56665cabbc6bSPrashanth Sreenivasa {
56675cabbc6bSPrashanth Sreenivasa 	uint64_t vdev = DVA_GET_VDEV(dva);
56685cabbc6bSPrashanth Sreenivasa 	uint64_t offset = DVA_GET_OFFSET(dva);
56695cabbc6bSPrashanth Sreenivasa 	uint64_t size = DVA_GET_ASIZE(dva);
56705cabbc6bSPrashanth Sreenivasa 	vdev_t *vd;
56715cabbc6bSPrashanth Sreenivasa 
56725cabbc6bSPrashanth Sreenivasa 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
56735cabbc6bSPrashanth Sreenivasa 		return (SET_ERROR(ENXIO));
56745cabbc6bSPrashanth Sreenivasa 	}
56755cabbc6bSPrashanth Sreenivasa 
56765cabbc6bSPrashanth Sreenivasa 	ASSERT(DVA_IS_VALID(dva));
56775cabbc6bSPrashanth Sreenivasa 
56785cabbc6bSPrashanth Sreenivasa 	if (DVA_GET_GANG(dva))
56795cabbc6bSPrashanth Sreenivasa 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
56805cabbc6bSPrashanth Sreenivasa 
56815cabbc6bSPrashanth Sreenivasa 	return (metaslab_claim_impl(vd, offset, size, txg));
56825cabbc6bSPrashanth Sreenivasa }
56835cabbc6bSPrashanth Sreenivasa 
5684d80c45e0Sbonwick int
56858654d025Sperrin metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
56868363e80aSGeorge Wilson     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
5687f78cdc34SPaul Dagnelie     zio_alloc_list_t *zal, zio_t *zio, int allocator)
5688d80c45e0Sbonwick {
5689d80c45e0Sbonwick 	dva_t *dva = bp->blk_dva;
5690555d674dSSerapheim Dimitropoulos 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
5691d80c45e0Sbonwick 	int error = 0;
5692d80c45e0Sbonwick 
5693e14bb325SJeff Bonwick 	ASSERT(bp->blk_birth == 0);
5694b24ab676SJeff Bonwick 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
5695e14bb325SJeff Bonwick 
5696e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5697e14bb325SJeff Bonwick 
5698e14bb325SJeff Bonwick 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
5699e14bb325SJeff Bonwick 		spa_config_exit(spa, SCL_ALLOC, FTAG);
5700be6fd75aSMatthew Ahrens 		return (SET_ERROR(ENOSPC));
5701e14bb325SJeff Bonwick 	}
57028654d025Sperrin 
5703d80c45e0Sbonwick 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
5704d80c45e0Sbonwick 	ASSERT(BP_GET_NDVAS(bp) == 0);
5705d80c45e0Sbonwick 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
57068363e80aSGeorge Wilson 	ASSERT3P(zal, !=, NULL);
5707d80c45e0Sbonwick 
5708e14bb325SJeff Bonwick 	for (int d = 0; d < ndvas; d++) {
57098654d025Sperrin 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
5710f78cdc34SPaul Dagnelie 		    txg, flags, zal, allocator);
57110713e232SGeorge Wilson 		if (error != 0) {
5712d80c45e0Sbonwick 			for (d--; d >= 0; d--) {
57135cabbc6bSPrashanth Sreenivasa 				metaslab_unalloc_dva(spa, &dva[d], txg);
57140f7643c7SGeorge Wilson 				metaslab_group_alloc_decrement(spa,
5715f78cdc34SPaul Dagnelie 				    DVA_GET_VDEV(&dva[d]), zio, flags,
5716f78cdc34SPaul Dagnelie 				    allocator, B_FALSE);
5717d80c45e0Sbonwick 				bzero(&dva[d], sizeof (dva_t));
5718d80c45e0Sbonwick 			}
5719e14bb325SJeff Bonwick 			spa_config_exit(spa, SCL_ALLOC, FTAG);
5720d80c45e0Sbonwick 			return (error);
57210f7643c7SGeorge Wilson 		} else {
57220f7643c7SGeorge Wilson 			/*
57230f7643c7SGeorge Wilson 			 * Update the metaslab group's queue depth
57240f7643c7SGeorge Wilson 			 * based on the newly allocated dva.
57250f7643c7SGeorge Wilson 			 */
57260f7643c7SGeorge Wilson 			metaslab_group_alloc_increment(spa,
5727f78cdc34SPaul Dagnelie 			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
5728d80c45e0Sbonwick 		}
57290f7643c7SGeorge Wilson 
5730d80c45e0Sbonwick 	}
5731d80c45e0Sbonwick 	ASSERT(error == 0);
5732d80c45e0Sbonwick 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
5733d80c45e0Sbonwick 
5734e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALLOC, FTAG);
5735e14bb325SJeff Bonwick 
5736b24ab676SJeff Bonwick 	BP_SET_BIRTH(bp, txg, txg);
5737e14bb325SJeff Bonwick 
5738d80c45e0Sbonwick 	return (0);
5739d80c45e0Sbonwick }
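
/*
 * Illustrative sketch, not upstream code: metaslab_alloc() above is the
 * entry point the write path uses to fill in all DVAs of a block pointer
 * in one call; on failure it has already rolled back any DVAs it managed
 * to place.  A hypothetical caller (psize, copies, txg and zio are assumed
 * to be in scope) would look roughly like this:
 *
 *	zio_alloc_list_t zal;
 *	metaslab_trace_init(&zal);
 *	int error = metaslab_alloc(spa, spa_normal_class(spa), psize, bp,
 *	    copies, txg, NULL, 0, &zal, zio, zio->io_allocator);
 *	metaslab_trace_fini(&zal);
 *	(on success, bp now carries `copies' DVAs and its birth txg is set)
 */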
5740d80c45e0Sbonwick 
5741d80c45e0Sbonwick void
5742d80c45e0Sbonwick metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
5743d80c45e0Sbonwick {
5744d80c45e0Sbonwick 	const dva_t *dva = bp->blk_dva;
5745d80c45e0Sbonwick 	int ndvas = BP_GET_NDVAS(bp);
5746d80c45e0Sbonwick 
5747d80c45e0Sbonwick 	ASSERT(!BP_IS_HOLE(bp));
5748b24ab676SJeff Bonwick 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
5749e14bb325SJeff Bonwick 
575086714001SSerapheim Dimitropoulos 	/*
575186714001SSerapheim Dimitropoulos 	 * If we have a checkpoint for the pool we need to make sure that
575286714001SSerapheim Dimitropoulos 	 * the blocks that we free that are part of the checkpoint won't be
575386714001SSerapheim Dimitropoulos 	 * reused until the checkpoint is discarded or we revert to it.
575486714001SSerapheim Dimitropoulos 	 *
575586714001SSerapheim Dimitropoulos 	 * The checkpoint flag is passed down the metaslab_free code path
575686714001SSerapheim Dimitropoulos 	 * and is set whenever we want to add a block to the checkpoint's
575786714001SSerapheim Dimitropoulos 	 * accounting. That is, we "checkpoint" blocks that existed at the
575886714001SSerapheim Dimitropoulos 	 * time the checkpoint was created and are therefore referenced by
575986714001SSerapheim Dimitropoulos 	 * the checkpointed uberblock.
576086714001SSerapheim Dimitropoulos 	 *
576186714001SSerapheim Dimitropoulos 	 * Note that we don't checkpoint any blocks if the current
576286714001SSerapheim Dimitropoulos 	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
576386714001SSerapheim Dimitropoulos 	 * normally as they will be referenced by the checkpointed uberblock.
576486714001SSerapheim Dimitropoulos 	 */
576586714001SSerapheim Dimitropoulos 	boolean_t checkpoint = B_FALSE;
576686714001SSerapheim Dimitropoulos 	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
576786714001SSerapheim Dimitropoulos 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
576886714001SSerapheim Dimitropoulos 		/*
576986714001SSerapheim Dimitropoulos 		 * At this point, if the block is part of the checkpoint
577086714001SSerapheim Dimitropoulos 		 * there is no way it was created in the current txg.
577186714001SSerapheim Dimitropoulos 		 */
577286714001SSerapheim Dimitropoulos 		ASSERT(!now);
577386714001SSerapheim Dimitropoulos 		ASSERT3U(spa_syncing_txg(spa), ==, txg);
577486714001SSerapheim Dimitropoulos 		checkpoint = B_TRUE;
577586714001SSerapheim Dimitropoulos 	}
577686714001SSerapheim Dimitropoulos 
5777e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
5778d80c45e0Sbonwick 
57795cabbc6bSPrashanth Sreenivasa 	for (int d = 0; d < ndvas; d++) {
57805cabbc6bSPrashanth Sreenivasa 		if (now) {
57815cabbc6bSPrashanth Sreenivasa 			metaslab_unalloc_dva(spa, &dva[d], txg);
57825cabbc6bSPrashanth Sreenivasa 		} else {
578386714001SSerapheim Dimitropoulos 			ASSERT3U(txg, ==, spa_syncing_txg(spa));
578486714001SSerapheim Dimitropoulos 			metaslab_free_dva(spa, &dva[d], checkpoint);
57855cabbc6bSPrashanth Sreenivasa 		}
57865cabbc6bSPrashanth Sreenivasa 	}
5787e14bb325SJeff Bonwick 
5788e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_FREE, FTAG);
5789d80c45e0Sbonwick }
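
/*
 * The checkpoint decision in metaslab_free() above can be read as the
 * following predicate (a hypothetical helper shown only for clarity; it
 * does not exist in this file):
 *
 *	static boolean_t
 *	frees_go_to_checkpoint(spa_t *spa, const blkptr_t *bp)
 *	{
 *		return (bp->blk_birth <= spa->spa_checkpoint_txg &&
 *		    spa_syncing_txg(spa) > spa->spa_checkpoint_txg);
 *	}
 */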
5790d80c45e0Sbonwick 
5791d80c45e0Sbonwick int
5792d80c45e0Sbonwick metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
5793d80c45e0Sbonwick {
5794d80c45e0Sbonwick 	const dva_t *dva = bp->blk_dva;
5795d80c45e0Sbonwick 	int ndvas = BP_GET_NDVAS(bp);
5796e14bb325SJeff Bonwick 	int error = 0;
5797d80c45e0Sbonwick 
5798d80c45e0Sbonwick 	ASSERT(!BP_IS_HOLE(bp));
5799d80c45e0Sbonwick 
5800e14bb325SJeff Bonwick 	if (txg != 0) {
5801e14bb325SJeff Bonwick 		/*
5802e14bb325SJeff Bonwick 		 * First do a dry run to make sure all DVAs are claimable,
5803e14bb325SJeff Bonwick 		 * so we don't have to unwind from partial failures below.
5804e14bb325SJeff Bonwick 		 */
5805e14bb325SJeff Bonwick 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
5806e14bb325SJeff Bonwick 			return (error);
5807e14bb325SJeff Bonwick 	}
5808e14bb325SJeff Bonwick 
5809e14bb325SJeff Bonwick 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5810e14bb325SJeff Bonwick 
5811663207adSDon Brady 	for (int d = 0; d < ndvas; d++) {
5812663207adSDon Brady 		error = metaslab_claim_dva(spa, &dva[d], txg);
5813663207adSDon Brady 		if (error != 0)
5814e14bb325SJeff Bonwick 			break;
5815663207adSDon Brady 	}
5816e14bb325SJeff Bonwick 
5817e14bb325SJeff Bonwick 	spa_config_exit(spa, SCL_ALLOC, FTAG);
5818e14bb325SJeff Bonwick 
5819e14bb325SJeff Bonwick 	ASSERT(error == 0 || txg == 0);
5820d80c45e0Sbonwick 
5821e14bb325SJeff Bonwick 	return (error);
5822d80c45e0Sbonwick }
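
/*
 * Illustrative sketch, not upstream code: the intent-log replay path is
 * the expected consumer of metaslab_claim().  Because a non-zero txg
 * already performs its own dry run (see above), a caller only needs
 * something like:
 *
 *	error = metaslab_claim(spa, bp, spa_first_txg(spa));
 *
 * In the real code this call is reached through the ZIL's claim zio
 * rather than invoked directly.
 */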
58233b2aab18SMatthew Ahrens 
58245cabbc6bSPrashanth Sreenivasa /* ARGSUSED */
58255cabbc6bSPrashanth Sreenivasa static void
58265cabbc6bSPrashanth Sreenivasa metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
58275cabbc6bSPrashanth Sreenivasa     uint64_t size, void *arg)
58285cabbc6bSPrashanth Sreenivasa {
58295cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_ops == &vdev_indirect_ops)
58305cabbc6bSPrashanth Sreenivasa 		return;
58315cabbc6bSPrashanth Sreenivasa 
58325cabbc6bSPrashanth Sreenivasa 	metaslab_check_free_impl(vd, offset, size);
58335cabbc6bSPrashanth Sreenivasa }
58345cabbc6bSPrashanth Sreenivasa 
58355cabbc6bSPrashanth Sreenivasa static void
58365cabbc6bSPrashanth Sreenivasa metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
58375cabbc6bSPrashanth Sreenivasa {
58385cabbc6bSPrashanth Sreenivasa 	metaslab_t *msp;
58395cabbc6bSPrashanth Sreenivasa 	spa_t *spa = vd->vdev_spa;
58405cabbc6bSPrashanth Sreenivasa 
58415cabbc6bSPrashanth Sreenivasa 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
58425cabbc6bSPrashanth Sreenivasa 		return;
58435cabbc6bSPrashanth Sreenivasa 
58445cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_ops->vdev_op_remap != NULL) {
58455cabbc6bSPrashanth Sreenivasa 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
58465cabbc6bSPrashanth Sreenivasa 		    metaslab_check_free_impl_cb, NULL);
58475cabbc6bSPrashanth Sreenivasa 		return;
58485cabbc6bSPrashanth Sreenivasa 	}
58495cabbc6bSPrashanth Sreenivasa 
58505cabbc6bSPrashanth Sreenivasa 	ASSERT(vdev_is_concrete(vd));
58515cabbc6bSPrashanth Sreenivasa 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
58525cabbc6bSPrashanth Sreenivasa 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
58535cabbc6bSPrashanth Sreenivasa 
58545cabbc6bSPrashanth Sreenivasa 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
58555cabbc6bSPrashanth Sreenivasa 
58565cabbc6bSPrashanth Sreenivasa 	mutex_enter(&msp->ms_lock);
5857555d674dSSerapheim Dimitropoulos 	if (msp->ms_loaded) {
5858555d674dSSerapheim Dimitropoulos 		range_tree_verify_not_present(msp->ms_allocatable,
5859555d674dSSerapheim Dimitropoulos 		    offset, size);
5860555d674dSSerapheim Dimitropoulos 	}
58615cabbc6bSPrashanth Sreenivasa 
5862814dcd43SSerapheim Dimitropoulos 	/*
5863814dcd43SSerapheim Dimitropoulos 	 * Check all segments that currently exist in the freeing pipeline.
5864814dcd43SSerapheim Dimitropoulos 	 *
5865814dcd43SSerapheim Dimitropoulos 	 * It would intuitively make sense to also check the current allocating
5866814dcd43SSerapheim Dimitropoulos 	 * tree since metaslab_unalloc_dva() exists for extents that are
5867814dcd43SSerapheim Dimitropoulos 	 * allocated and freed in the same sync pass within the same txg.
5868814dcd43SSerapheim Dimitropoulos 	 * Unfortunately there are places (e.g. the ZIL) where we allocate a
5869814dcd43SSerapheim Dimitropoulos 	 * segment but then we free part of it within the same txg
5870814dcd43SSerapheim Dimitropoulos 	 * [see zil_sync()]. Thus, we don't call range_tree_verify_not_present()
5871814dcd43SSerapheim Dimitropoulos 	 * on the current allocating tree.
5872814dcd43SSerapheim Dimitropoulos 	 */
5873555d674dSSerapheim Dimitropoulos 	range_tree_verify_not_present(msp->ms_freeing, offset, size);
5874555d674dSSerapheim Dimitropoulos 	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
5875555d674dSSerapheim Dimitropoulos 	range_tree_verify_not_present(msp->ms_freed, offset, size);
58765cabbc6bSPrashanth Sreenivasa 	for (int j = 0; j < TXG_DEFER_SIZE; j++)
5877555d674dSSerapheim Dimitropoulos 		range_tree_verify_not_present(msp->ms_defer[j], offset, size);
5878814dcd43SSerapheim Dimitropoulos 	range_tree_verify_not_present(msp->ms_trim, offset, size);
58795cabbc6bSPrashanth Sreenivasa 	mutex_exit(&msp->ms_lock);
58805cabbc6bSPrashanth Sreenivasa }
58815cabbc6bSPrashanth Sreenivasa 
58823b2aab18SMatthew Ahrens void
58833b2aab18SMatthew Ahrens metaslab_check_free(spa_t *spa, const blkptr_t *bp)
58843b2aab18SMatthew Ahrens {
58853b2aab18SMatthew Ahrens 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
58863b2aab18SMatthew Ahrens 		return;
58873b2aab18SMatthew Ahrens 
58883b2aab18SMatthew Ahrens 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
58893b2aab18SMatthew Ahrens 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
58900713e232SGeorge Wilson 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
58910713e232SGeorge Wilson 		vdev_t *vd = vdev_lookup_top(spa, vdev);
58920713e232SGeorge Wilson 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
58933b2aab18SMatthew Ahrens 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
58943b2aab18SMatthew Ahrens 
58955cabbc6bSPrashanth Sreenivasa 		if (DVA_GET_GANG(&bp->blk_dva[i]))
58965cabbc6bSPrashanth Sreenivasa 			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
58975cabbc6bSPrashanth Sreenivasa 
58985cabbc6bSPrashanth Sreenivasa 		ASSERT3P(vd, !=, NULL);
58993b2aab18SMatthew Ahrens 
59005cabbc6bSPrashanth Sreenivasa 		metaslab_check_free_impl(vd, offset, size);
59013b2aab18SMatthew Ahrens 	}
59023b2aab18SMatthew Ahrens 	spa_config_exit(spa, SCL_VDEV, FTAG);
59033b2aab18SMatthew Ahrens }
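
/*
 * Note on usage (an observation, not upstream text): the verifications in
 * metaslab_check_free() and metaslab_check_free_impl() only run when the
 * ZFS_DEBUG_ZIO_FREE bit is set in the global zfs_flags tunable; with the
 * bit clear both functions return immediately, so the checks cost nothing
 * in normal operation.
 */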
5904084fd14fSBrian Behlendorf 
5905084fd14fSBrian Behlendorf static void
5906084fd14fSBrian Behlendorf metaslab_group_disable_wait(metaslab_group_t *mg)
5907084fd14fSBrian Behlendorf {
5908084fd14fSBrian Behlendorf 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5909084fd14fSBrian Behlendorf 	while (mg->mg_disabled_updating) {
5910084fd14fSBrian Behlendorf 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5911084fd14fSBrian Behlendorf 	}
5912084fd14fSBrian Behlendorf }
5913084fd14fSBrian Behlendorf 
5914084fd14fSBrian Behlendorf static void
5915084fd14fSBrian Behlendorf metaslab_group_disabled_increment(metaslab_group_t *mg)
5916084fd14fSBrian Behlendorf {
5917084fd14fSBrian Behlendorf 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5918084fd14fSBrian Behlendorf 	ASSERT(mg->mg_disabled_updating);
5919084fd14fSBrian Behlendorf 
5920084fd14fSBrian Behlendorf 	while (mg->mg_ms_disabled >= max_disabled_ms) {
5921084fd14fSBrian Behlendorf 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5922084fd14fSBrian Behlendorf 	}
5923084fd14fSBrian Behlendorf 	mg->mg_ms_disabled++;
5924084fd14fSBrian Behlendorf 	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
5925084fd14fSBrian Behlendorf }
5926084fd14fSBrian Behlendorf 
5927084fd14fSBrian Behlendorf /*
5928084fd14fSBrian Behlendorf  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
5929084fd14fSBrian Behlendorf  * We must also track how many metaslabs are currently disabled within a
5930084fd14fSBrian Behlendorf  * metaslab group and limit them to prevent allocation failures from
5931084fd14fSBrian Behlendorf  * occurring because all metaslabs are disabled.
5932084fd14fSBrian Behlendorf  */
5933084fd14fSBrian Behlendorf void
5934084fd14fSBrian Behlendorf metaslab_disable(metaslab_t *msp)
5935084fd14fSBrian Behlendorf {
5936084fd14fSBrian Behlendorf 	ASSERT(!MUTEX_HELD(&msp->ms_lock));
5937084fd14fSBrian Behlendorf 	metaslab_group_t *mg = msp->ms_group;
5938084fd14fSBrian Behlendorf 
5939084fd14fSBrian Behlendorf 	mutex_enter(&mg->mg_ms_disabled_lock);
5940084fd14fSBrian Behlendorf 
5941084fd14fSBrian Behlendorf 	/*
5942084fd14fSBrian Behlendorf 	 * To keep an accurate count of how many threads have disabled
5943084fd14fSBrian Behlendorf 	 * a specific metaslab group, we only allow one thread to mark
5944084fd14fSBrian Behlendorf 	 * the metaslab group at a time. This ensures that the value of
5945084fd14fSBrian Behlendorf 	 * ms_disabled will be accurate when we decide to mark a metaslab
5946084fd14fSBrian Behlendorf 	 * group as disabled. To do this we force all other threads
5947084fd14fSBrian Behlendorf 	 * to wait till the metaslab's mg_disabled_updating flag is no
5948084fd14fSBrian Behlendorf 	 * to wait until the metaslab group's mg_disabled_updating flag is no
5949084fd14fSBrian Behlendorf 	 */
5950084fd14fSBrian Behlendorf 	metaslab_group_disable_wait(mg);
5951084fd14fSBrian Behlendorf 	mg->mg_disabled_updating = B_TRUE;
5952084fd14fSBrian Behlendorf 	if (msp->ms_disabled == 0) {
5953084fd14fSBrian Behlendorf 		metaslab_group_disabled_increment(mg);
5954084fd14fSBrian Behlendorf 	}
5955084fd14fSBrian Behlendorf 	mutex_enter(&msp->ms_lock);
5956084fd14fSBrian Behlendorf 	msp->ms_disabled++;
5957084fd14fSBrian Behlendorf 	mutex_exit(&msp->ms_lock);
5958084fd14fSBrian Behlendorf 
5959084fd14fSBrian Behlendorf 	mg->mg_disabled_updating = B_FALSE;
5960084fd14fSBrian Behlendorf 	cv_broadcast(&mg->mg_ms_disabled_cv);
5961084fd14fSBrian Behlendorf 	mutex_exit(&mg->mg_ms_disabled_lock);
5962084fd14fSBrian Behlendorf }
5963084fd14fSBrian Behlendorf 
5964084fd14fSBrian Behlendorf void
5965af1d63abSPaul Dagnelie metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
5966084fd14fSBrian Behlendorf {
5967084fd14fSBrian Behlendorf 	metaslab_group_t *mg = msp->ms_group;
5968084fd14fSBrian Behlendorf 	spa_t *spa = mg->mg_vd->vdev_spa;
5969084fd14fSBrian Behlendorf 
5970084fd14fSBrian Behlendorf 	/*
5971084fd14fSBrian Behlendorf 	 * Wait for the outstanding I/O to be synced to prevent newly
5972084fd14fSBrian Behlendorf 	 * allocated blocks from being overwritten.  This is used by
5973084fd14fSBrian Behlendorf 	 * initialize and TRIM, which are modifying unallocated space.
5974084fd14fSBrian Behlendorf 	 */
5975084fd14fSBrian Behlendorf 	if (sync)
5976084fd14fSBrian Behlendorf 		txg_wait_synced(spa_get_dsl(spa), 0);
5977084fd14fSBrian Behlendorf 
5978084fd14fSBrian Behlendorf 	mutex_enter(&mg->mg_ms_disabled_lock);
5979084fd14fSBrian Behlendorf 	mutex_enter(&msp->ms_lock);
5980084fd14fSBrian Behlendorf 	if (--msp->ms_disabled == 0) {
5981084fd14fSBrian Behlendorf 		mg->mg_ms_disabled--;
5982084fd14fSBrian Behlendorf 		cv_broadcast(&mg->mg_ms_disabled_cv);
5983af1d63abSPaul Dagnelie 		if (unload)
5984af1d63abSPaul Dagnelie 			metaslab_unload(msp);
5985084fd14fSBrian Behlendorf 	}
5986084fd14fSBrian Behlendorf 	mutex_exit(&msp->ms_lock);
5987084fd14fSBrian Behlendorf 	mutex_exit(&mg->mg_ms_disabled_lock);
5988084fd14fSBrian Behlendorf }
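
/*
 * Illustrative sketch, not upstream code: metaslab_disable() and
 * metaslab_enable() bracket operations such as TRIM and initialize that
 * write into space the allocator considers free.  The expected pattern
 * for such a thread is roughly:
 *
 *	metaslab_disable(msp);
 *	(issue TRIM/initialize I/O against extents taken from
 *	    msp->ms_allocatable)
 *	metaslab_enable(msp, B_TRUE, B_FALSE);
 *
 * where sync == B_TRUE waits for the issued I/O to be synced before
 * allocations are allowed again, and unload == B_FALSE keeps the
 * metaslab loaded.
 */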
5989814dcd43SSerapheim Dimitropoulos 
5990814dcd43SSerapheim Dimitropoulos static void
5991814dcd43SSerapheim Dimitropoulos metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
5992814dcd43SSerapheim Dimitropoulos {
5993814dcd43SSerapheim Dimitropoulos 	vdev_t *vd = ms->ms_group->mg_vd;
5994814dcd43SSerapheim Dimitropoulos 	spa_t *spa = vd->vdev_spa;
5995814dcd43SSerapheim Dimitropoulos 	objset_t *mos = spa_meta_objset(spa);
5996814dcd43SSerapheim Dimitropoulos 
5997814dcd43SSerapheim Dimitropoulos 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
5998814dcd43SSerapheim Dimitropoulos 
5999814dcd43SSerapheim Dimitropoulos 	metaslab_unflushed_phys_t entry = {
6000814dcd43SSerapheim Dimitropoulos 		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
6001814dcd43SSerapheim Dimitropoulos 	};
6002814dcd43SSerapheim Dimitropoulos 	uint64_t entry_size = sizeof (entry);
6003814dcd43SSerapheim Dimitropoulos 	uint64_t entry_offset = ms->ms_id * entry_size;
6004814dcd43SSerapheim Dimitropoulos 
6005814dcd43SSerapheim Dimitropoulos 	uint64_t object = 0;
6006814dcd43SSerapheim Dimitropoulos 	int err = zap_lookup(mos, vd->vdev_top_zap,
6007814dcd43SSerapheim Dimitropoulos 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6008814dcd43SSerapheim Dimitropoulos 	    &object);
6009814dcd43SSerapheim Dimitropoulos 	if (err == ENOENT) {
6010814dcd43SSerapheim Dimitropoulos 		object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
6011814dcd43SSerapheim Dimitropoulos 		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
6012814dcd43SSerapheim Dimitropoulos 		VERIFY0(zap_add(mos, vd->vdev_top_zap,
6013814dcd43SSerapheim Dimitropoulos 		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6014814dcd43SSerapheim Dimitropoulos 		    &object, tx));
6015814dcd43SSerapheim Dimitropoulos 	} else {
6016814dcd43SSerapheim Dimitropoulos 		VERIFY0(err);
6017814dcd43SSerapheim Dimitropoulos 	}
6018814dcd43SSerapheim Dimitropoulos 
6019814dcd43SSerapheim Dimitropoulos 	dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
6020814dcd43SSerapheim Dimitropoulos 	    &entry, tx);
6021814dcd43SSerapheim Dimitropoulos }
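
/*
 * Illustrative sketch, not upstream code: the object written above holds
 * one metaslab_unflushed_phys_t per metaslab, indexed by ms_id, so a
 * reader (e.g. at import time) could recover a metaslab's unflushed txg
 * roughly as follows (error handling elided; mos, vd and ms are assumed
 * to be in scope):
 *
 *	uint64_t object;
 *	metaslab_unflushed_phys_t entry;
 *	VERIFY0(zap_lookup(mos, vd->vdev_top_zap,
 *	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 *	    &object));
 *	VERIFY0(dmu_read(mos, object, ms->ms_id * sizeof (entry),
 *	    sizeof (entry), &entry, DMU_READ_NO_PREFETCH));
 *	uint64_t unflushed_txg = entry.msp_unflushed_txg;
 */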
6022814dcd43SSerapheim Dimitropoulos 
6023814dcd43SSerapheim Dimitropoulos void
6024814dcd43SSerapheim Dimitropoulos metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
6025814dcd43SSerapheim Dimitropoulos {
6026814dcd43SSerapheim Dimitropoulos 	spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
6027814dcd43SSerapheim Dimitropoulos 
6028814dcd43SSerapheim Dimitropoulos 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
6029814dcd43SSerapheim Dimitropoulos 		return;
6030814dcd43SSerapheim Dimitropoulos 
6031814dcd43SSerapheim Dimitropoulos 	ms->ms_unflushed_txg = txg;
6032814dcd43SSerapheim Dimitropoulos 	metaslab_update_ondisk_flush_data(ms, tx);
6033814dcd43SSerapheim Dimitropoulos }
6034814dcd43SSerapheim Dimitropoulos 
6035814dcd43SSerapheim Dimitropoulos uint64_t
6036814dcd43SSerapheim Dimitropoulos metaslab_unflushed_txg(metaslab_t *ms)
6037814dcd43SSerapheim Dimitropoulos {
6038814dcd43SSerapheim Dimitropoulos 	return (ms->ms_unflushed_txg);
6039814dcd43SSerapheim Dimitropoulos }
6040