1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright (c) 2017, Intel Corporation.
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/dmu.h>
31#include <sys/dmu_tx.h>
32#include <sys/space_map.h>
33#include <sys/metaslab_impl.h>
34#include <sys/vdev_impl.h>
35#include <sys/zio.h>
36#include <sys/spa_impl.h>
37#include <sys/zfeature.h>
38#include <sys/vdev_indirect_mapping.h>
39#include <sys/zap.h>
40
41#define	GANG_ALLOCATION(flags) \
42	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
43
44uint64_t metaslab_aliquot = 512ULL << 10;
45uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
46
47/*
48 * In pools where the log space map feature is not enabled we touch
49 * multiple metaslabs (and their respective space maps) with each
50 * transaction group. Thus, we benefit from having a small space map
51 * block size since it allows us to issue more I/O operations scattered
52 * around the disk. So a sane default for the space map block size
53 * is 8~16K.
54 */
55int zfs_metaslab_sm_blksz_no_log = (1 << 14);
56
57/*
58 * When the log space map feature is enabled, we accumulate a lot of
59 * changes per metaslab that are flushed once in a while so we benefit
60 * from a bigger block size like 128K for the metaslab space maps.
61 */
62int zfs_metaslab_sm_blksz_with_log = (1 << 17);
63
64/*
65 * The in-core space map representation is more compact than its on-disk form.
66 * The zfs_condense_pct determines how much more compact the in-core
67 * space map representation must be before we compact it on-disk.
68 * Values should be greater than or equal to 100.
69 */
70int zfs_condense_pct = 200;
71
72/*
73 * Condensing a metaslab is not guaranteed to actually reduce the amount of
74 * space used on disk. In particular, a space map uses data in increments of
75 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
76 * same number of blocks after condensing. Since the goal of condensing is to
77 * reduce the number of IOPs required to read the space map, we only want to
78 * condense when we can be sure we will reduce the number of blocks used by the
79 * space map. Unfortunately, we cannot precisely compute whether or not this is
80 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
81 * we apply the following heuristic: do not condense a spacemap unless the
82 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
83 * blocks.
84 */
85int zfs_metaslab_condense_block_threshold = 4;
86
87/*
88 * The zfs_mg_noalloc_threshold defines which metaslab groups should
89 * be eligible for allocation. The value is defined as a percentage of
90 * free space. Metaslab groups that have more free space than
91 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
92 * a metaslab group's free space is less than or equal to the
93 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
94 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
95 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
96 * groups are allowed to accept allocations. Gang blocks are always
97 * eligible to allocate on any metaslab group. The default value of 0 means
98 * no metaslab group will be excluded based on this criterion.
99 */
100int zfs_mg_noalloc_threshold = 0;
101
102/*
103 * Metaslab groups are considered eligible for allocations if their
104 * fragmenation metric (measured as a percentage) is less than or
105 * equal to zfs_mg_fragmentation_threshold. If a metaslab group
106 * exceeds this threshold then it will be skipped unless all metaslab
107 * groups within the metaslab class have also crossed this threshold.
108 *
109 * This tunable was introduced to avoid edge cases where we continue
110 * allocating from very fragmented disks in our pool while other, less
111 * fragmented disks, exists. On the other hand, if all disks in the
112 * pool are uniformly approaching the threshold, the threshold can
113 * be a speed bump in performance, where we keep switching the disks
114 * that we allocate from (e.g. we allocate some segments from disk A
115 * making it bypassing the threshold while freeing segments from disk
116 * B getting its fragmentation below the threshold).
117 *
118 * Empirically, we've seen that our vdev selection for allocations is
119 * good enough that fragmentation increases uniformly across all vdevs
120 * the majority of the time. Thus we set the threshold percentage high
121 * enough to avoid hitting the speed bump on pools that are being pushed
122 * to the edge.
123 */
124int zfs_mg_fragmentation_threshold = 95;
125
126/*
127 * Allow metaslabs to keep their active state as long as their fragmentation
128 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
129 * active metaslab that exceeds this threshold will no longer keep its active
130 * status allowing better metaslabs to be selected.
131 */
132int zfs_metaslab_fragmentation_threshold = 70;
133
134/*
135 * When set will load all metaslabs when pool is first opened.
136 */
137int metaslab_debug_load = 0;
138
139/*
140 * When set will prevent metaslabs from being unloaded.
141 */
142int metaslab_debug_unload = 0;
143
144/*
145 * Minimum size which forces the dynamic allocator to change
146 * it's allocation strategy.  Once the space map cannot satisfy
147 * an allocation of this size then it switches to using more
148 * aggressive strategy (i.e search by size rather than offset).
149 */
150uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
151
152/*
153 * The minimum free space, in percent, which must be available
154 * in a space map to continue allocations in a first-fit fashion.
155 * Once the space map's free space drops below this level we dynamically
156 * switch to using best-fit allocations.
157 */
158int metaslab_df_free_pct = 4;
159
160/*
161 * Maximum distance to search forward from the last offset. Without this
162 * limit, fragmented pools can see >100,000 iterations and
163 * metaslab_block_picker() becomes the performance limiting factor on
164 * high-performance storage.
165 *
166 * With the default setting of 16MB, we typically see less than 500
167 * iterations, even with very fragmented, ashift=9 pools. The maximum number
168 * of iterations possible is:
169 *     metaslab_df_max_search / (2 * (1<<ashift))
170 * With the default setting of 16MB this is 16*1024 (with ashift=9) or
171 * 2048 (with ashift=12).
172 */
173int metaslab_df_max_search = 16 * 1024 * 1024;
174
175/*
176 * If we are not searching forward (due to metaslab_df_max_search,
177 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
178 * controls what segment is used.  If it is set, we will use the largest free
179 * segment.  If it is not set, we will use a segment of exactly the requested
180 * size (or larger).
181 */
182int metaslab_df_use_largest_segment = B_FALSE;
183
184/*
185 * A metaslab is considered "free" if it contains a contiguous
186 * segment which is greater than metaslab_min_alloc_size.
187 */
188uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
189
190/*
191 * Percentage of all cpus that can be used by the metaslab taskq.
192 */
193int metaslab_load_pct = 50;
194
195/*
196 * These tunables control how long a metaslab will remain loaded after the
197 * last allocation from it.  A metaslab can't be unloaded until at least
198 * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
199 * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
200 * unloaded sooner.  These settings are intended to be generous -- to keep
201 * metaslabs loaded for a long time, reducing the rate of metaslab loading.
202 */
203int metaslab_unload_delay = 32;
204int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
205
206/*
207 * Max number of metaslabs per group to preload.
208 */
209int metaslab_preload_limit = 10;
210
211/*
212 * Enable/disable preloading of metaslab.
213 */
214boolean_t metaslab_preload_enabled = B_TRUE;
215
216/*
217 * Enable/disable fragmentation weighting on metaslabs.
218 */
219boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
220
221/*
222 * Enable/disable lba weighting (i.e. outer tracks are given preference).
223 */
224boolean_t metaslab_lba_weighting_enabled = B_TRUE;
225
226/*
227 * Enable/disable metaslab group biasing.
228 */
229boolean_t metaslab_bias_enabled = B_TRUE;
230
231/*
232 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
233 */
234boolean_t zfs_remap_blkptr_enable = B_TRUE;
235
236/*
237 * Enable/disable segment-based metaslab selection.
238 */
239boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
240
241/*
242 * When using segment-based metaslab selection, we will continue
243 * allocating from the active metaslab until we have exhausted
244 * zfs_metaslab_switch_threshold of its buckets.
245 */
246int zfs_metaslab_switch_threshold = 2;
247
248/*
249 * Internal switch to enable/disable the metaslab allocation tracing
250 * facility.
251 */
252boolean_t metaslab_trace_enabled = B_TRUE;
253
254/*
255 * Maximum entries that the metaslab allocation tracing facility will keep
256 * in a given list when running in non-debug mode. We limit the number
257 * of entries in non-debug mode to prevent us from using up too much memory.
258 * The limit should be sufficiently large that we don't expect any allocation
259 * to every exceed this value. In debug mode, the system will panic if this
260 * limit is ever reached allowing for further investigation.
261 */
262uint64_t metaslab_trace_max_entries = 5000;
263
264/*
265 * Maximum number of metaslabs per group that can be disabled
266 * simultaneously.
267 */
268int max_disabled_ms = 3;
269
270/*
271 * Maximum percentage of memory to use on storing loaded metaslabs. If loading
272 * a metaslab would take it over this percentage, the oldest selected metaslab
273 * is automatically unloaded.
274 */
275int zfs_metaslab_mem_limit = 25;
276
277/*
278 * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
279 * To avoid 64-bit overflow, don't set above UINT32_MAX.
280 */
281unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
282
283static uint64_t metaslab_weight(metaslab_t *);
284static void metaslab_set_fragmentation(metaslab_t *);
285static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
286static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
287static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
288static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
289static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
290static unsigned int metaslab_idx_func(multilist_t *, void *);
291static void metaslab_evict(metaslab_t *, uint64_t);
292
293kmem_cache_t *metaslab_alloc_trace_cache;
294
295/*
296 * ==========================================================================
297 * Metaslab classes
298 * ==========================================================================
299 */
300metaslab_class_t *
301metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
302{
303	metaslab_class_t *mc;
304
305	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
306
307	mc->mc_spa = spa;
308	mc->mc_rotor = NULL;
309	mc->mc_ops = ops;
310	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
311	mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
312	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
313	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
314	    sizeof (zfs_refcount_t), KM_SLEEP);
315	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
316	    sizeof (uint64_t), KM_SLEEP);
317	for (int i = 0; i < spa->spa_alloc_count; i++)
318		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
319
320	return (mc);
321}
322
323void
324metaslab_class_destroy(metaslab_class_t *mc)
325{
326	ASSERT(mc->mc_rotor == NULL);
327	ASSERT(mc->mc_alloc == 0);
328	ASSERT(mc->mc_deferred == 0);
329	ASSERT(mc->mc_space == 0);
330	ASSERT(mc->mc_dspace == 0);
331
332	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
333		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
334	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
335	    sizeof (zfs_refcount_t));
336	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
337	    sizeof (uint64_t));
338	mutex_destroy(&mc->mc_lock);
339	multilist_destroy(mc->mc_metaslab_txg_list);
340	kmem_free(mc, sizeof (metaslab_class_t));
341}
342
343int
344metaslab_class_validate(metaslab_class_t *mc)
345{
346	metaslab_group_t *mg;
347	vdev_t *vd;
348
349	/*
350	 * Must hold one of the spa_config locks.
351	 */
352	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
353	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
354
355	if ((mg = mc->mc_rotor) == NULL)
356		return (0);
357
358	do {
359		vd = mg->mg_vd;
360		ASSERT(vd->vdev_mg != NULL);
361		ASSERT3P(vd->vdev_top, ==, vd);
362		ASSERT3P(mg->mg_class, ==, mc);
363		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
364	} while ((mg = mg->mg_next) != mc->mc_rotor);
365
366	return (0);
367}
368
369static void
370metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
371    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
372{
373	atomic_add_64(&mc->mc_alloc, alloc_delta);
374	atomic_add_64(&mc->mc_deferred, defer_delta);
375	atomic_add_64(&mc->mc_space, space_delta);
376	atomic_add_64(&mc->mc_dspace, dspace_delta);
377}
378
379uint64_t
380metaslab_class_get_alloc(metaslab_class_t *mc)
381{
382	return (mc->mc_alloc);
383}
384
385uint64_t
386metaslab_class_get_deferred(metaslab_class_t *mc)
387{
388	return (mc->mc_deferred);
389}
390
391uint64_t
392metaslab_class_get_space(metaslab_class_t *mc)
393{
394	return (mc->mc_space);
395}
396
397uint64_t
398metaslab_class_get_dspace(metaslab_class_t *mc)
399{
400	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
401}
402
403void
404metaslab_class_histogram_verify(metaslab_class_t *mc)
405{
406	spa_t *spa = mc->mc_spa;
407	vdev_t *rvd = spa->spa_root_vdev;
408	uint64_t *mc_hist;
409	int i;
410
411	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
412		return;
413
414	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
415	    KM_SLEEP);
416
417	for (int c = 0; c < rvd->vdev_children; c++) {
418		vdev_t *tvd = rvd->vdev_child[c];
419		metaslab_group_t *mg = tvd->vdev_mg;
420
421		/*
422		 * Skip any holes, uninitialized top-levels, or
423		 * vdevs that are not in this metalab class.
424		 */
425		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
426		    mg->mg_class != mc) {
427			continue;
428		}
429
430		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
431			mc_hist[i] += mg->mg_histogram[i];
432	}
433
434	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
435		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
436
437	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
438}
439
440/*
441 * Calculate the metaslab class's fragmentation metric. The metric
442 * is weighted based on the space contribution of each metaslab group.
443 * The return value will be a number between 0 and 100 (inclusive), or
444 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
445 * zfs_frag_table for more information about the metric.
446 */
447uint64_t
448metaslab_class_fragmentation(metaslab_class_t *mc)
449{
450	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
451	uint64_t fragmentation = 0;
452
453	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
454
455	for (int c = 0; c < rvd->vdev_children; c++) {
456		vdev_t *tvd = rvd->vdev_child[c];
457		metaslab_group_t *mg = tvd->vdev_mg;
458
459		/*
460		 * Skip any holes, uninitialized top-levels,
461		 * or vdevs that are not in this metalab class.
462		 */
463		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
464		    mg->mg_class != mc) {
465			continue;
466		}
467
468		/*
469		 * If a metaslab group does not contain a fragmentation
470		 * metric then just bail out.
471		 */
472		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
473			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
474			return (ZFS_FRAG_INVALID);
475		}
476
477		/*
478		 * Determine how much this metaslab_group is contributing
479		 * to the overall pool fragmentation metric.
480		 */
481		fragmentation += mg->mg_fragmentation *
482		    metaslab_group_get_space(mg);
483	}
484	fragmentation /= metaslab_class_get_space(mc);
485
486	ASSERT3U(fragmentation, <=, 100);
487	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
488	return (fragmentation);
489}
490
491/*
492 * Calculate the amount of expandable space that is available in
493 * this metaslab class. If a device is expanded then its expandable
494 * space will be the amount of allocatable space that is currently not
495 * part of this metaslab class.
496 */
497uint64_t
498metaslab_class_expandable_space(metaslab_class_t *mc)
499{
500	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
501	uint64_t space = 0;
502
503	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
504	for (int c = 0; c < rvd->vdev_children; c++) {
505		uint64_t tspace;
506		vdev_t *tvd = rvd->vdev_child[c];
507		metaslab_group_t *mg = tvd->vdev_mg;
508
509		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
510		    mg->mg_class != mc) {
511			continue;
512		}
513
514		/*
515		 * Calculate if we have enough space to add additional
516		 * metaslabs. We report the expandable space in terms
517		 * of the metaslab size since that's the unit of expansion.
518		 * Adjust by efi system partition size.
519		 */
520		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
521		if (tspace > mc->mc_spa->spa_bootsize) {
522			tspace -= mc->mc_spa->spa_bootsize;
523		}
524		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
525	}
526	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
527	return (space);
528}
529
530void
531metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
532{
533	multilist_t *ml = mc->mc_metaslab_txg_list;
534	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
535		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
536		metaslab_t *msp = multilist_sublist_head(mls);
537		multilist_sublist_unlock(mls);
538		while (msp != NULL) {
539			mutex_enter(&msp->ms_lock);
540
541			/*
542			 * If the metaslab has been removed from the list
543			 * (which could happen if we were at the memory limit
544			 * and it was evicted during this loop), then we can't
545			 * proceed and we should restart the sublist.
546			 */
547			if (!multilist_link_active(&msp->ms_class_txg_node)) {
548				mutex_exit(&msp->ms_lock);
549				i--;
550				break;
551			}
552			mls = multilist_sublist_lock(ml, i);
553			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
554			multilist_sublist_unlock(mls);
555			if (txg >
556			    msp->ms_selected_txg + metaslab_unload_delay &&
557			    gethrtime() > msp->ms_selected_time +
558			    (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
559				metaslab_evict(msp, txg);
560			} else {
561				/*
562				 * Once we've hit a metaslab selected too
563				 * recently to evict, we're done evicting for
564				 * now.
565				 */
566				mutex_exit(&msp->ms_lock);
567				break;
568			}
569			mutex_exit(&msp->ms_lock);
570			msp = next_msp;
571		}
572	}
573}
574
575static int
576metaslab_compare(const void *x1, const void *x2)
577{
578	const metaslab_t *m1 = (const metaslab_t *)x1;
579	const metaslab_t *m2 = (const metaslab_t *)x2;
580
581	int sort1 = 0;
582	int sort2 = 0;
583	if (m1->ms_allocator != -1 && m1->ms_primary)
584		sort1 = 1;
585	else if (m1->ms_allocator != -1 && !m1->ms_primary)
586		sort1 = 2;
587	if (m2->ms_allocator != -1 && m2->ms_primary)
588		sort2 = 1;
589	else if (m2->ms_allocator != -1 && !m2->ms_primary)
590		sort2 = 2;
591
592	/*
593	 * Sort inactive metaslabs first, then primaries, then secondaries. When
594	 * selecting a metaslab to allocate from, an allocator first tries its
595	 * primary, then secondary active metaslab. If it doesn't have active
596	 * metaslabs, or can't allocate from them, it searches for an inactive
597	 * metaslab to activate. If it can't find a suitable one, it will steal
598	 * a primary or secondary metaslab from another allocator.
599	 */
600	if (sort1 < sort2)
601		return (-1);
602	if (sort1 > sort2)
603		return (1);
604
605	int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
606	if (likely(cmp))
607		return (cmp);
608
609	IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
610
611	return (AVL_CMP(m1->ms_start, m2->ms_start));
612}
613
614/*
615 * ==========================================================================
616 * Metaslab groups
617 * ==========================================================================
618 */
619/*
620 * Update the allocatable flag and the metaslab group's capacity.
621 * The allocatable flag is set to true if the capacity is below
622 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
623 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
624 * transitions from allocatable to non-allocatable or vice versa then the
625 * metaslab group's class is updated to reflect the transition.
626 */
627static void
628metaslab_group_alloc_update(metaslab_group_t *mg)
629{
630	vdev_t *vd = mg->mg_vd;
631	metaslab_class_t *mc = mg->mg_class;
632	vdev_stat_t *vs = &vd->vdev_stat;
633	boolean_t was_allocatable;
634	boolean_t was_initialized;
635
636	ASSERT(vd == vd->vdev_top);
637	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
638	    SCL_ALLOC);
639
640	mutex_enter(&mg->mg_lock);
641	was_allocatable = mg->mg_allocatable;
642	was_initialized = mg->mg_initialized;
643
644	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
645	    (vs->vs_space + 1);
646
647	mutex_enter(&mc->mc_lock);
648
649	/*
650	 * If the metaslab group was just added then it won't
651	 * have any space until we finish syncing out this txg.
652	 * At that point we will consider it initialized and available
653	 * for allocations.  We also don't consider non-activated
654	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
655	 * to be initialized, because they can't be used for allocation.
656	 */
657	mg->mg_initialized = metaslab_group_initialized(mg);
658	if (!was_initialized && mg->mg_initialized) {
659		mc->mc_groups++;
660	} else if (was_initialized && !mg->mg_initialized) {
661		ASSERT3U(mc->mc_groups, >, 0);
662		mc->mc_groups--;
663	}
664	if (mg->mg_initialized)
665		mg->mg_no_free_space = B_FALSE;
666
667	/*
668	 * A metaslab group is considered allocatable if it has plenty
669	 * of free space or is not heavily fragmented. We only take
670	 * fragmentation into account if the metaslab group has a valid
671	 * fragmentation metric (i.e. a value between 0 and 100).
672	 */
673	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
674	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
675	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
676	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
677
678	/*
679	 * The mc_alloc_groups maintains a count of the number of
680	 * groups in this metaslab class that are still above the
681	 * zfs_mg_noalloc_threshold. This is used by the allocating
682	 * threads to determine if they should avoid allocations to
683	 * a given group. The allocator will avoid allocations to a group
684	 * if that group has reached or is below the zfs_mg_noalloc_threshold
685	 * and there are still other groups that are above the threshold.
686	 * When a group transitions from allocatable to non-allocatable or
687	 * vice versa we update the metaslab class to reflect that change.
688	 * When the mc_alloc_groups value drops to 0 that means that all
689	 * groups have reached the zfs_mg_noalloc_threshold making all groups
690	 * eligible for allocations. This effectively means that all devices
691	 * are balanced again.
692	 */
693	if (was_allocatable && !mg->mg_allocatable)
694		mc->mc_alloc_groups--;
695	else if (!was_allocatable && mg->mg_allocatable)
696		mc->mc_alloc_groups++;
697	mutex_exit(&mc->mc_lock);
698
699	mutex_exit(&mg->mg_lock);
700}
701
702int
703metaslab_sort_by_flushed(const void *va, const void *vb)
704{
705	const metaslab_t *a = va;
706	const metaslab_t *b = vb;
707
708	int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
709	if (likely(cmp))
710		return (cmp);
711
712	uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
713	uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
714	cmp = AVL_CMP(a_vdev_id, b_vdev_id);
715	if (cmp)
716		return (cmp);
717
718	return (AVL_CMP(a->ms_id, b->ms_id));
719}
720
721metaslab_group_t *
722metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
723{
724	metaslab_group_t *mg;
725
726	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
727	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
728	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
729	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
730	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
731	    KM_SLEEP);
732	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
733	    KM_SLEEP);
734	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
735	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
736	mg->mg_vd = vd;
737	mg->mg_class = mc;
738	mg->mg_activation_count = 0;
739	mg->mg_initialized = B_FALSE;
740	mg->mg_no_free_space = B_TRUE;
741	mg->mg_allocators = allocators;
742
743	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
744	    sizeof (zfs_refcount_t), KM_SLEEP);
745	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
746	    sizeof (uint64_t), KM_SLEEP);
747	for (int i = 0; i < allocators; i++) {
748		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
749		mg->mg_cur_max_alloc_queue_depth[i] = 0;
750	}
751
752	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
753	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
754
755	return (mg);
756}
757
758void
759metaslab_group_destroy(metaslab_group_t *mg)
760{
761	ASSERT(mg->mg_prev == NULL);
762	ASSERT(mg->mg_next == NULL);
763	/*
764	 * We may have gone below zero with the activation count
765	 * either because we never activated in the first place or
766	 * because we're done, and possibly removing the vdev.
767	 */
768	ASSERT(mg->mg_activation_count <= 0);
769
770	taskq_destroy(mg->mg_taskq);
771	avl_destroy(&mg->mg_metaslab_tree);
772	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
773	kmem_free(mg->mg_secondaries, mg->mg_allocators *
774	    sizeof (metaslab_t *));
775	mutex_destroy(&mg->mg_lock);
776	mutex_destroy(&mg->mg_ms_disabled_lock);
777	cv_destroy(&mg->mg_ms_disabled_cv);
778
779	for (int i = 0; i < mg->mg_allocators; i++) {
780		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
781		mg->mg_cur_max_alloc_queue_depth[i] = 0;
782	}
783	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
784	    sizeof (zfs_refcount_t));
785	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
786	    sizeof (uint64_t));
787
788	kmem_free(mg, sizeof (metaslab_group_t));
789}
790
791void
792metaslab_group_activate(metaslab_group_t *mg)
793{
794	metaslab_class_t *mc = mg->mg_class;
795	metaslab_group_t *mgprev, *mgnext;
796
797	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
798
799	ASSERT(mc->mc_rotor != mg);
800	ASSERT(mg->mg_prev == NULL);
801	ASSERT(mg->mg_next == NULL);
802	ASSERT(mg->mg_activation_count <= 0);
803
804	if (++mg->mg_activation_count <= 0)
805		return;
806
807	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
808	metaslab_group_alloc_update(mg);
809
810	if ((mgprev = mc->mc_rotor) == NULL) {
811		mg->mg_prev = mg;
812		mg->mg_next = mg;
813	} else {
814		mgnext = mgprev->mg_next;
815		mg->mg_prev = mgprev;
816		mg->mg_next = mgnext;
817		mgprev->mg_next = mg;
818		mgnext->mg_prev = mg;
819	}
820	mc->mc_rotor = mg;
821}
822
823/*
824 * Passivate a metaslab group and remove it from the allocation rotor.
825 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
826 * a metaslab group. This function will momentarily drop spa_config_locks
827 * that are lower than the SCL_ALLOC lock (see comment below).
828 */
829void
830metaslab_group_passivate(metaslab_group_t *mg)
831{
832	metaslab_class_t *mc = mg->mg_class;
833	spa_t *spa = mc->mc_spa;
834	metaslab_group_t *mgprev, *mgnext;
835	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
836
837	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
838	    (SCL_ALLOC | SCL_ZIO));
839
840	if (--mg->mg_activation_count != 0) {
841		ASSERT(mc->mc_rotor != mg);
842		ASSERT(mg->mg_prev == NULL);
843		ASSERT(mg->mg_next == NULL);
844		ASSERT(mg->mg_activation_count < 0);
845		return;
846	}
847
848	/*
849	 * The spa_config_lock is an array of rwlocks, ordered as
850	 * follows (from highest to lowest):
851	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
852	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
853	 * (For more information about the spa_config_lock see spa_misc.c)
854	 * The higher the lock, the broader its coverage. When we passivate
855	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
856	 * config locks. However, the metaslab group's taskq might be trying
857	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
858	 * lower locks to allow the I/O to complete. At a minimum,
859	 * we continue to hold the SCL_ALLOC lock, which prevents any future
860	 * allocations from taking place and any changes to the vdev tree.
861	 */
862	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
863	taskq_wait(mg->mg_taskq);
864	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
865	metaslab_group_alloc_update(mg);
866	for (int i = 0; i < mg->mg_allocators; i++) {
867		metaslab_t *msp = mg->mg_primaries[i];
868		if (msp != NULL) {
869			mutex_enter(&msp->ms_lock);
870			metaslab_passivate(msp,
871			    metaslab_weight_from_range_tree(msp));
872			mutex_exit(&msp->ms_lock);
873		}
874		msp = mg->mg_secondaries[i];
875		if (msp != NULL) {
876			mutex_enter(&msp->ms_lock);
877			metaslab_passivate(msp,
878			    metaslab_weight_from_range_tree(msp));
879			mutex_exit(&msp->ms_lock);
880		}
881	}
882
883	mgprev = mg->mg_prev;
884	mgnext = mg->mg_next;
885
886	if (mg == mgnext) {
887		mc->mc_rotor = NULL;
888	} else {
889		mc->mc_rotor = mgnext;
890		mgprev->mg_next = mgnext;
891		mgnext->mg_prev = mgprev;
892	}
893
894	mg->mg_prev = NULL;
895	mg->mg_next = NULL;
896}
897
898boolean_t
899metaslab_group_initialized(metaslab_group_t *mg)
900{
901	vdev_t *vd = mg->mg_vd;
902	vdev_stat_t *vs = &vd->vdev_stat;
903
904	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
905}
906
907uint64_t
908metaslab_group_get_space(metaslab_group_t *mg)
909{
910	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
911}
912
913void
914metaslab_group_histogram_verify(metaslab_group_t *mg)
915{
916	uint64_t *mg_hist;
917	vdev_t *vd = mg->mg_vd;
918	uint64_t ashift = vd->vdev_ashift;
919	int i;
920
921	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
922		return;
923
924	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
925	    KM_SLEEP);
926
927	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
928	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
929
930	for (int m = 0; m < vd->vdev_ms_count; m++) {
931		metaslab_t *msp = vd->vdev_ms[m];
932
933		/* skip if not active or not a member */
934		if (msp->ms_sm == NULL || msp->ms_group != mg)
935			continue;
936
937		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
938			mg_hist[i + ashift] +=
939			    msp->ms_sm->sm_phys->smp_histogram[i];
940	}
941
942	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
943		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
944
945	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
946}
947
948static void
949metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
950{
951	metaslab_class_t *mc = mg->mg_class;
952	uint64_t ashift = mg->mg_vd->vdev_ashift;
953
954	ASSERT(MUTEX_HELD(&msp->ms_lock));
955	if (msp->ms_sm == NULL)
956		return;
957
958	mutex_enter(&mg->mg_lock);
959	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
960		mg->mg_histogram[i + ashift] +=
961		    msp->ms_sm->sm_phys->smp_histogram[i];
962		mc->mc_histogram[i + ashift] +=
963		    msp->ms_sm->sm_phys->smp_histogram[i];
964	}
965	mutex_exit(&mg->mg_lock);
966}
967
968void
969metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
970{
971	metaslab_class_t *mc = mg->mg_class;
972	uint64_t ashift = mg->mg_vd->vdev_ashift;
973
974	ASSERT(MUTEX_HELD(&msp->ms_lock));
975	if (msp->ms_sm == NULL)
976		return;
977
978	mutex_enter(&mg->mg_lock);
979	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
980		ASSERT3U(mg->mg_histogram[i + ashift], >=,
981		    msp->ms_sm->sm_phys->smp_histogram[i]);
982		ASSERT3U(mc->mc_histogram[i + ashift], >=,
983		    msp->ms_sm->sm_phys->smp_histogram[i]);
984
985		mg->mg_histogram[i + ashift] -=
986		    msp->ms_sm->sm_phys->smp_histogram[i];
987		mc->mc_histogram[i + ashift] -=
988		    msp->ms_sm->sm_phys->smp_histogram[i];
989	}
990	mutex_exit(&mg->mg_lock);
991}
992
993static void
994metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
995{
996	ASSERT(msp->ms_group == NULL);
997	mutex_enter(&mg->mg_lock);
998	msp->ms_group = mg;
999	msp->ms_weight = 0;
1000	avl_add(&mg->mg_metaslab_tree, msp);
1001	mutex_exit(&mg->mg_lock);
1002
1003	mutex_enter(&msp->ms_lock);
1004	metaslab_group_histogram_add(mg, msp);
1005	mutex_exit(&msp->ms_lock);
1006}
1007
1008static void
1009metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
1010{
1011	mutex_enter(&msp->ms_lock);
1012	metaslab_group_histogram_remove(mg, msp);
1013	mutex_exit(&msp->ms_lock);
1014
1015	mutex_enter(&mg->mg_lock);
1016	ASSERT(msp->ms_group == mg);
1017	avl_remove(&mg->mg_metaslab_tree, msp);
1018
1019	metaslab_class_t *mc = msp->ms_group->mg_class;
1020	multilist_sublist_t *mls =
1021	    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
1022	if (multilist_link_active(&msp->ms_class_txg_node))
1023		multilist_sublist_remove(mls, msp);
1024	multilist_sublist_unlock(mls);
1025
1026	msp->ms_group = NULL;
1027	mutex_exit(&mg->mg_lock);
1028}
1029
1030static void
1031metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1032{
1033	ASSERT(MUTEX_HELD(&msp->ms_lock));
1034	ASSERT(MUTEX_HELD(&mg->mg_lock));
1035	ASSERT(msp->ms_group == mg);
1036
1037	avl_remove(&mg->mg_metaslab_tree, msp);
1038	msp->ms_weight = weight;
1039	avl_add(&mg->mg_metaslab_tree, msp);
1040
1041}
1042
1043static void
1044metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
1045{
1046	/*
1047	 * Although in principle the weight can be any value, in
1048	 * practice we do not use values in the range [1, 511].
1049	 */
1050	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
1051	ASSERT(MUTEX_HELD(&msp->ms_lock));
1052
1053	mutex_enter(&mg->mg_lock);
1054	metaslab_group_sort_impl(mg, msp, weight);
1055	mutex_exit(&mg->mg_lock);
1056}
1057
1058/*
1059 * Calculate the fragmentation for a given metaslab group. We can use
1060 * a simple average here since all metaslabs within the group must have
1061 * the same size. The return value will be a value between 0 and 100
1062 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
1063 * group have a fragmentation metric.
1064 */
1065uint64_t
1066metaslab_group_fragmentation(metaslab_group_t *mg)
1067{
1068	vdev_t *vd = mg->mg_vd;
1069	uint64_t fragmentation = 0;
1070	uint64_t valid_ms = 0;
1071
1072	for (int m = 0; m < vd->vdev_ms_count; m++) {
1073		metaslab_t *msp = vd->vdev_ms[m];
1074
1075		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
1076			continue;
1077		if (msp->ms_group != mg)
1078			continue;
1079
1080		valid_ms++;
1081		fragmentation += msp->ms_fragmentation;
1082	}
1083
1084	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
1085		return (ZFS_FRAG_INVALID);
1086
1087	fragmentation /= valid_ms;
1088	ASSERT3U(fragmentation, <=, 100);
1089	return (fragmentation);
1090}
1091
1092/*
1093 * Determine if a given metaslab group should skip allocations. A metaslab
1094 * group should avoid allocations if its free capacity is less than the
1095 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
1096 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
1097 * that can still handle allocations. If the allocation throttle is enabled
1098 * then we skip allocations to devices that have reached their maximum
1099 * allocation queue depth unless the selected metaslab group is the only
1100 * eligible group remaining.
1101 */
1102static boolean_t
1103metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
1104    uint64_t psize, int allocator, int d)
1105{
1106	spa_t *spa = mg->mg_vd->vdev_spa;
1107	metaslab_class_t *mc = mg->mg_class;
1108
1109	/*
1110	 * We can only consider skipping this metaslab group if it's
1111	 * in the normal metaslab class and there are other metaslab
1112	 * groups to select from. Otherwise, we always consider it eligible
1113	 * for allocations.
1114	 */
1115	if ((mc != spa_normal_class(spa) &&
1116	    mc != spa_special_class(spa) &&
1117	    mc != spa_dedup_class(spa)) ||
1118	    mc->mc_groups <= 1)
1119		return (B_TRUE);
1120
1121	/*
1122	 * If the metaslab group's mg_allocatable flag is set (see comments
1123	 * in metaslab_group_alloc_update() for more information) and
1124	 * the allocation throttle is disabled then allow allocations to this
1125	 * device. However, if the allocation throttle is enabled then
1126	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
1127	 * to determine if we should allow allocations to this metaslab group.
1128	 * If all metaslab groups are no longer considered allocatable
1129	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
1130	 * gang block size then we allow allocations on this metaslab group
1131	 * regardless of the mg_allocatable or throttle settings.
1132	 */
1133	if (mg->mg_allocatable) {
1134		metaslab_group_t *mgp;
1135		int64_t qdepth;
1136		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
1137
1138		if (!mc->mc_alloc_throttle_enabled)
1139			return (B_TRUE);
1140
1141		/*
1142		 * If this metaslab group does not have any free space, then
1143		 * there is no point in looking further.
1144		 */
1145		if (mg->mg_no_free_space)
1146			return (B_FALSE);
1147
1148		/*
1149		 * Relax allocation throttling for ditto blocks.  Due to
1150		 * random imbalances in allocation it tends to push copies
1151		 * to one vdev, that looks a bit better at the moment.
1152		 */
1153		qmax = qmax * (4 + d) / 4;
1154
1155		qdepth = zfs_refcount_count(
1156		    &mg->mg_alloc_queue_depth[allocator]);
1157
1158		/*
1159		 * If this metaslab group is below its qmax or it's
1160		 * the only allocatable metasable group, then attempt
1161		 * to allocate from it.
1162		 */
1163		if (qdepth < qmax || mc->mc_alloc_groups == 1)
1164			return (B_TRUE);
1165		ASSERT3U(mc->mc_alloc_groups, >, 1);
1166
1167		/*
1168		 * Since this metaslab group is at or over its qmax, we
1169		 * need to determine if there are metaslab groups after this
1170		 * one that might be able to handle this allocation. This is
1171		 * racy since we can't hold the locks for all metaslab
1172		 * groups at the same time when we make this check.
1173		 */
1174		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
1175			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
1176			qmax = qmax * (4 + d) / 4;
1177			qdepth = zfs_refcount_count(
1178			    &mgp->mg_alloc_queue_depth[allocator]);
1179
1180			/*
1181			 * If there is another metaslab group that
1182			 * might be able to handle the allocation, then
1183			 * we return false so that we skip this group.
1184			 */
1185			if (qdepth < qmax && !mgp->mg_no_free_space)
1186				return (B_FALSE);
1187		}
1188
1189		/*
1190		 * We didn't find another group to handle the allocation
1191		 * so we can't skip this metaslab group even though
1192		 * we are at or over our qmax.
1193		 */
1194		return (B_TRUE);
1195
1196	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
1197		return (B_TRUE);
1198	}
1199	return (B_FALSE);
1200}
1201
1202/*
1203 * ==========================================================================
1204 * Range tree callbacks
1205 * ==========================================================================
1206 */
1207
1208/*
1209 * Comparison function for the private size-ordered tree. Tree is sorted
1210 * by size, larger sizes at the end of the tree.
1211 */
1212static int
1213metaslab_rangesize_compare(const void *x1, const void *x2)
1214{
1215	const range_seg_t *r1 = x1;
1216	const range_seg_t *r2 = x2;
1217	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1218	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1219
1220	int cmp = AVL_CMP(rs_size1, rs_size2);
1221	if (likely(cmp))
1222		return (cmp);
1223
1224	return (AVL_CMP(r1->rs_start, r2->rs_start));
1225}
1226
1227/*
1228 * ==========================================================================
1229 * Common allocator routines
1230 * ==========================================================================
1231 */
1232
1233/*
1234 * Return the maximum contiguous segment within the metaslab.
1235 */
1236uint64_t
1237metaslab_largest_allocatable(metaslab_t *msp)
1238{
1239	avl_tree_t *t = &msp->ms_allocatable_by_size;
1240	range_seg_t *rs;
1241
1242	if (t == NULL)
1243		return (0);
1244	rs = avl_last(t);
1245	if (rs == NULL)
1246		return (0);
1247
1248	return (rs->rs_end - rs->rs_start);
1249}
1250
1251/*
1252 * Return the maximum contiguous segment within the unflushed frees of this
1253 * metaslab.
1254 */
1255uint64_t
1256metaslab_largest_unflushed_free(metaslab_t *msp)
1257{
1258	ASSERT(MUTEX_HELD(&msp->ms_lock));
1259
1260	if (msp->ms_unflushed_frees == NULL)
1261		return (0);
1262
1263	range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size);
1264	if (rs == NULL)
1265		return (0);
1266
1267	/*
1268	 * When a range is freed from the metaslab, that range is added to
1269	 * both the unflushed frees and the deferred frees. While the block
1270	 * will eventually be usable, if the metaslab were loaded the range
1271	 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
1272	 * txgs had passed.  As a result, when attempting to estimate an upper
1273	 * bound for the largest currently-usable free segment in the
1274	 * metaslab, we need to not consider any ranges currently in the defer
1275	 * trees. This algorithm approximates the largest available chunk in
1276	 * the largest range in the unflushed_frees tree by taking the first
1277	 * chunk.  While this may be a poor estimate, it should only remain so
1278	 * briefly and should eventually self-correct as frees are no longer
1279	 * deferred. Similar logic applies to the ms_freed tree. See
1280	 * metaslab_load() for more details.
1281	 *
1282	 * There are two primary sources of innacuracy in this estimate. Both
1283	 * are tolerated for performance reasons. The first source is that we
1284	 * only check the largest segment for overlaps. Smaller segments may
1285	 * have more favorable overlaps with the other trees, resulting in
1286	 * larger usable chunks.  Second, we only look at the first chunk in
1287	 * the largest segment; there may be other usable chunks in the
1288	 * largest segment, but we ignore them.
1289	 */
1290	uint64_t rstart = rs->rs_start;
1291	uint64_t rsize = rs->rs_end - rstart;
1292	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1293		uint64_t start = 0;
1294		uint64_t size = 0;
1295		boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
1296		    rsize, &start, &size);
1297		if (found) {
1298			if (rstart == start)
1299				return (0);
1300			rsize = start - rstart;
1301		}
1302	}
1303
1304	uint64_t start = 0;
1305	uint64_t size = 0;
1306	boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
1307	    rsize, &start, &size);
1308	if (found)
1309		rsize = start - rstart;
1310
1311	return (rsize);
1312}
1313
1314static range_seg_t *
1315metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1316{
1317	range_seg_t *rs, rsearch;
1318	avl_index_t where;
1319
1320	rsearch.rs_start = start;
1321	rsearch.rs_end = start + size;
1322
1323	rs = avl_find(t, &rsearch, &where);
1324	if (rs == NULL) {
1325		rs = avl_nearest(t, where, AVL_AFTER);
1326	}
1327
1328	return (rs);
1329}
1330
1331/*
1332 * This is a helper function that can be used by the allocator to find
1333 * a suitable block to allocate. This will search the specified AVL
1334 * tree looking for a block that matches the specified criteria.
1335 */
1336static uint64_t
1337metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1338    uint64_t max_search)
1339{
1340	range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1341	uint64_t first_found;
1342
1343	if (rs != NULL)
1344		first_found = rs->rs_start;
1345
1346	while (rs != NULL && rs->rs_start - first_found <= max_search) {
1347		uint64_t offset = rs->rs_start;
1348		if (offset + size <= rs->rs_end) {
1349			*cursor = offset + size;
1350			return (offset);
1351		}
1352		rs = AVL_NEXT(t, rs);
1353	}
1354
1355	*cursor = 0;
1356	return (-1ULL);
1357}
1358
1359/*
1360 * ==========================================================================
1361 * Dynamic Fit (df) block allocator
1362 *
1363 * Search for a free chunk of at least this size, starting from the last
1364 * offset (for this alignment of block) looking for up to
1365 * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
1366 * found within 16MB, then return a free chunk of exactly the requested size (or
1367 * larger).
1368 *
1369 * If it seems like searching from the last offset will be unproductive, skip
1370 * that and just return a free chunk of exactly the requested size (or larger).
1371 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
1372 * mechanism is probably not very useful and may be removed in the future.
1373 *
1374 * The behavior when not searching can be changed to return the largest free
1375 * chunk, instead of a free chunk of exactly the requested size, by setting
1376 * metaslab_df_use_largest_segment.
1377 * ==========================================================================
1378 */
1379static uint64_t
1380metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1381{
1382	/*
1383	 * Find the largest power of 2 block size that evenly divides the
1384	 * requested size. This is used to try to allocate blocks with similar
1385	 * alignment from the same area of the metaslab (i.e. same cursor
1386	 * bucket) but it does not guarantee that other allocations sizes
1387	 * may exist in the same region.
1388	 */
1389	uint64_t align = size & -size;
1390	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1391	range_tree_t *rt = msp->ms_allocatable;
1392	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1393	uint64_t offset;
1394
1395	ASSERT(MUTEX_HELD(&msp->ms_lock));
1396	ASSERT3U(avl_numnodes(&rt->rt_root), ==,
1397	    avl_numnodes(&msp->ms_allocatable_by_size));
1398
1399	/*
1400	 * If we're running low on space, find a segment based on size,
1401	 * rather than iterating based on offset.
1402	 */
1403	if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
1404	    free_pct < metaslab_df_free_pct) {
1405		offset = -1;
1406	} else {
1407		offset = metaslab_block_picker(&rt->rt_root,
1408		    cursor, size, metaslab_df_max_search);
1409	}
1410
1411	if (offset == -1) {
1412		range_seg_t *rs;
1413		if (metaslab_df_use_largest_segment) {
1414			/* use largest free segment */
1415			rs = avl_last(&msp->ms_allocatable_by_size);
1416		} else {
1417			/* use segment of this size, or next largest */
1418			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
1419			    0, size);
1420		}
1421		if (rs != NULL && rs->rs_start + size <= rs->rs_end) {
1422			offset = rs->rs_start;
1423			*cursor = offset + size;
1424		}
1425	}
1426
1427	return (offset);
1428}
1429
1430static metaslab_ops_t metaslab_df_ops = {
1431	metaslab_df_alloc
1432};
1433
1434/*
1435 * ==========================================================================
1436 * Cursor fit block allocator -
1437 * Select the largest region in the metaslab, set the cursor to the beginning
1438 * of the range and the cursor_end to the end of the range. As allocations
1439 * are made advance the cursor. Continue allocating from the cursor until
1440 * the range is exhausted and then find a new range.
1441 * ==========================================================================
1442 */
1443static uint64_t
1444metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1445{
1446	range_tree_t *rt = msp->ms_allocatable;
1447	avl_tree_t *t = &msp->ms_allocatable_by_size;
1448	uint64_t *cursor = &msp->ms_lbas[0];
1449	uint64_t *cursor_end = &msp->ms_lbas[1];
1450	uint64_t offset = 0;
1451
1452	ASSERT(MUTEX_HELD(&msp->ms_lock));
1453	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1454
1455	ASSERT3U(*cursor_end, >=, *cursor);
1456
1457	if ((*cursor + size) > *cursor_end) {
1458		range_seg_t *rs;
1459
1460		rs = avl_last(&msp->ms_allocatable_by_size);
1461		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1462			return (-1ULL);
1463
1464		*cursor = rs->rs_start;
1465		*cursor_end = rs->rs_end;
1466	}
1467
1468	offset = *cursor;
1469	*cursor += size;
1470
1471	return (offset);
1472}
1473
1474static metaslab_ops_t metaslab_cf_ops = {
1475	metaslab_cf_alloc
1476};
1477
1478/*
1479 * ==========================================================================
1480 * New dynamic fit allocator -
1481 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1482 * contiguous blocks. If no region is found then just use the largest segment
1483 * that remains.
1484 * ==========================================================================
1485 */
1486
1487/*
1488 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1489 * to request from the allocator.
1490 */
1491uint64_t metaslab_ndf_clump_shift = 4;
1492
1493static uint64_t
1494metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1495{
1496	avl_tree_t *t = &msp->ms_allocatable->rt_root;
1497	avl_index_t where;
1498	range_seg_t *rs, rsearch;
1499	uint64_t hbit = highbit64(size);
1500	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1501	uint64_t max_size = metaslab_largest_allocatable(msp);
1502
1503	ASSERT(MUTEX_HELD(&msp->ms_lock));
1504	ASSERT3U(avl_numnodes(t), ==,
1505	    avl_numnodes(&msp->ms_allocatable_by_size));
1506
1507	if (max_size < size)
1508		return (-1ULL);
1509
1510	rsearch.rs_start = *cursor;
1511	rsearch.rs_end = *cursor + size;
1512
1513	rs = avl_find(t, &rsearch, &where);
1514	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1515		t = &msp->ms_allocatable_by_size;
1516
1517		rsearch.rs_start = 0;
1518		rsearch.rs_end = MIN(max_size,
1519		    1ULL << (hbit + metaslab_ndf_clump_shift));
1520		rs = avl_find(t, &rsearch, &where);
1521		if (rs == NULL)
1522			rs = avl_nearest(t, where, AVL_AFTER);
1523		ASSERT(rs != NULL);
1524	}
1525
1526	if ((rs->rs_end - rs->rs_start) >= size) {
1527		*cursor = rs->rs_start + size;
1528		return (rs->rs_start);
1529	}
1530	return (-1ULL);
1531}
1532
1533static metaslab_ops_t metaslab_ndf_ops = {
1534	metaslab_ndf_alloc
1535};
1536
1537metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1538
1539/*
1540 * ==========================================================================
1541 * Metaslabs
1542 * ==========================================================================
1543 */
1544
1545/*
1546 * Wait for any in-progress metaslab loads to complete.
1547 */
1548void
1549metaslab_load_wait(metaslab_t *msp)
1550{
1551	ASSERT(MUTEX_HELD(&msp->ms_lock));
1552
1553	while (msp->ms_loading) {
1554		ASSERT(!msp->ms_loaded);
1555		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1556	}
1557}
1558
1559/*
1560 * Wait for any in-progress flushing to complete.
1561 */
1562void
1563metaslab_flush_wait(metaslab_t *msp)
1564{
1565	ASSERT(MUTEX_HELD(&msp->ms_lock));
1566
1567	while (msp->ms_flushing)
1568		cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1569}
1570
1571static unsigned int
1572metaslab_idx_func(multilist_t *ml, void *arg)
1573{
1574	metaslab_t *msp = arg;
1575	return (msp->ms_id % multilist_get_num_sublists(ml));
1576}
1577
1578uint64_t
1579metaslab_allocated_space(metaslab_t *msp)
1580{
1581	return (msp->ms_allocated_space);
1582}
1583
1584/*
1585 * Verify that the space accounting on disk matches the in-core range_trees.
1586 */
1587static void
1588metaslab_verify_space(metaslab_t *msp, uint64_t txg)
1589{
1590	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1591	uint64_t allocating = 0;
1592	uint64_t sm_free_space, msp_free_space;
1593
1594	ASSERT(MUTEX_HELD(&msp->ms_lock));
1595	ASSERT(!msp->ms_condensing);
1596
1597	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1598		return;
1599
1600	/*
1601	 * We can only verify the metaslab space when we're called
1602	 * from syncing context with a loaded metaslab that has an
1603	 * allocated space map. Calling this in non-syncing context
1604	 * does not provide a consistent view of the metaslab since
1605	 * we're performing allocations in the future.
1606	 */
1607	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1608	    !msp->ms_loaded)
1609		return;
1610
1611	/*
1612	 * Even though the smp_alloc field can get negative,
1613	 * when it comes to a metaslab's space map, that should
1614	 * never be the case.
1615	 */
1616	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1617
1618	ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1619	    range_tree_space(msp->ms_unflushed_frees));
1620
1621	ASSERT3U(metaslab_allocated_space(msp), ==,
1622	    space_map_allocated(msp->ms_sm) +
1623	    range_tree_space(msp->ms_unflushed_allocs) -
1624	    range_tree_space(msp->ms_unflushed_frees));
1625
1626	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1627
1628	/*
1629	 * Account for future allocations since we would have
1630	 * already deducted that space from the ms_allocatable.
1631	 */
1632	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
1633		allocating +=
1634		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1635	}
1636	ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1637	    msp->ms_allocating_total);
1638
1639	ASSERT3U(msp->ms_deferspace, ==,
1640	    range_tree_space(msp->ms_defer[0]) +
1641	    range_tree_space(msp->ms_defer[1]));
1642
1643	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
1644	    msp->ms_deferspace + range_tree_space(msp->ms_freed);
1645
1646	VERIFY3U(sm_free_space, ==, msp_free_space);
1647}
1648
1649static void
1650metaslab_aux_histograms_clear(metaslab_t *msp)
1651{
1652	/*
1653	 * Auxiliary histograms are only cleared when resetting them,
1654	 * which can only happen while the metaslab is loaded.
1655	 */
1656	ASSERT(msp->ms_loaded);
1657
1658	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1659	for (int t = 0; t < TXG_DEFER_SIZE; t++)
1660		bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
1661}
1662
1663static void
1664metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1665    range_tree_t *rt)
1666{
1667	/*
1668	 * This is modeled after space_map_histogram_add(), so refer to that
1669	 * function for implementation details. We want this to work like
1670	 * the space map histogram, and not the range tree histogram, as we
1671	 * are essentially constructing a delta that will be later subtracted
1672	 * from the space map histogram.
1673	 */
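	/*
	 * Illustrative example (values are hypothetical): with a shift
	 * of 9, range tree bucket i = 9 feeds histogram[0], i = 10
	 * feeds histogram[1], and so on. Once idx saturates at
	 * SPACE_MAP_HISTOGRAM_SIZE - 1, every larger range tree bucket
	 * is folded into that last slot, scaled up by
	 * 1 << (i - idx - shift) so the amount of space represented
	 * stays the same.
	 */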
1674	int idx = 0;
1675	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1676		ASSERT3U(i, >=, idx + shift);
1677		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1678
1679		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1680			ASSERT3U(idx + shift, ==, i);
1681			idx++;
1682			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1683		}
1684	}
1685}
1686
1687/*
1688 * Called at every sync pass that the metaslab gets synced.
1689 *
1690 * The reason is that we want our auxiliary histograms to be updated
1691 * whenever the metaslab's space map histogram is updated. This way
1692 * we stay consistent on which parts of the metaslab space map's
1693 * histogram are currently not available for allocations (e.g. because
1694 * they are in the defer, freed, and freeing trees).
1695 */
1696static void
1697metaslab_aux_histograms_update(metaslab_t *msp)
1698{
1699	space_map_t *sm = msp->ms_sm;
1700	ASSERT(sm != NULL);
1701
1702	/*
1703	 * This is similar to the metaslab's space map histogram updates
1704	 * that take place in metaslab_sync(). The only difference is that
1705	 * we only care about segments that haven't made it into the
1706	 * ms_allocatable tree yet.
1707	 */
1708	if (msp->ms_loaded) {
1709		metaslab_aux_histograms_clear(msp);
1710
1711		metaslab_aux_histogram_add(msp->ms_synchist,
1712		    sm->sm_shift, msp->ms_freed);
1713
1714		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1715			metaslab_aux_histogram_add(msp->ms_deferhist[t],
1716			    sm->sm_shift, msp->ms_defer[t]);
1717		}
1718	}
1719
1720	metaslab_aux_histogram_add(msp->ms_synchist,
1721	    sm->sm_shift, msp->ms_freeing);
1722}
1723
1724/*
1725 * Called every time we are done syncing (writing to) the metaslab,
1726 * i.e. at the end of each sync pass.
1727 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
1728 */
1729static void
1730metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
1731{
1732	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1733	space_map_t *sm = msp->ms_sm;
1734
1735	if (sm == NULL) {
1736		/*
1737		 * We came here from metaslab_init() when creating/opening a
1738		 * pool, looking at a metaslab that hasn't had any allocations
1739		 * yet.
1740		 */
1741		return;
1742	}
1743
1744	/*
1745	 * This is similar to the actions that we take for the ms_freed
1746	 * and ms_defer trees in metaslab_sync_done().
1747	 */
1748	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
1749	if (defer_allowed) {
1750		bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
1751		    sizeof (msp->ms_synchist));
1752	} else {
1753		bzero(msp->ms_deferhist[hist_index],
1754		    sizeof (msp->ms_deferhist[hist_index]));
1755	}
1756	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1757}
1758
1759/*
1760 * Ensure that the metaslab's weight and fragmentation are consistent
1761 * with the contents of the histogram (either the range tree's histogram
1762 * or the space map's, depending on whether the metaslab is loaded).
1763 */
1764static void
1765metaslab_verify_weight_and_frag(metaslab_t *msp)
1766{
1767	ASSERT(MUTEX_HELD(&msp->ms_lock));
1768
1769	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1770		return;
1771
1772	/*
1773	 * We can end up here from vdev_remove_complete(), in which case we
1774	 * cannot do these assertions because we hold spa config locks and
1775	 * thus we are not allowed to read from the DMU.
1776	 *
1777	 * We check if the metaslab group has been removed and if that's
1778	 * the case we return immediately as that would mean that we are
1779	 * here from the aforementioned code path.
1780	 */
1781	if (msp->ms_group == NULL)
1782		return;
1783
1784	/*
1785	 * Devices being removed always return a weight of 0 and leave
1786	 * fragmentation and ms_max_size as is - there is nothing for
1787	 * us to verify here.
1788	 */
1789	vdev_t *vd = msp->ms_group->mg_vd;
1790	if (vd->vdev_removing)
1791		return;
1792
1793	/*
1794	 * If the metaslab is dirty it probably means that we've done
1795	 * some allocations or frees that have changed our histograms
1796	 * and thus the weight.
1797	 */
1798	for (int t = 0; t < TXG_SIZE; t++) {
1799		if (txg_list_member(&vd->vdev_ms_list, msp, t))
1800			return;
1801	}
1802
1803	/*
1804	 * This verification checks that our in-memory state is consistent
1805	 * with what's on disk. If the pool is read-only then there aren't
1806	 * any changes and we just have the initially-loaded state.
1807	 */
1808	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
1809		return;
1810
1811	/* Run some extra verification on the in-core tree while we can. */
1812	if (msp->ms_loaded) {
1813		range_tree_stat_verify(msp->ms_allocatable);
1814		VERIFY(space_map_histogram_verify(msp->ms_sm,
1815		    msp->ms_allocatable));
1816	}
1817
1818	uint64_t weight = msp->ms_weight;
1819	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1820	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
1821	uint64_t frag = msp->ms_fragmentation;
1822	uint64_t max_segsize = msp->ms_max_size;
1823
1824	msp->ms_weight = 0;
1825	msp->ms_fragmentation = 0;
1826
1827	/*
1828	 * This function is used for verification purposes. Regardless of
1829	 * whether metaslab_weight() thinks this metaslab should be active or
1830	 * not, we want to ensure that the actual weight (and therefore the
1831	 * value of ms_weight) would be the same if it was to be recalculated
1832	 * at this point.
1833	 */
1834	msp->ms_weight = metaslab_weight(msp) | was_active;
1835
1836	VERIFY3U(max_segsize, ==, msp->ms_max_size);
1837
1838	/*
1839	 * If the weight type changed then there is no point in doing
1840	 * verification. Revert fields to their original values.
1841	 */
1842	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
1843	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
1844		msp->ms_fragmentation = frag;
1845		msp->ms_weight = weight;
1846		return;
1847	}
1848
1849	VERIFY3U(msp->ms_fragmentation, ==, frag);
1850	VERIFY3U(msp->ms_weight, ==, weight);
1851}
1852
1853/*
1854 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
1855 * this class that was used longest ago, and attempt to unload it.  We don't
1856 * want to spend too much time in this loop, to prevent performance
1857 * degradation, and we expect that most of the time this operation will
1858 * succeed. Between that and the normal unloading processing during txg sync,
1859 * we expect this to keep the metaslab memory usage under control.
1860 */
1861static void
1862metaslab_potentially_evict(metaslab_class_t *mc)
1863{
1864#ifdef _KERNEL
1865	uint64_t allmem = arc_all_memory();
1866	extern kmem_cache_t *range_seg_cache;
1867	uint64_t inuse = kmem_cache_stat(range_seg_cache, "buf_inuse");
1868	uint64_t size = kmem_cache_stat(range_seg_cache, "buf_size");
1869	int tries = 0;
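	/*
	 * Worked example with hypothetical numbers: on a system with
	 * 16 GB of memory and zfs_metaslab_mem_limit set to 25, the
	 * loop below keeps trying to evict while the range_seg_cache
	 * footprint (inuse * size) exceeds 4 GB, and gives up after a
	 * number of attempts equal to twice the number of sublists.
	 */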
1870	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
1871	    tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
1872	    tries++) {
1873		unsigned int idx = multilist_get_random_index(
1874		    mc->mc_metaslab_txg_list);
1875		multilist_sublist_t *mls =
1876		    multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
1877		metaslab_t *msp = multilist_sublist_head(mls);
1878		multilist_sublist_unlock(mls);
1879		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
1880		    inuse * size) {
1881			VERIFY3P(mls, ==, multilist_sublist_lock(
1882			    mc->mc_metaslab_txg_list, idx));
1883			ASSERT3U(idx, ==,
1884			    metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
1885
1886			if (!multilist_link_active(&msp->ms_class_txg_node)) {
1887				multilist_sublist_unlock(mls);
1888				break;
1889			}
1890			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
1891			multilist_sublist_unlock(mls);
1892			/*
1893			 * If the metaslab is currently loading there are two
1894			 * cases. If it's the metaslab we're evicting, we
1895			 * can't continue on or we'll panic when we attempt to
1896			 * recursively lock the mutex. If it's another
1897			 * metaslab that's loading, it can be safely skipped,
1898			 * since we know it's very new and therefore not a
1899			 * good eviction candidate. We check later once the
1900			 * lock is held that the metaslab is fully loaded
1901			 * before actually unloading it.
1902			 */
1903			if (msp->ms_loading) {
1904				msp = next_msp;
1905				inuse = kmem_cache_stat(range_seg_cache,
1906				    "buf_inuse");
1907				continue;
1908			}
1909			/*
1910			 * We can't unload metaslabs with no spacemap because
1911			 * they're not ready to be unloaded yet. We can't
1912			 * unload metaslabs with outstanding allocations
1913			 * because doing so could cause the metaslab's weight
1914			 * to decrease while it's unloaded, which violates an
1915			 * invariant that we use to prevent unnecessary
1916			 * loading. We also don't unload metaslabs that are
1917			 * currently active because they are high-weight
1918			 * metaslabs that are likely to be used in the near
1919			 * future.
1920			 */
1921			mutex_enter(&msp->ms_lock);
1922			if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
1923			    msp->ms_allocating_total == 0) {
1924				metaslab_unload(msp);
1925			}
1926			mutex_exit(&msp->ms_lock);
1927			msp = next_msp;
1928			inuse = kmem_cache_stat(range_seg_cache, "buf_inuse");
1929		}
1930	}
1931#endif
1932}
1933
1934static int
1935metaslab_load_impl(metaslab_t *msp)
1936{
1937	int error = 0;
1938
1939	ASSERT(MUTEX_HELD(&msp->ms_lock));
1940	ASSERT(msp->ms_loading);
1941	ASSERT(!msp->ms_condensing);
1942
1943	/*
1944	 * We temporarily drop the lock to unblock other operations while we
1945	 * are reading the space map. Therefore, metaslab_sync() and
1946	 * metaslab_sync_done() can run at the same time as we do.
1947	 *
1948	 * If we are using the log space maps, metaslab_sync() can't write to
1949	 * the metaslab's space map while we are loading as we only write to
1950	 * it when we are flushing the metaslab, and that can't happen while
1951	 * we are loading it.
1952	 *
1953	 * If we are not using log space maps though, metaslab_sync() can
1954	 * append to the space map while we are loading. Therefore we load
1955	 * only entries that existed when we started the load. Additionally,
1956	 * metaslab_sync_done() has to wait for the load to complete because
1957	 * there are potential races like metaslab_load() loading parts of the
1958	 * space map that are currently being appended by metaslab_sync(). If
1959	 * we didn't, the ms_allocatable would have entries that
1960	 * metaslab_sync_done() would try to re-add later.
1961	 *
1962	 * That's why before dropping the lock we remember the synced length
1963	 * of the metaslab and read up to that point of the space map,
1964	 * ignoring entries appended by metaslab_sync() that happen after we
1965	 * drop the lock.
1966	 */
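	/*
	 * In short, the sequence below is (informal sketch):
	 *
	 *   length = ms_synced_length;      <- snapshot under ms_lock
	 *   drop ms_lock;
	 *   read the space map up to 'length';
	 *   take ms_sync_lock, then ms_lock;
	 *   apply ms_unflushed_allocs/frees and fix up ms_allocatable.
	 */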
1967	uint64_t length = msp->ms_synced_length;
1968	mutex_exit(&msp->ms_lock);
1969
1970	hrtime_t load_start = gethrtime();
1971	if (msp->ms_sm != NULL) {
1972		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
1973		    SM_FREE, length);
1974	} else {
1975		/*
1976		 * The space map has not been allocated yet, so treat
1977		 * all the space in the metaslab as free and add it to the
1978		 * ms_allocatable tree.
1979		 */
1980		range_tree_add(msp->ms_allocatable,
1981		    msp->ms_start, msp->ms_size);
1982
1983		if (msp->ms_freed != NULL) {
1984			/*
1985			 * If the ms_sm doesn't exist, this means that this
1986			 * metaslab hasn't gone through metaslab_sync() and
1987			 * thus has never been dirtied. So we shouldn't
1988			 * expect any unflushed allocs or frees from previous
1989			 * TXGs.
1990			 *
1991			 * Note: ms_freed and all the other trees except for
1992			 * the ms_allocatable, can be NULL at this point only
1993			 * if this is a new metaslab of a vdev that just got
1994			 * expanded.
1995			 */
1996			ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
1997			ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
1998		}
1999	}
2000
2001	/*
2002	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
2003	 * changing the ms_sm (or log_sm) and the metaslab's range trees
2004	 * while we are about to use them and populate the ms_allocatable.
2005	 * The ms_lock is insufficient for this because metaslab_sync() doesn't
2006	 * hold the ms_lock while writing the ms_checkpointing tree to disk.
2007	 */
2008	mutex_enter(&msp->ms_sync_lock);
2009	mutex_enter(&msp->ms_lock);
2010
2011	ASSERT(!msp->ms_condensing);
2012	ASSERT(!msp->ms_flushing);
2013
2014	if (error != 0) {
2015		mutex_exit(&msp->ms_sync_lock);
2016		return (error);
2017	}
2018
2019	ASSERT3P(msp->ms_group, !=, NULL);
2020	msp->ms_loaded = B_TRUE;
2021
2022	/*
2023	 * Apply all the unflushed changes to ms_allocatable right
2024	 * away so any manipulations we do below have a clear view
2025	 * of what is allocated and what is free.
2026	 */
2027	range_tree_walk(msp->ms_unflushed_allocs,
2028	    range_tree_remove, msp->ms_allocatable);
2029	range_tree_walk(msp->ms_unflushed_frees,
2030	    range_tree_add, msp->ms_allocatable);
2031
2034	ASSERT3P(msp->ms_group, !=, NULL);
2035	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2036	if (spa_syncing_log_sm(spa) != NULL) {
2037		ASSERT(spa_feature_is_enabled(spa,
2038		    SPA_FEATURE_LOG_SPACEMAP));
2039
2040		/*
2041		 * If we use a log space map we add all the segments
2042		 * that are in ms_unflushed_frees so they are available
2043		 * for allocation.
2044		 *
2045		 * ms_allocatable needs to contain all free segments
2046		 * that are ready for allocations (thus not segments
2047		 * from ms_freeing, ms_freed, and the ms_defer trees).
2048		 * But if we grab the lock in this code path at a sync
2049	 * pass later than 1, then it also contains the
2050		 * segments of ms_freed (they were added to it earlier
2051		 * in this path through ms_unflushed_frees). So we
2052		 * need to remove all the segments that exist in
2053		 * ms_freed from ms_allocatable as they will be added
2054		 * later in metaslab_sync_done().
2055		 *
2056		 * When there's no log space map, the ms_allocatable
2057		 * correctly doesn't contain any segments that exist
2058		 * in ms_freed [see ms_synced_length].
2059		 */
2060		range_tree_walk(msp->ms_freed,
2061		    range_tree_remove, msp->ms_allocatable);
2062	}
2063
2064	/*
2065	 * If we are not using the log space map, ms_allocatable
2066	 * contains the segments that exist in the ms_defer trees
2067	 * [see ms_synced_length]. Thus we need to remove them
2068	 * from ms_allocatable as they will be added again in
2069	 * metaslab_sync_done().
2070	 *
2071	 * If we are using the log space map, ms_allocatable still
2072	 * contains the segments that exist in the ms_defer trees,
2073	 * not because it read them through the ms_sm, but because
2074	 * these segments are part of ms_unflushed_frees, whose
2075	 * segments we added to ms_allocatable earlier in this
2076	 * code path.
2077	 */
2078	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2079		range_tree_walk(msp->ms_defer[t],
2080		    range_tree_remove, msp->ms_allocatable);
2081	}
2082
2083	/*
2084	 * Call metaslab_recalculate_weight_and_sort() now that the
2085	 * metaslab is loaded so we get the metaslab's real weight.
2086	 *
2087	 * Unless this metaslab was created with older software and
2088	 * has not yet been converted to use segment-based weight, we
2089	 * expect the new weight to be better or equal to the weight
2090	 * that the metaslab had while it was not loaded. This is
2091	 * because the old weight does not take into account the
2092	 * consolidation of adjacent segments between TXGs. [see
2093	 * comment for ms_synchist and ms_deferhist[] for more info]
2094	 */
2095	uint64_t weight = msp->ms_weight;
2096	uint64_t max_size = msp->ms_max_size;
2097	metaslab_recalculate_weight_and_sort(msp);
2098	if (!WEIGHT_IS_SPACEBASED(weight))
2099		ASSERT3U(weight, <=, msp->ms_weight);
2100	msp->ms_max_size = metaslab_largest_allocatable(msp);
2101	ASSERT3U(max_size, <=, msp->ms_max_size);
2102	hrtime_t load_end = gethrtime();
2103	msp->ms_load_time = load_end;
2104	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
2105		zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
2106		    "ms_id %llu, smp_length %llu, "
2107		    "unflushed_allocs %llu, unflushed_frees %llu, "
2108		    "freed %llu, defer %llu + %llu, "
2109		    "loading_time %lld ms, ms_max_size %llu, "
2110		    "max size error %llu",
2111		    spa_syncing_txg(spa), spa_name(spa),
2112		    msp->ms_group->mg_vd->vdev_id, msp->ms_id,
2113		    space_map_length(msp->ms_sm),
2114		    range_tree_space(msp->ms_unflushed_allocs),
2115		    range_tree_space(msp->ms_unflushed_frees),
2116		    range_tree_space(msp->ms_freed),
2117		    range_tree_space(msp->ms_defer[0]),
2118		    range_tree_space(msp->ms_defer[1]),
2119		    (longlong_t)((load_end - load_start) / 1000000),
2120		    msp->ms_max_size, msp->ms_max_size - max_size);
2121	}
2122
2123	metaslab_verify_space(msp, spa_syncing_txg(spa));
2124	mutex_exit(&msp->ms_sync_lock);
2125	return (0);
2126}
2127
2128int
2129metaslab_load(metaslab_t *msp)
2130{
2131	ASSERT(MUTEX_HELD(&msp->ms_lock));
2132
2133	/*
2134	 * There may be another thread loading the same metaslab, if that's
2135	 * the case just wait until the other thread is done and return.
2136	 */
2137	metaslab_load_wait(msp);
2138	if (msp->ms_loaded)
2139		return (0);
2140	VERIFY(!msp->ms_loading);
2141	ASSERT(!msp->ms_condensing);
2142
2143	/*
2144	 * We set the loading flag BEFORE potentially dropping the lock to
2145	 * wait for an ongoing flush (see ms_flushing below). This way other
2146	 * threads know that there is already a thread that is loading this
2147	 * metaslab.
2148	 */
2149	msp->ms_loading = B_TRUE;
2150
2151	/*
2152	 * Wait for any in-progress flushing to finish as we drop the ms_lock
2153	 * both here (during space_map_load()) and in metaslab_flush() (when
2154	 * we flush our changes to the ms_sm).
2155	 */
2156	if (msp->ms_flushing)
2157		metaslab_flush_wait(msp);
2158
2159	/*
2160	 * In case we were waiting for the metaslab to be
2161	 * flushed (where we temporarily dropped the ms_lock), ensure that
2162	 * no one else loaded the metaslab somehow.
2163	 */
2164	ASSERT(!msp->ms_loaded);
2165
2166	/*
2167	 * If we're loading a metaslab in the normal class, consider evicting
2168	 * another one to keep our memory usage under the limit defined by the
2169	 * zfs_metaslab_mem_limit tunable.
2170	 */
2171	if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2172	    msp->ms_group->mg_class) {
2173		metaslab_potentially_evict(msp->ms_group->mg_class);
2174	}
2175
2176	int error = metaslab_load_impl(msp);
2177
2178	ASSERT(MUTEX_HELD(&msp->ms_lock));
2179	msp->ms_loading = B_FALSE;
2180	cv_broadcast(&msp->ms_load_cv);
2181
2182	return (error);
2183}
2184
2185void
2186metaslab_unload(metaslab_t *msp)
2187{
2188	ASSERT(MUTEX_HELD(&msp->ms_lock));
2189
2190	/*
2191	 * This can happen if a metaslab is selected for eviction (in
2192	 * metaslab_potentially_evict) and then unloaded during spa_sync (via
2193	 * metaslab_class_evict_old).
2194	 */
2195	if (!msp->ms_loaded)
2196		return;
2197
2198	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
2199	msp->ms_loaded = B_FALSE;
2200	msp->ms_unload_time = gethrtime();
2201
2202	msp->ms_activation_weight = 0;
2203	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
2204
2205	if (msp->ms_group != NULL) {
2206		metaslab_class_t *mc = msp->ms_group->mg_class;
2207		multilist_sublist_t *mls =
2208		    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2209		if (multilist_link_active(&msp->ms_class_txg_node))
2210			multilist_sublist_remove(mls, msp);
2211		multilist_sublist_unlock(mls);
2212	}
2213
2214	/*
2215	 * We explicitly recalculate the metaslab's weight based on its space
2216	 * map (as it is now not loaded). We want unloaded metaslabs to always
2217	 * have their weights calculated from the space map histograms, while
2218	 * loaded ones have it calculated from their in-core range tree
2219	 * [see metaslab_load()]. This way, the weight reflects the information
2220	 * available in-core, whether it is loaded or not.
2221	 *
2222	 * If ms_group == NULL, it means that we came here from metaslab_fini(),
2223	 * at which point it doesn't make sense for us to do the recalculation
2224	 * and the sorting.
2225	 */
2226	if (msp->ms_group != NULL)
2227		metaslab_recalculate_weight_and_sort(msp);
2228}
2229
2230void
2231metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
2232{
2233	ASSERT(MUTEX_HELD(&msp->ms_lock));
2234	metaslab_class_t *mc = msp->ms_group->mg_class;
2235	multilist_sublist_t *mls =
2236	    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2237	if (multilist_link_active(&msp->ms_class_txg_node))
2238		multilist_sublist_remove(mls, msp);
2239	msp->ms_selected_txg = txg;
2240	msp->ms_selected_time = gethrtime();
2241	multilist_sublist_insert_tail(mls, msp);
2242	multilist_sublist_unlock(mls);
2243}
2244
2245void
2246metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
2247    int64_t defer_delta, int64_t space_delta)
2248{
2249	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
2250
2251	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2252	ASSERT(vd->vdev_ms_count != 0);
2253
2254	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
2255	    vdev_deflated_space(vd, space_delta));
2256}
2257
2258int
2259metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
2260    uint64_t txg, metaslab_t **msp)
2261{
2262	vdev_t *vd = mg->mg_vd;
2263	spa_t *spa = vd->vdev_spa;
2264	objset_t *mos = spa->spa_meta_objset;
2265	metaslab_t *ms;
2266	int error;
2267
2268	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
2269	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
2270	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
2271	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
2272	cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
2273	multilist_link_init(&ms->ms_class_txg_node);
2274
2275	ms->ms_id = id;
2276	ms->ms_start = id << vd->vdev_ms_shift;
2277	ms->ms_size = 1ULL << vd->vdev_ms_shift;
2278	ms->ms_allocator = -1;
2279	ms->ms_new = B_TRUE;
2280
2281	/*
2282	 * We only open space map objects that already exist. All others
2283	 * will be opened when we finally allocate an object for them.
2284	 *
2285	 * Note:
2286	 * When called from vdev_expand(), we can't call into the DMU as
2287	 * we are holding the spa_config_lock as a writer and we would
2288	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
2289	 * that case, the object parameter is zero though, so we won't
2290	 * call into the DMU.
2291	 */
2292	if (object != 0) {
2293		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
2294		    ms->ms_size, vd->vdev_ashift);
2295
2296		if (error != 0) {
2297			kmem_free(ms, sizeof (metaslab_t));
2298			return (error);
2299		}
2300
2301		ASSERT(ms->ms_sm != NULL);
2302		ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
2303		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
2304	}
2305
2306	/*
2307	 * We create the ms_allocatable here, but we don't create the
2308	 * other range trees until metaslab_sync_done().  This serves
2309	 * two purposes: it allows metaslab_sync_done() to detect the
2310	 * addition of new space; and for debugging, it ensures that
2311	 * we'd data fault on any attempt to use this metaslab before
2312	 * it's ready.
2313	 */
2314	ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
2315	    &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
2316
2317	ms->ms_trim = range_tree_create(NULL, NULL);
2318
2319	metaslab_group_add(mg, ms);
2320	metaslab_set_fragmentation(ms);
2321
2322	/*
2323	 * If we're opening an existing pool (txg == 0) or creating
2324	 * a new one (txg == TXG_INITIAL), all space is available now.
2325	 * If we're adding space to an existing pool, the new space
2326	 * does not become available until after this txg has synced.
2327	 * The metaslab's weight will also be initialized when we sync
2328	 * out this txg. This ensures that we don't attempt to allocate
2329	 * from it before we have initialized it completely.
2330	 */
2331	if (txg <= TXG_INITIAL) {
2332		metaslab_sync_done(ms, 0);
2333		metaslab_space_update(vd, mg->mg_class,
2334		    metaslab_allocated_space(ms), 0, 0);
2335	}
2336
2337	if (txg != 0) {
2338		vdev_dirty(vd, 0, NULL, txg);
2339		vdev_dirty(vd, VDD_METASLAB, ms, txg);
2340	}
2341
2342	*msp = ms;
2343
2344	return (0);
2345}
2346
2347static void
2348metaslab_fini_flush_data(metaslab_t *msp)
2349{
2350	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2351
2352	if (metaslab_unflushed_txg(msp) == 0) {
2353		ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2354		    ==, NULL);
2355		return;
2356	}
2357	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
2358
2359	mutex_enter(&spa->spa_flushed_ms_lock);
2360	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2361	mutex_exit(&spa->spa_flushed_ms_lock);
2362
2363	spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2364	spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2365}
2366
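/*
 * A rough sense of scale (hypothetical sizes): if range_seg_t is on the
 * order of 64 bytes, a metaslab carrying 10,000 unflushed allocs and frees
 * combined accounts for roughly 640 KB of the pool-wide sus_memused budget
 * tracked below.
 */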
2367uint64_t
2368metaslab_unflushed_changes_memused(metaslab_t *ms)
2369{
2370	return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2371	    range_tree_numsegs(ms->ms_unflushed_frees)) *
2372	    sizeof (range_seg_t));
2373}
2374
2375void
2376metaslab_fini(metaslab_t *msp)
2377{
2378	metaslab_group_t *mg = msp->ms_group;
2379	vdev_t *vd = mg->mg_vd;
2380	spa_t *spa = vd->vdev_spa;
2381
2382	metaslab_fini_flush_data(msp);
2383
2384	metaslab_group_remove(mg, msp);
2385
2386	mutex_enter(&msp->ms_lock);
2387	VERIFY(msp->ms_group == NULL);
2388	metaslab_space_update(vd, mg->mg_class,
2389	    -metaslab_allocated_space(msp), 0, -msp->ms_size);
2390
2391	space_map_close(msp->ms_sm);
2392	msp->ms_sm = NULL;
2393
2394	metaslab_unload(msp);
2395	range_tree_destroy(msp->ms_allocatable);
2396	range_tree_destroy(msp->ms_freeing);
2397	range_tree_destroy(msp->ms_freed);
2398
2399	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2400	    metaslab_unflushed_changes_memused(msp));
2401	spa->spa_unflushed_stats.sus_memused -=
2402	    metaslab_unflushed_changes_memused(msp);
2403	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2404	range_tree_destroy(msp->ms_unflushed_allocs);
2405	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2406	range_tree_destroy(msp->ms_unflushed_frees);
2407
2408	for (int t = 0; t < TXG_SIZE; t++) {
2409		range_tree_destroy(msp->ms_allocating[t]);
2410	}
2411
2412	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2413		range_tree_destroy(msp->ms_defer[t]);
2414	}
2415	ASSERT0(msp->ms_deferspace);
2416
2417	range_tree_destroy(msp->ms_checkpointing);
2418
2419	for (int t = 0; t < TXG_SIZE; t++)
2420		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2421
2422	range_tree_vacate(msp->ms_trim, NULL, NULL);
2423	range_tree_destroy(msp->ms_trim);
2424
2425	mutex_exit(&msp->ms_lock);
2426	cv_destroy(&msp->ms_load_cv);
2427	cv_destroy(&msp->ms_flush_cv);
2428	mutex_destroy(&msp->ms_lock);
2429	mutex_destroy(&msp->ms_sync_lock);
2430	ASSERT3U(msp->ms_allocator, ==, -1);
2431
2432	kmem_free(msp, sizeof (metaslab_t));
2433}
2434
2435#define	FRAGMENTATION_TABLE_SIZE	17
2436
2437/*
2438 * This table defines a segment size based fragmentation metric that will
2439 * allow each metaslab to derive its own fragmentation value. This is done
2440 * by calculating the space in each bucket of the spacemap histogram and
2441 * multiplying that by the fragmentation metric in this table. Doing
2442 * this for all buckets and dividing it by the total amount of free
2443 * space in this metaslab (i.e. the total free space in all buckets) gives
2444 * us the fragmentation metric. This means that a high fragmentation metric
2445 * equates to most of the free space being comprised of small segments.
2446 * Conversely, if the metric is low, then most of the free space is in
2447 * large segments. A 10% change in fragmentation equates to approximately
2448 * double the number of segments.
2449 *
2450 * This table defines 0% fragmented space using 16MB segments. Testing has
2451 * shown that segments that are greater than or equal to 16MB do not suffer
2452 * from drastic performance problems. Using this value, we derive the rest
2453 * of the table. Since the fragmentation value is never stored on disk, it
2454 * is possible to change these calculations in the future.
2455 */
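/*
 * Worked example (hypothetical distribution): if half of a metaslab's free
 * space sits in 512K segments (metric 30) and the other half in 8K segments
 * (metric 90), the space-weighted average gives a fragmentation of
 * (50% * 30) + (50% * 90) = 60.
 */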
2456int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
2457	100,	/* 512B	*/
2458	100,	/* 1K	*/
2459	98,	/* 2K	*/
2460	95,	/* 4K	*/
2461	90,	/* 8K	*/
2462	80,	/* 16K	*/
2463	70,	/* 32K	*/
2464	60,	/* 64K	*/
2465	50,	/* 128K	*/
2466	40,	/* 256K	*/
2467	30,	/* 512K	*/
2468	20,	/* 1M	*/
2469	15,	/* 2M	*/
2470	10,	/* 4M	*/
2471	5,	/* 8M	*/
2472	0	/* 16M	*/
2473};
2474
2475/*
2476 * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2477 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2478 * been upgraded and does not support this metric. Otherwise, the value
2479 * is in the range [0, 100].
2480 */
2481static void
2482metaslab_set_fragmentation(metaslab_t *msp)
2483{
2484	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2485	uint64_t fragmentation = 0;
2486	uint64_t total = 0;
2487	boolean_t feature_enabled = spa_feature_is_enabled(spa,
2488	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
2489
2490	if (!feature_enabled) {
2491		msp->ms_fragmentation = ZFS_FRAG_INVALID;
2492		return;
2493	}
2494
2495	/*
2496	 * A null space map means that the entire metaslab is free
2497	 * and thus is not fragmented.
2498	 */
2499	if (msp->ms_sm == NULL) {
2500		msp->ms_fragmentation = 0;
2501		return;
2502	}
2503
2504	/*
2505	 * If this metaslab's space map has not been upgraded, flag it
2506	 * so that we upgrade next time we encounter it.
2507	 */
2508	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
2509		uint64_t txg = spa_syncing_txg(spa);
2510		vdev_t *vd = msp->ms_group->mg_vd;
2511
2512		/*
2513		 * If we've reached the final dirty txg, then we must
2514		 * be shutting down the pool. We don't want to dirty
2515		 * any data past this point so skip setting the condense
2516		 * flag. We can retry this action the next time the pool
2517		 * is imported.
2518		 */
2519		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
2520			msp->ms_condense_wanted = B_TRUE;
2521			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2522			zfs_dbgmsg("txg %llu, requesting force condense: "
2523			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
2524			    vd->vdev_id);
2525		}
2526		msp->ms_fragmentation = ZFS_FRAG_INVALID;
2527		return;
2528	}
2529
2530	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2531		uint64_t space = 0;
2532		uint8_t shift = msp->ms_sm->sm_shift;
2533
2534		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2535		    FRAGMENTATION_TABLE_SIZE - 1);
2536
2537		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
2538			continue;
2539
2540		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
2541		total += space;
2542
2543		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
2544		fragmentation += space * zfs_frag_table[idx];
2545	}
2546
2547	if (total > 0)
2548		fragmentation /= total;
2549	ASSERT3U(fragmentation, <=, 100);
2550
2551	msp->ms_fragmentation = fragmentation;
2552}
2553
2554/*
2555 * Compute a weight -- a selection preference value -- for the given metaslab.
2556 * This is based on the amount of free space, the level of fragmentation,
2557 * the LBA range, and whether the metaslab is loaded.
2558 */
2559static uint64_t
2560metaslab_space_weight(metaslab_t *msp)
2561{
2562	metaslab_group_t *mg = msp->ms_group;
2563	vdev_t *vd = mg->mg_vd;
2564	uint64_t weight, space;
2565
2566	ASSERT(MUTEX_HELD(&msp->ms_lock));
2567
2568	/*
2569	 * The baseline weight is the metaslab's free space.
2570	 */
2571	space = msp->ms_size - metaslab_allocated_space(msp);
2572
2573	if (metaslab_fragmentation_factor_enabled &&
2574	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2575		/*
2576		 * Use the fragmentation information to inversely scale
2577		 * down the baseline weight. We need to ensure that we
2578		 * don't exclude this metaslab completely when it's 100%
2579		 * fragmented. To avoid this we reduce the fragmented value
2580		 * by 1.
2581		 */
2582		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
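
		/*
		 * For example (hypothetical values): at 50% fragmentation
		 * the baseline is scaled to 51% of the free space, and
		 * even at 100% fragmentation 1% of it survives, so a
		 * fully fragmented metaslab is never zeroed out here.
		 */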
2583
2584		/*
2585		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
2586		 * this metaslab again. The fragmentation metric may have
2587		 * decreased the space to something smaller than
2588		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
2589		 * so that we can consume any remaining space.
2590		 */
2591		if (space > 0 && space < SPA_MINBLOCKSIZE)
2592			space = SPA_MINBLOCKSIZE;
2593	}
2594	weight = space;
2595
2596	/*
2597	 * Modern disks have uniform bit density and constant angular velocity.
2598	 * Therefore, the outer recording zones are faster (higher bandwidth)
2599	 * than the inner zones by the ratio of outer to inner track diameter,
2600	 * which is typically around 2:1.  We account for this by assigning
2601	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
2602	 * In effect, this means that we'll select the metaslab with the most
2603	 * free bandwidth rather than simply the one with the most free space.
2604	 */
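	/*
	 * For example (hypothetical vdev): with 200 metaslabs, ms_id 0
	 * keeps the full 2x multiplier, ms_id 100 ends up around 1.5x,
	 * and the last metaslab gets just over 1x of its free space.
	 */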
2605	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
2606		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
2607		ASSERT(weight >= space && weight <= 2 * space);
2608	}
2609
2610	/*
2611	 * If this metaslab is one we're actively using, adjust its
2612	 * weight to make it preferable to any inactive metaslab so
2613	 * we'll polish it off. If the fragmentation on this metaslab
2614	 * has exceeded our threshold, then don't mark it active.
2615	 */
2616	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
2617	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
2618		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
2619	}
2620
2621	WEIGHT_SET_SPACEBASED(weight);
2622	return (weight);
2623}
2624
2625/*
2626 * Return the weight of the specified metaslab, according to the segment-based
2627 * weighting algorithm. The metaslab must be loaded. This function can
2628 * be called within a sync pass since it relies only on the metaslab's
2629 * range tree which is always accurate when the metaslab is loaded.
2630 */
2631static uint64_t
2632metaslab_weight_from_range_tree(metaslab_t *msp)
2633{
2634	uint64_t weight = 0;
2635	uint32_t segments = 0;
2636
2637	ASSERT(msp->ms_loaded);
2638
2639	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
2640	    i--) {
2641		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
2642		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2643
2644		segments <<= 1;
2645		segments += msp->ms_allocatable->rt_histogram[i];
2646
2647		/*
2648		 * The range tree provides more precision than the space map
2649		 * and must be downgraded so that all values fit within the
2650		 * space map's histogram. This allows us to compare loaded
2651		 * vs. unloaded metaslabs to determine which metaslab is
2652		 * considered "best".
2653		 */
2654		if (i > max_idx)
2655			continue;
2656
2657		if (segments != 0) {
2658			WEIGHT_SET_COUNT(weight, segments);
2659			WEIGHT_SET_INDEX(weight, i);
2660			WEIGHT_SET_ACTIVE(weight, 0);
2661			break;
2662		}
2663	}
2664	return (weight);
2665}
2666
2667/*
2668 * Calculate the weight based on the on-disk histogram. Should be applied
2669 * only to unloaded metaslabs (i.e. no incoming allocations) in order to
2670 * give results consistent with the on-disk state.
2671 */
2672static uint64_t
2673metaslab_weight_from_spacemap(metaslab_t *msp)
2674{
2675	space_map_t *sm = msp->ms_sm;
2676	ASSERT(!msp->ms_loaded);
2677	ASSERT(sm != NULL);
2678	ASSERT3U(space_map_object(sm), !=, 0);
2679	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2680
2681	/*
2682	 * Create a joint histogram from all the segments that have made
2683	 * it to the metaslab's space map histogram, that are not yet
2684	 * available for allocation because they are still in the freeing
2685	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
2686	 * these segments from the space map's histogram to get a more
2687	 * accurate weight.
2688	 */
2689	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
2690	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
2691		deferspace_histogram[i] += msp->ms_synchist[i];
2692	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2693		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2694			deferspace_histogram[i] += msp->ms_deferhist[t][i];
2695		}
2696	}
2697
2698	uint64_t weight = 0;
2699	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
2700		ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
2701		    deferspace_histogram[i]);
2702		uint64_t count =
2703		    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
2704		if (count != 0) {
2705			WEIGHT_SET_COUNT(weight, count);
2706			WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
2707			WEIGHT_SET_ACTIVE(weight, 0);
2708			break;
2709		}
2710	}
2711	return (weight);
2712}
2713
2714/*
2715 * Compute a segment-based weight for the specified metaslab. The weight
2716 * is determined by the highest bucket in the histogram. The information
2717 * for the highest bucket is encoded into the weight value.
2718 */
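/*
 * For instance (hypothetical contents): if the largest free segments of a
 * loaded metaslab are three chunks in the [1M, 2M) bucket, the weight
 * encodes a count of 3 and an index of 20 (2^20 = 1M), with the active
 * bits clear.
 */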
2719static uint64_t
2720metaslab_segment_weight(metaslab_t *msp)
2721{
2722	metaslab_group_t *mg = msp->ms_group;
2723	uint64_t weight = 0;
2724	uint8_t shift = mg->mg_vd->vdev_ashift;
2725
2726	ASSERT(MUTEX_HELD(&msp->ms_lock));
2727
2728	/*
2729	 * The metaslab is completely free.
2730	 */
2731	if (metaslab_allocated_space(msp) == 0) {
2732		int idx = highbit64(msp->ms_size) - 1;
2733		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2734
2735		if (idx < max_idx) {
2736			WEIGHT_SET_COUNT(weight, 1ULL);
2737			WEIGHT_SET_INDEX(weight, idx);
2738		} else {
2739			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
2740			WEIGHT_SET_INDEX(weight, max_idx);
2741		}
2742		WEIGHT_SET_ACTIVE(weight, 0);
2743		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
2744		return (weight);
2745	}
2746
2747	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2748
2749	/*
2750	 * If the metaslab is fully allocated then just make the weight 0.
2751	 */
2752	if (metaslab_allocated_space(msp) == msp->ms_size)
2753		return (0);
2754	/*
2755	 * If the metaslab is already loaded, then use the range tree to
2756	 * determine the weight. Otherwise, we rely on the space map information
2757	 * to generate the weight.
2758	 */
2759	if (msp->ms_loaded) {
2760		weight = metaslab_weight_from_range_tree(msp);
2761	} else {
2762		weight = metaslab_weight_from_spacemap(msp);
2763	}
2764
2765	/*
2766	 * If the metaslab was active the last time we calculated its weight
2767	 * then keep it active. We want to consume the entire region that
2768	 * is associated with this weight.
2769	 */
2770	if (msp->ms_activation_weight != 0 && weight != 0)
2771		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2772	return (weight);
2773}
2774
2775/*
2776 * Determine if we should attempt to allocate from this metaslab. If the
2777 * metaslab is loaded, then we can determine if the desired allocation
2778 * can be satisfied by looking at the size of the maximum free segment
2779 * on that metaslab. Otherwise, we make our decision based on the metaslab's
2780 * weight. For segment-based weighting we can determine the maximum
2781 * allocation based on the index encoded in its value. For space-based
2782 * weights we rely on the entire weight (excluding the weight-type bit).
2783 */
2784boolean_t
2785metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
2786{
2787	/*
2788	 * If the metaslab is loaded, ms_max_size is definitive and we can use
2789	 * the fast check. If it's not, the ms_max_size is a lower bound (once
2790	 * set), and we should use the fast check as long as we're not in
2791	 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
2792	 * seconds since the metaslab was unloaded.
2793	 */
2794	if (msp->ms_loaded ||
2795	    (msp->ms_max_size != 0 && !try_hard && gethrtime() <
2796	    msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
2797		return (msp->ms_max_size >= asize);
2798
2799	boolean_t should_allocate;
2800	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
2801		/*
2802		 * The metaslab segment weight indicates segments in the
2803		 * range [2^i, 2^(i+1)), where i is the index in the weight.
2804		 * Since the asize might be in the middle of the range, we
2805		 * should attempt the allocation if asize < 2^(i+1).
2806		 */
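		/*
		 * Example (hypothetical weight): with index 20 the largest
		 * known segments lie in [1M, 2M), so a 1.5M request is
		 * attempted while a 2M request is not.
		 */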
2807		should_allocate = (asize <
2808		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
2809	} else {
2810		should_allocate = (asize <=
2811		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
2812	}
2813
2814	return (should_allocate);
2815}
2816
2817static uint64_t
2818metaslab_weight(metaslab_t *msp)
2819{
2820	vdev_t *vd = msp->ms_group->mg_vd;
2821	spa_t *spa = vd->vdev_spa;
2822	uint64_t weight;
2823
2824	ASSERT(MUTEX_HELD(&msp->ms_lock));
2825
2826	metaslab_set_fragmentation(msp);
2827
2828	/*
2829	 * Update the maximum size. If the metaslab is loaded, this will
2830	 * ensure that we get an accurate maximum size if newly freed space
2831	 * has been added back into the free tree. If the metaslab is
2832	 * unloaded, we check if there's a larger free segment in the
2833	 * unflushed frees. This is a lower bound on the largest allocatable
2834	 * segment size. Coalescing of adjacent entries may reveal larger
2835	 * allocatable segments, but we aren't aware of those until loading
2836	 * the space map into a range tree.
2837	 */
2838	if (msp->ms_loaded) {
2839		msp->ms_max_size = metaslab_largest_allocatable(msp);
2840	} else {
2841		msp->ms_max_size = MAX(msp->ms_max_size,
2842		    metaslab_largest_unflushed_free(msp));
2843	}
2844
2845	/*
2846	 * Segment-based weighting requires space map histogram support.
2847	 */
2848	if (zfs_metaslab_segment_weight_enabled &&
2849	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2850	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2851	    sizeof (space_map_phys_t))) {
2852		weight = metaslab_segment_weight(msp);
2853	} else {
2854		weight = metaslab_space_weight(msp);
2855	}
2856	return (weight);
2857}
2858
2859void
2860metaslab_recalculate_weight_and_sort(metaslab_t *msp)
2861{
2862	ASSERT(MUTEX_HELD(&msp->ms_lock));
2863
2864	/* note: we preserve the mask (e.g. indication of primary, etc..) */
2865	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2866	metaslab_group_sort(msp->ms_group, msp,
2867	    metaslab_weight(msp) | was_active);
2868}
2869
2870static int
2871metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2872    int allocator, uint64_t activation_weight)
2873{
2874	ASSERT(MUTEX_HELD(&msp->ms_lock));
2875
2876	/*
2877	 * If we're activating for the claim code, we don't want to actually
2878	 * set the metaslab up for a specific allocator.
2879	 */
2880	if (activation_weight == METASLAB_WEIGHT_CLAIM) {
2881		ASSERT0(msp->ms_activation_weight);
2882		msp->ms_activation_weight = msp->ms_weight;
2883		metaslab_group_sort(mg, msp, msp->ms_weight |
2884		    activation_weight);
2885		return (0);
2886	}
2887
2888	metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2889	    mg->mg_primaries : mg->mg_secondaries);
2890
2891	mutex_enter(&mg->mg_lock);
2892	if (arr[allocator] != NULL) {
2893		mutex_exit(&mg->mg_lock);
2894		return (EEXIST);
2895	}
2896
2897	arr[allocator] = msp;
2898	ASSERT3S(msp->ms_allocator, ==, -1);
2899	msp->ms_allocator = allocator;
2900	msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
2901
2902	ASSERT0(msp->ms_activation_weight);
2903	msp->ms_activation_weight = msp->ms_weight;
2904	metaslab_group_sort_impl(mg, msp,
2905	    msp->ms_weight | activation_weight);
2906
2907	mutex_exit(&mg->mg_lock);
2908
2909	return (0);
2910}
2911
2912static int
2913metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
2914{
2915	ASSERT(MUTEX_HELD(&msp->ms_lock));
2916
2917	/*
2918	 * The current metaslab is already activated for us so there
2919	 * is nothing to do. Being already activated, though, does not
2920	 * mean that it is activated for our allocator or with our
2921	 * requested activation weight. The metaslab could have started
2922	 * as an active one for our allocator but changed allocators
2923	 * while we were waiting to grab its ms_lock or we stole it
2924	 * [see find_valid_metaslab()]. This means that there is a
2925	 * possibility of passivating a metaslab of another allocator
2926	 * or from a different activation mask, from this thread.
2927	 */
2928	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2929		ASSERT(msp->ms_loaded);
2930		return (0);
2931	}
2932
2933	int error = metaslab_load(msp);
2934	if (error != 0) {
2935		metaslab_group_sort(msp->ms_group, msp, 0);
2936		return (error);
2937	}
2938
2939	/*
2940	 * When entering metaslab_load() we may have dropped the
2941	 * ms_lock because we were loading this metaslab, or we
2942	 * were waiting for another thread to load it for us. In
2943	 * that scenario, we recheck the weight of the metaslab
2944	 * to see if it was activated by another thread.
2945	 *
2946	 * If the metaslab was activated for another allocator or
2947	 * it was activated with a different activation weight (e.g.
2948	 * we wanted to make it a primary but it was activated as
2949	 * secondary) we return error (EBUSY).
2950	 *
2951	 * If the metaslab was activated for the same allocator
2952	 * and requested activation mask, skip activating it.
2953	 */
2954	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
2955		if (msp->ms_allocator != allocator)
2956			return (EBUSY);
2957
2958		if ((msp->ms_weight & activation_weight) == 0)
2959			return (EBUSY);
2960
2961		EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
2962		    msp->ms_primary);
2963		return (0);
2964	}
2965
2966	/*
2967	 * If the metaslab has literally 0 space, it will have weight 0. In
2968	 * that case, don't bother activating it. This can happen if the
2969	 * metaslab had space during find_valid_metaslab, but another thread
2970	 * loaded it and used all that space while we were waiting to grab the
2971	 * lock.
2972	 */
2973	if (msp->ms_weight == 0) {
2974		ASSERT0(range_tree_space(msp->ms_allocatable));
2975		return (SET_ERROR(ENOSPC));
2976	}
2977
2978	if ((error = metaslab_activate_allocator(msp->ms_group, msp,
2979	    allocator, activation_weight)) != 0) {
2980		return (error);
2981	}
2982
2983	ASSERT(msp->ms_loaded);
2984	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
2985
2986	return (0);
2987}
2988
2989static void
2990metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2991    uint64_t weight)
2992{
2993	ASSERT(MUTEX_HELD(&msp->ms_lock));
2994	ASSERT(msp->ms_loaded);
2995
2996	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
2997		metaslab_group_sort(mg, msp, weight);
2998		return;
2999	}
3000
3001	mutex_enter(&mg->mg_lock);
3002	ASSERT3P(msp->ms_group, ==, mg);
3003	ASSERT3S(0, <=, msp->ms_allocator);
3004	ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3005
3006	if (msp->ms_primary) {
3007		ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
3008		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3009		mg->mg_primaries[msp->ms_allocator] = NULL;
3010	} else {
3011		ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
3012		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
3013		mg->mg_secondaries[msp->ms_allocator] = NULL;
3014	}
3015	msp->ms_allocator = -1;
3016	metaslab_group_sort_impl(mg, msp, weight);
3017	mutex_exit(&mg->mg_lock);
3018}
3019
3020static void
3021metaslab_passivate(metaslab_t *msp, uint64_t weight)
3022{
3023	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
3024
3025	/*
3026	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
3027	 * this metaslab again.  In that case, it had better be empty,
3028	 * or we would be leaving space on the table.
3029	 */
3030	ASSERT(size >= SPA_MINBLOCKSIZE ||
3031	    range_tree_is_empty(msp->ms_allocatable));
3032	ASSERT0(weight & METASLAB_ACTIVE_MASK);
3033
3034	ASSERT(msp->ms_activation_weight != 0);
3035	msp->ms_activation_weight = 0;
3036	metaslab_passivate_allocator(msp->ms_group, msp, weight);
3037	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
3038}
3039
3040/*
3041 * Segment-based metaslabs are activated once and remain active until
3042 * we either fail an allocation attempt (similar to space-based metaslabs)
3043 * or have exhausted the free space in zfs_metaslab_switch_threshold
3044 * buckets since the metaslab was activated. This function checks to see
3045 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
3046 * metaslab and passivates it proactively. This will allow us to select a
3047 * metaslab with a larger contiguous region, if any remains within this
3048 * metaslab group. If we're in sync pass > 1, then we continue using this
3049 * metaslab so that we don't dirty more blocks and cause more sync passes.
3050 */
3051void
3052metaslab_segment_may_passivate(metaslab_t *msp)
3053{
3054	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3055
3056	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
3057		return;
3058
3059	/*
3060	 * Since we are in the middle of a sync pass, the most accurate
3061	 * information that is accessible to us is the in-core range tree
3062	 * histogram; calculate the new weight based on that information.
3063	 */
3064	uint64_t weight = metaslab_weight_from_range_tree(msp);
3065	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
3066	int current_idx = WEIGHT_GET_INDEX(weight);
3067
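	/*
	 * Example (hypothetical threshold of 2): a metaslab activated at
	 * index 22 (4M segments) is passivated once the largest bucket
	 * in its range tree histogram drops to index 20 or below.
	 */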
3068	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
3069		metaslab_passivate(msp, weight);
3070}
3071
3072static void
3073metaslab_preload(void *arg)
3074{
3075	metaslab_t *msp = arg;
3076	metaslab_class_t *mc = msp->ms_group->mg_class;
3077	spa_t *spa = mc->mc_spa;
3078
3079	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3080
3081	mutex_enter(&msp->ms_lock);
3082	(void) metaslab_load(msp);
3083	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
3084	mutex_exit(&msp->ms_lock);
3085}
3086
3087static void
3088metaslab_group_preload(metaslab_group_t *mg)
3089{
3090	spa_t *spa = mg->mg_vd->vdev_spa;
3091	metaslab_t *msp;
3092	avl_tree_t *t = &mg->mg_metaslab_tree;
3093	int m = 0;
3094
3095	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
3096		taskq_wait(mg->mg_taskq);
3097		return;
3098	}
3099
3100	mutex_enter(&mg->mg_lock);
3101
3102	/*
3103	 * Load the next potential metaslabs
3104	 */
3105	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
3106		ASSERT3P(msp->ms_group, ==, mg);
3107
3108		/*
3109		 * We preload only the maximum number of metaslabs specified
3110		 * by metaslab_preload_limit. If a metaslab is being forced
3111		 * to condense then we preload it too. This will ensure
3112		 * that the forced condensing happens in the next txg.
3113		 */
3114		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
3115			continue;
3116		}
3117
3118		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
3119		    msp, TQ_SLEEP) != TASKQID_INVALID);
3120	}
3121	mutex_exit(&mg->mg_lock);
3122}
3123
3124/*
3125 * Determine if the space map's on-disk footprint is past our tolerance for
3126 * inefficiency. We would like to use the following criteria to make our
3127 * decision:
3128 *
3129 * 1. Do not condense if the size of the space map object would dramatically
3130 *    increase as a result of writing out the free space range tree.
3131 *
3132 * 2. Condense if the on-disk space map representation is at least
3133 *    zfs_condense_pct/100 times the size of the optimal representation
3134 *    (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
3135 *
3136 * 3. Do not condense if the on-disk size of the space map does not actually
3137 *    decrease.
3138 *
3139 * Unfortunately, we cannot compute the on-disk size of the space map in this
3140 * context because we cannot accurately compute the effects of compression, etc.
3141 * Instead, we apply the heuristic described in the block comment for
3142 * zfs_metaslab_condense_block_threshold - we only condense if the space used
3143 * is greater than a threshold number of blocks.
3144 */
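/*
 * As a rough example (hypothetical tunable values): with zfs_condense_pct
 * at 200 and zfs_metaslab_condense_block_threshold at 4, a space map whose
 * optimal representation is 1 MB is condensed only once its on-disk length
 * reaches 2 MB, and only if that length also exceeds 4 space map blocks.
 */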
3145static boolean_t
3146metaslab_should_condense(metaslab_t *msp)
3147{
3148	space_map_t *sm = msp->ms_sm;
3149	vdev_t *vd = msp->ms_group->mg_vd;
3150	uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
3151
3152	ASSERT(MUTEX_HELD(&msp->ms_lock));
3153	ASSERT(msp->ms_loaded);
3154	ASSERT(sm != NULL);
3155	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
3156
3157	/*
3158	 * We always condense metaslabs that are empty and metaslabs for
3159	 * which a condense request has been made.
3160	 */
3161	if (avl_is_empty(&msp->ms_allocatable_by_size) ||
3162	    msp->ms_condense_wanted)
3163		return (B_TRUE);
3164
3165	uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3166	uint64_t object_size = space_map_length(sm);
3167	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
3168	    msp->ms_allocatable, SM_NO_VDEVID);
3169
3170	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
3171	    object_size > zfs_metaslab_condense_block_threshold * record_size);
3172}
3173
3174/*
3175 * Condense the on-disk space map representation to its minimized form.
3176 * The minimized form consists of a small number of allocations followed
3177 * by the entries of the free range tree (ms_allocatable). The condensed
3178 * spacemap contains all the entries of previous TXGs (including those in
3179 * the pool-wide log spacemaps; thus this is effectively a superset of
3180 * metaslab_flush()), but this TXG's entries still need to be written.
3181 */
3182static void
3183metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
3184{
3185	range_tree_t *condense_tree;
3186	space_map_t *sm = msp->ms_sm;
3187	uint64_t txg = dmu_tx_get_txg(tx);
3188	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3189
3190	ASSERT(MUTEX_HELD(&msp->ms_lock));
3191	ASSERT(msp->ms_loaded);
3192	ASSERT(msp->ms_sm != NULL);
3193
3194	/*
3195	 * In order to condense the space map, we need to change it so it
3196	 * only describes which segments are currently allocated and free.
3197	 *
3198	 * All the current free space resides in the ms_allocatable, all
3199	 * the ms_defer trees, and all the ms_allocating trees. We ignore
3200	 * ms_freed because it is empty, as we're in sync pass 1. We
3201	 * ignore ms_freeing because these changes are not yet reflected
3202	 * in the spacemap (they will be written later this txg).
3203	 *
3204	 * So to truncate the space map to represent all the entries of
3205	 * previous TXGs we do the following:
3206	 *
3207	 * 1] We create a range tree (condense tree) that is 100% allocated.
3208	 * 2] We remove from it all segments found in the ms_defer trees
3209	 *    as those segments are marked as free in the original space
3210	 *    map. We do the same with the ms_allocating trees for the same
3211	 *    reason. Removing these segments should be a relatively
3212	 *    inexpensive operation since we expect these trees to have a
3213	 *    small number of nodes.
3214	 * 3] We vacate any unflushed allocs as they should already exist
3215	 *    in the condense tree. Then we vacate any unflushed frees as
3216	 *    they should already be part of ms_allocatable.
3217	 * 4] At this point, we would ideally like to remove all segments
3218	 *    in the ms_allocatable tree from the condense tree. This way
3219	 *    we would write all the entries of the condense tree as the
3220	 *    condensed space map, which would only contain allocated
3221	 *    segments with everything else assumed to be freed.
3222	 *
3223	 *    Doing so can be prohibitively expensive as ms_allocatable can
3224	 *    be large, and therefore computationally expensive to subtract
3225	 *    from the condense_tree. Instead we first sync out the
3226	 *    condense_tree and then the ms_allocatable, in the condensed
3227	 *    space map. While this is not optimal, it is typically close to
3228	 *    optimal and more importantly much cheaper to compute.
3229	 *
3230	 * 5] Finally, as both of the unflushed trees were written to our
3231	 *    new and condensed metaslab space map, we basically flushed
3232	 *    all the unflushed changes to disk, thus we call
3233	 *    metaslab_flush_update().
3234	 */
3235	ASSERT3U(spa_sync_pass(spa), ==, 1);
3236	ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
3237
3238	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
3239	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
3240	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
3241	    spa->spa_name, space_map_length(msp->ms_sm),
3242	    avl_numnodes(&msp->ms_allocatable->rt_root),
3243	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
3244
3245	msp->ms_condense_wanted = B_FALSE;
3246
3247	condense_tree = range_tree_create(NULL, NULL);
3248	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
3249
3250	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3251		range_tree_walk(msp->ms_defer[t],
3252		    range_tree_remove, condense_tree);
3253	}
3254
3255	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
3256		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
3257		    range_tree_remove, condense_tree);
3258	}
3259
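	/*
	 * As described in step 3 above, the unflushed changes are already
	 * captured by the condense tree and ms_allocatable, so vacate them
	 * here and release their in-core memory accounting.
	 */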
3260	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3261	    metaslab_unflushed_changes_memused(msp));
3262	spa->spa_unflushed_stats.sus_memused -=
3263	    metaslab_unflushed_changes_memused(msp);
3264	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3265	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3266
3267	/*
3268	 * We're about to drop the metaslab's lock thus allowing other
3269	 * consumers to change its content. Set the metaslab's ms_condensing
3270	 * flag to ensure that allocations on this metaslab do not occur
3271	 * while we're in the middle of committing it to disk. This is only
3272	 * critical for ms_allocatable as all other range trees use per TXG
3273	 * views of their content.
3274	 */
3275	msp->ms_condensing = B_TRUE;
3276
3277	mutex_exit(&msp->ms_lock);
3278	uint64_t object = space_map_object(msp->ms_sm);
3279	space_map_truncate(sm,
3280	    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3281	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
3282
3283	/*
3284	 * space_map_truncate() may have reallocated the spacemap object.
3285	 * If so, update the vdev_ms_array.
3286	 */
3287	if (space_map_object(msp->ms_sm) != object) {
3288		object = space_map_object(msp->ms_sm);
3289		dmu_write(spa->spa_meta_objset,
3290		    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
3291		    msp->ms_id, sizeof (uint64_t), &object, tx);
3292	}
3293
3294	/*
3295	 * Note:
3296	 * When the log space map feature is enabled, each space map will
3297	 * always have ALLOCS followed by FREES for each sync pass. This is
3298	 * typically true even when the log space map feature is disabled,
	 * except for the case where a metaslab goes through metaslab_sync()
3300	 * and gets condensed. In that case the metaslab's space map will have
3301	 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
3302	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
3303	 * sync pass 1.
3304	 */
3305	space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
3306	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
3307
3308	range_tree_vacate(condense_tree, NULL, NULL);
3309	range_tree_destroy(condense_tree);
3310	mutex_enter(&msp->ms_lock);
3311
3312	msp->ms_condensing = B_FALSE;
3313	metaslab_flush_update(msp, tx);
3314}
3315
3316/*
3317 * Called when the metaslab has been flushed (its own spacemap now reflects
3318 * all the contents of the pool-wide spacemap log). Updates the metaslab's
3319 * metadata and any pool-wide related log space map data (e.g. summary,
3320 * obsolete logs, etc.) to reflect that.
3321 */
3322static void
3323metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
3324{
3325	metaslab_group_t *mg = msp->ms_group;
3326	spa_t *spa = mg->mg_vd->vdev_spa;
3327
3328	ASSERT(MUTEX_HELD(&msp->ms_lock));
3329
3330	ASSERT3U(spa_sync_pass(spa), ==, 1);
3331	ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3332	ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3333
3334	/*
3335	 * Just because a metaslab got flushed, that doesn't mean that
3336	 * it will pass through metaslab_sync_done(). Thus, make sure to
3337	 * update ms_synced_length here in case it doesn't.
3338	 */
3339	msp->ms_synced_length = space_map_length(msp->ms_sm);
3340
3341	/*
3342	 * We may end up here from metaslab_condense() without the
3343	 * feature being active. In that case this is a no-op.
3344	 */
3345	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
3346		return;
3347
3348	ASSERT(spa_syncing_log_sm(spa) != NULL);
3349	ASSERT(msp->ms_sm != NULL);
3350	ASSERT(metaslab_unflushed_txg(msp) != 0);
3351	ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3352
3353	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3354
3355	/* update metaslab's position in our flushing tree */
3356	uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
3357	mutex_enter(&spa->spa_flushed_ms_lock);
3358	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3359	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3360	avl_add(&spa->spa_metaslabs_by_flushed, msp);
3361	mutex_exit(&spa->spa_flushed_ms_lock);
3362
3363	/* update metaslab counts of spa_log_sm_t nodes */
3364	spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
3365	spa_log_sm_increment_current_mscount(spa);
3366
3367	/* cleanup obsolete logs if any */
3368	uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
3369	spa_cleanup_old_sm_logs(spa, tx);
3370	uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
3371	VERIFY3U(log_blocks_after, <=, log_blocks_before);
3372
3373	/* update log space map summary */
3374	uint64_t blocks_gone = log_blocks_before - log_blocks_after;
3375	spa_log_summary_add_flushed_metaslab(spa);
3376	spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
3377	spa_log_summary_decrement_blkcount(spa, blocks_gone);
3378}
3379
3380boolean_t
3381metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
3382{
3383	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3384
3385	ASSERT(MUTEX_HELD(&msp->ms_lock));
3386	ASSERT3U(spa_sync_pass(spa), ==, 1);
3387	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3388
3389	ASSERT(msp->ms_sm != NULL);
3390	ASSERT(metaslab_unflushed_txg(msp) != 0);
3391	ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
3392
3393	/*
3394	 * There is nothing wrong with flushing the same metaslab twice, as
	 * this codepath should work in that case. However, the current
3396	 * flushing scheme makes sure to avoid this situation as we would be
3397	 * making all these calls without having anything meaningful to write
3398	 * to disk. We assert this behavior here.
3399	 */
3400	ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
3401
3402	/*
	 * We cannot flush while loading, because then we would
3404	 * not load the ms_unflushed_{allocs,frees}.
3405	 */
3406	if (msp->ms_loading)
3407		return (B_FALSE);
3408
3409	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3410	metaslab_verify_weight_and_frag(msp);
3411
3412	/*
3413	 * Metaslab condensing is effectively flushing. Therefore if the
3414	 * metaslab can be condensed we can just condense it instead of
3415	 * flushing it.
3416	 *
3417	 * Note that metaslab_condense() does call metaslab_flush_update()
3418	 * so we can just return immediately after condensing. We also
3419	 * don't need to care about setting ms_flushing or broadcasting
3420	 * ms_flush_cv, even if we temporarily drop the ms_lock in
3421	 * metaslab_condense(), as the metaslab is already loaded.
3422	 */
3423	if (msp->ms_loaded && metaslab_should_condense(msp)) {
3424		metaslab_group_t *mg = msp->ms_group;
3425
3426		/*
		 * For all the histogram operations below, refer to the
		 * comments in metaslab_sync(), where we follow a
		 * similar procedure.
3430		 */
3431		metaslab_group_histogram_verify(mg);
3432		metaslab_class_histogram_verify(mg->mg_class);
3433		metaslab_group_histogram_remove(mg, msp);
3434
3435		metaslab_condense(msp, tx);
3436
3437		space_map_histogram_clear(msp->ms_sm);
3438		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3439		ASSERT(range_tree_is_empty(msp->ms_freed));
3440		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3441			space_map_histogram_add(msp->ms_sm,
3442			    msp->ms_defer[t], tx);
3443		}
3444		metaslab_aux_histograms_update(msp);
3445
3446		metaslab_group_histogram_add(mg, msp);
3447		metaslab_group_histogram_verify(mg);
3448		metaslab_class_histogram_verify(mg->mg_class);
3449
3450		metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3451
3452		/*
3453		 * Since we recreated the histogram (and potentially
3454		 * the ms_sm too while condensing) ensure that the
3455		 * weight is updated too because we are not guaranteed
3456		 * that this metaslab is dirty and will go through
3457		 * metaslab_sync_done().
3458		 */
3459		metaslab_recalculate_weight_and_sort(msp);
3460		return (B_TRUE);
3461	}
3462
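	/*
	 * Flush by appending the unflushed allocs and frees to the
	 * metaslab's own space map. We drop the ms_lock while writing
	 * since space_map_write() may call into the DMU.
	 */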
3463	msp->ms_flushing = B_TRUE;
3464	uint64_t sm_len_before = space_map_length(msp->ms_sm);
3465
3466	mutex_exit(&msp->ms_lock);
3467	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
3468	    SM_NO_VDEVID, tx);
3469	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
3470	    SM_NO_VDEVID, tx);
3471	mutex_enter(&msp->ms_lock);
3472
3473	uint64_t sm_len_after = space_map_length(msp->ms_sm);
3474	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
3475		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
3476		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
3477		    "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
3478		    msp->ms_group->mg_vd->vdev_id, msp->ms_id,
3479		    range_tree_space(msp->ms_unflushed_allocs),
3480		    range_tree_space(msp->ms_unflushed_frees),
3481		    (sm_len_after - sm_len_before));
3482	}
3483
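	/*
	 * The unflushed changes are now on disk; drop them and release
	 * their in-core memory accounting.
	 */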
3484	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3485	    metaslab_unflushed_changes_memused(msp));
3486	spa->spa_unflushed_stats.sus_memused -=
3487	    metaslab_unflushed_changes_memused(msp);
3488	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3489	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3490
3491	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3492	metaslab_verify_weight_and_frag(msp);
3493
3494	metaslab_flush_update(msp, tx);
3495
3496	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3497	metaslab_verify_weight_and_frag(msp);
3498
3499	msp->ms_flushing = B_FALSE;
3500	cv_broadcast(&msp->ms_flush_cv);
3501	return (B_TRUE);
3502}
3503
3504/*
3505 * Write a metaslab to disk in the context of the specified transaction group.
3506 */
3507void
3508metaslab_sync(metaslab_t *msp, uint64_t txg)
3509{
3510	metaslab_group_t *mg = msp->ms_group;
3511	vdev_t *vd = mg->mg_vd;
3512	spa_t *spa = vd->vdev_spa;
3513	objset_t *mos = spa_meta_objset(spa);
3514	range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
3515	dmu_tx_t *tx;
3516
3517	ASSERT(!vd->vdev_ishole);
3518
3519	/*
3520	 * This metaslab has just been added so there's no work to do now.
3521	 */
3522	if (msp->ms_freeing == NULL) {
3523		ASSERT3P(alloctree, ==, NULL);
3524		return;
3525	}
3526
3527	ASSERT3P(alloctree, !=, NULL);
3528	ASSERT3P(msp->ms_freeing, !=, NULL);
3529	ASSERT3P(msp->ms_freed, !=, NULL);
3530	ASSERT3P(msp->ms_checkpointing, !=, NULL);
3531	ASSERT3P(msp->ms_trim, !=, NULL);
3532
3533	/*
3534	 * Normally, we don't want to process a metaslab if there are no
3535	 * allocations or frees to perform. However, if the metaslab is being
3536	 * forced to condense, it's loaded and we're not beyond the final
3537	 * dirty txg, we need to let it through. Not condensing beyond the
3538	 * final dirty txg prevents an issue where metaslabs that need to be
3539	 * condensed but were loaded for other reasons could cause a panic
3540	 * here. By only checking the txg in that branch of the conditional,
3541	 * we preserve the utility of the VERIFY statements in all other
3542	 * cases.
3543	 */
3544	if (range_tree_is_empty(alloctree) &&
3545	    range_tree_is_empty(msp->ms_freeing) &&
3546	    range_tree_is_empty(msp->ms_checkpointing) &&
3547	    !(msp->ms_loaded && msp->ms_condense_wanted &&
3548	    txg <= spa_final_dirty_txg(spa)))
3549		return;
3550
3552	VERIFY(txg <= spa_final_dirty_txg(spa));
3553
3554	/*
3555	 * The only state that can actually be changing concurrently
3556	 * with metaslab_sync() is the metaslab's ms_allocatable. No
3557	 * other thread can be modifying this txg's alloc, freeing,
3558	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
3559	 * could call into the DMU, because the DMU can call down to
3560	 * us (e.g. via zio_free()) at any time.
3561	 *
3562	 * The spa_vdev_remove_thread() can be reading metaslab state
3563	 * concurrently, and it is locked out by the ms_sync_lock.
3564	 * Note that the ms_lock is insufficient for this, because it
3565	 * is dropped by space_map_write().
3566	 */
3567	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3568
3569	/*
3570	 * Generate a log space map if one doesn't exist already.
3571	 */
3572	spa_generate_syncing_log_sm(spa, tx);
3573
3574	if (msp->ms_sm == NULL) {
3575		uint64_t new_object = space_map_alloc(mos,
3576		    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3577		    zfs_metaslab_sm_blksz_with_log :
3578		    zfs_metaslab_sm_blksz_no_log, tx);
3579		VERIFY3U(new_object, !=, 0);
3580
3581		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
3582		    msp->ms_id, sizeof (uint64_t), &new_object, tx);
3583
3584		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
3585		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
3586		ASSERT(msp->ms_sm != NULL);
3587
3588		ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3589		ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3590		ASSERT0(metaslab_allocated_space(msp));
3591	}
3592
3593	if (metaslab_unflushed_txg(msp) == 0 &&
3594	    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
3595		ASSERT(spa_syncing_log_sm(spa) != NULL);
3596
3597		metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3598		spa_log_sm_increment_current_mscount(spa);
3599		spa_log_summary_add_flushed_metaslab(spa);
3600
3601		ASSERT(msp->ms_sm != NULL);
3602		mutex_enter(&spa->spa_flushed_ms_lock);
3603		avl_add(&spa->spa_metaslabs_by_flushed, msp);
3604		mutex_exit(&spa->spa_flushed_ms_lock);
3605
3606		ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3607		ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3608	}
3609
3610	if (!range_tree_is_empty(msp->ms_checkpointing) &&
3611	    vd->vdev_checkpoint_sm == NULL) {
3612		ASSERT(spa_has_checkpoint(spa));
3613
3614		uint64_t new_object = space_map_alloc(mos,
3615		    zfs_vdev_standard_sm_blksz, tx);
3616		VERIFY3U(new_object, !=, 0);
3617
3618		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
3619		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
3620		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3621
3622		/*
3623		 * We save the space map object as an entry in vdev_top_zap
3624		 * so it can be retrieved when the pool is reopened after an
3625		 * export or through zdb.
3626		 */
3627		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
3628		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
3629		    sizeof (new_object), 1, &new_object, tx));
3630	}
3631
3632	mutex_enter(&msp->ms_sync_lock);
3633	mutex_enter(&msp->ms_lock);
3634
3635	/*
3636	 * Note: metaslab_condense() clears the space map's histogram.
3637	 * Therefore we must verify and remove this histogram before
3638	 * condensing.
3639	 */
3640	metaslab_group_histogram_verify(mg);
3641	metaslab_class_histogram_verify(mg->mg_class);
3642	metaslab_group_histogram_remove(mg, msp);
3643
3644	if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
3645	    metaslab_should_condense(msp))
3646		metaslab_condense(msp, tx);
3647
3648	/*
3649	 * We'll be going to disk to sync our space accounting, thus we
3650	 * drop the ms_lock during that time so allocations coming from
3651	 * open-context (ZIL) for future TXGs do not block.
3652	 */
3653	mutex_exit(&msp->ms_lock);
3654	space_map_t *log_sm = spa_syncing_log_sm(spa);
3655	if (log_sm != NULL) {
3656		ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3657
3658		space_map_write(log_sm, alloctree, SM_ALLOC,
3659		    vd->vdev_id, tx);
3660		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
3661		    vd->vdev_id, tx);
3662		mutex_enter(&msp->ms_lock);
3663
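		/*
		 * Accumulate this txg's allocs and frees into the
		 * metaslab's unflushed range trees, canceling out ranges
		 * that appear as both an alloc and a free, and adjust the
		 * in-core memory accounting accordingly.
		 */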
3664		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3665		    metaslab_unflushed_changes_memused(msp));
3666		spa->spa_unflushed_stats.sus_memused -=
3667		    metaslab_unflushed_changes_memused(msp);
3668		range_tree_remove_xor_add(alloctree,
3669		    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
3670		range_tree_remove_xor_add(msp->ms_freeing,
3671		    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
3672		spa->spa_unflushed_stats.sus_memused +=
3673		    metaslab_unflushed_changes_memused(msp);
3674	} else {
3675		ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3676
3677		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
3678		    SM_NO_VDEVID, tx);
3679		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
3680		    SM_NO_VDEVID, tx);
3681		mutex_enter(&msp->ms_lock);
3682	}
3683
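	/*
	 * Update the in-core accounting of space allocated from this
	 * metaslab: add this txg's allocations and subtract its frees.
	 */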
3684	msp->ms_allocated_space += range_tree_space(alloctree);
3685	ASSERT3U(msp->ms_allocated_space, >=,
3686	    range_tree_space(msp->ms_freeing));
3687	msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
3688
3689	if (!range_tree_is_empty(msp->ms_checkpointing)) {
3690		ASSERT(spa_has_checkpoint(spa));
3691		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3692
3693		/*
3694		 * Since we are doing writes to disk and the ms_checkpointing
3695		 * tree won't be changing during that time, we drop the
3696		 * ms_lock while writing to the checkpoint space map, for the
3697		 * same reason mentioned above.
3698		 */
3699		mutex_exit(&msp->ms_lock);
3700		space_map_write(vd->vdev_checkpoint_sm,
3701		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
3702		mutex_enter(&msp->ms_lock);
3703
3704		spa->spa_checkpoint_info.sci_dspace +=
3705		    range_tree_space(msp->ms_checkpointing);
3706		vd->vdev_stat.vs_checkpoint_space +=
3707		    range_tree_space(msp->ms_checkpointing);
3708		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
3709		    -space_map_allocated(vd->vdev_checkpoint_sm));
3710
3711		range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
3712	}
3713
3714	if (msp->ms_loaded) {
3715		/*
3716		 * When the space map is loaded, we have an accurate
3717		 * histogram in the range tree. This gives us an opportunity
3718		 * to bring the space map's histogram up-to-date so we clear
3719		 * it first before updating it.
3720		 */
3721		space_map_histogram_clear(msp->ms_sm);
3722		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3723
3724		/*
3725		 * Since we've cleared the histogram we need to add back
3726		 * any free space that has already been processed, plus
3727		 * any deferred space. This allows the on-disk histogram
3728		 * to accurately reflect all free space even if some space
3729		 * is not yet available for allocation (i.e. deferred).
3730		 */
3731		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
3732
3733		/*
3734		 * Add back any deferred free space that has not been
3735		 * added back into the in-core free tree yet. This will
3736		 * ensure that we don't end up with a space map histogram
3737		 * that is completely empty unless the metaslab is fully
3738		 * allocated.
3739		 */
3740		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3741			space_map_histogram_add(msp->ms_sm,
3742			    msp->ms_defer[t], tx);
3743		}
3744	}
3745
3746	/*
3747	 * Always add the free space from this sync pass to the space
3748	 * map histogram. We want to make sure that the on-disk histogram
3749	 * accounts for all free space. If the space map is not loaded,
3750	 * then we will lose some accuracy but will correct it the next
3751	 * time we load the space map.
3752	 */
3753	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
3754	metaslab_aux_histograms_update(msp);
3755
3756	metaslab_group_histogram_add(mg, msp);
3757	metaslab_group_histogram_verify(mg);
3758	metaslab_class_histogram_verify(mg->mg_class);
3759
3760	/*
3761	 * For sync pass 1, we avoid traversing this txg's free range tree
3762	 * and instead will just swap the pointers for freeing and freed.
3763	 * We can safely do this since the freed_tree is guaranteed to be
3764	 * empty on the initial pass.
3765	 *
3766	 * Keep in mind that even if we are currently using a log spacemap
3767	 * we want current frees to end up in the ms_allocatable (but not
3768	 * get appended to the ms_sm) so their ranges can be reused as usual.
3769	 */
3770	if (spa_sync_pass(spa) == 1) {
3771		range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
3772		ASSERT0(msp->ms_allocated_this_txg);
3773	} else {
3774		range_tree_vacate(msp->ms_freeing,
3775		    range_tree_add, msp->ms_freed);
3776	}
3777	msp->ms_allocated_this_txg += range_tree_space(alloctree);
3778	range_tree_vacate(alloctree, NULL, NULL);
3779
3780	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3781	ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
3782	    & TXG_MASK]));
3783	ASSERT0(range_tree_space(msp->ms_freeing));
3784	ASSERT0(range_tree_space(msp->ms_checkpointing));
3785
3786	mutex_exit(&msp->ms_lock);
3787
3788	/*
3789	 * Verify that the space map object ID has been recorded in the
3790	 * vdev_ms_array.
3791	 */
3792	uint64_t object;
3793	VERIFY0(dmu_read(mos, vd->vdev_ms_array,
3794	    msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
3795	VERIFY3U(object, ==, space_map_object(msp->ms_sm));
3796
3797	mutex_exit(&msp->ms_sync_lock);
3798	dmu_tx_commit(tx);
3799}
3800
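/*
 * Passivate and unload a metaslab so that its in-core state can be
 * reclaimed. This is a no-op if the metaslab is not loaded or is currently
 * disabled; we also verify that no allocations are pending in any upcoming
 * txg. The unload is skipped when metaslab_debug_unload is set.
 */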
3801static void
3802metaslab_evict(metaslab_t *msp, uint64_t txg)
3803{
3804	if (!msp->ms_loaded || msp->ms_disabled != 0)
3805		return;
3806
3807	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
3808		VERIFY0(range_tree_space(
3809		    msp->ms_allocating[(txg + t) & TXG_MASK]));
3810	}
3811	if (msp->ms_allocator != -1)
3812		metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
3813
3814	if (!metaslab_debug_unload)
3815		metaslab_unload(msp);
3816}
3817
3818/*
3819 * Called after a transaction group has completely synced to mark
3820 * all of the metaslab's free space as usable.
3821 */
3822void
3823metaslab_sync_done(metaslab_t *msp, uint64_t txg)
3824{
3825	metaslab_group_t *mg = msp->ms_group;
3826	vdev_t *vd = mg->mg_vd;
3827	spa_t *spa = vd->vdev_spa;
3828	range_tree_t **defer_tree;
3829	int64_t alloc_delta, defer_delta;
3830	boolean_t defer_allowed = B_TRUE;
3831
3832	ASSERT(!vd->vdev_ishole);
3833
3834	mutex_enter(&msp->ms_lock);
3835
3836	/*
3837	 * If this metaslab is just becoming available, initialize its
3838	 * range trees and add its capacity to the vdev.
3839	 */
3840	if (msp->ms_freed == NULL) {
3841		for (int t = 0; t < TXG_SIZE; t++) {
3842			ASSERT(msp->ms_allocating[t] == NULL);
3843
3844			msp->ms_allocating[t] = range_tree_create(NULL, NULL);
3845		}
3846
3847		ASSERT3P(msp->ms_freeing, ==, NULL);
3848		msp->ms_freeing = range_tree_create(NULL, NULL);
3849
3850		ASSERT3P(msp->ms_freed, ==, NULL);
3851		msp->ms_freed = range_tree_create(NULL, NULL);
3852
3853		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3854			ASSERT3P(msp->ms_defer[t], ==, NULL);
3855			msp->ms_defer[t] = range_tree_create(NULL, NULL);
3856		}
3857
3858		ASSERT3P(msp->ms_checkpointing, ==, NULL);
3859		msp->ms_checkpointing = range_tree_create(NULL, NULL);
3860
3861		ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
3862		msp->ms_unflushed_allocs = range_tree_create(NULL, NULL);
3863		ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
3864		msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops,
3865		    &msp->ms_unflushed_frees_by_size,
3866		    metaslab_rangesize_compare, 0);
3867
3868		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
3869	}
3870	ASSERT0(range_tree_space(msp->ms_freeing));
3871	ASSERT0(range_tree_space(msp->ms_checkpointing));
3872
3873	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
3874
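	/*
	 * Deferring frees is not allowed when the pool's free space is at
	 * or below the slop reserve, or when this vdev is being removed;
	 * in those cases the freed space is made available immediately.
	 */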
3875	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
3876	    metaslab_class_get_alloc(spa_normal_class(spa));
3877	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
3878		defer_allowed = B_FALSE;
3879	}
3880
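	/*
	 * alloc_delta is the net change in allocated space for this txg;
	 * defer_delta is the change in the amount of space held in the
	 * defer tree.
	 */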
3881	defer_delta = 0;
3882	alloc_delta = msp->ms_allocated_this_txg -
3883	    range_tree_space(msp->ms_freed);
3884
3885	if (defer_allowed) {
3886		defer_delta = range_tree_space(msp->ms_freed) -
3887		    range_tree_space(*defer_tree);
3888	} else {
3889		defer_delta -= range_tree_space(*defer_tree);
3890	}
3891	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
3892	    defer_delta, 0);
3893
3894	if (spa_syncing_log_sm(spa) == NULL) {
3895		/*
3896		 * If there's a metaslab_load() in progress and we don't have
3897		 * a log space map, it means that we probably wrote to the
3898		 * metaslab's space map. If this is the case, we need to
3899		 * make sure that we wait for the load to complete so that we
3900		 * have a consistent view at the in-core side of the metaslab.
3901		 */
3902		metaslab_load_wait(msp);
3903	} else {
3904		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3905	}
3906
3907	/*
3908	 * When auto-trimming is enabled, free ranges which are added to
	 * ms_allocatable are also added to ms_trim.  The ms_trim tree is
3910	 * periodically consumed by the vdev_autotrim_thread() which issues
3911	 * trims for all ranges and then vacates the tree.  The ms_trim tree
3912	 * can be discarded at any time with the sole consequence of recent
3913	 * frees not being trimmed.
3914	 */
3915	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
3916		range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
3917		if (!defer_allowed) {
3918			range_tree_walk(msp->ms_freed, range_tree_add,
3919			    msp->ms_trim);
3920		}
3921	} else {
3922		range_tree_vacate(msp->ms_trim, NULL, NULL);
3923	}
3924
3925	/*
3926	 * Move the frees from the defer_tree back to the free
3927	 * range tree (if it's loaded). Swap the freed_tree and
3928	 * the defer_tree -- this is safe to do because we've
3929	 * just emptied out the defer_tree.
3930	 */
3931	range_tree_vacate(*defer_tree,
3932	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
3933	if (defer_allowed) {
3934		range_tree_swap(&msp->ms_freed, defer_tree);
3935	} else {
3936		range_tree_vacate(msp->ms_freed,
3937		    msp->ms_loaded ? range_tree_add : NULL,
3938		    msp->ms_allocatable);
3939	}
3940
3941	msp->ms_synced_length = space_map_length(msp->ms_sm);
3942
3943	msp->ms_deferspace += defer_delta;
3944	ASSERT3S(msp->ms_deferspace, >=, 0);
3945	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
3946	if (msp->ms_deferspace != 0) {
3947		/*
3948		 * Keep syncing this metaslab until all deferred frees
3949		 * are back in circulation.
3950		 */
3951		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
3952	}
3953	metaslab_aux_histograms_update_done(msp, defer_allowed);
3954
3955	if (msp->ms_new) {
3956		msp->ms_new = B_FALSE;
3957		mutex_enter(&mg->mg_lock);
3958		mg->mg_ms_ready++;
3959		mutex_exit(&mg->mg_lock);
3960	}
3961
3962	/*
3963	 * Re-sort metaslab within its group now that we've adjusted
3964	 * its allocatable space.
3965	 */
3966	metaslab_recalculate_weight_and_sort(msp);
3967
3968	/*
3969	 * If the metaslab is loaded and we've not tried to load or allocate
3970	 * from it in 'metaslab_unload_delay' txgs, then unload it.
3971	 */
3972	if (msp->ms_loaded &&
3973	    msp->ms_disabled == 0 &&
3974	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
3975
3976		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
3977			VERIFY0(range_tree_space(
3978			    msp->ms_allocating[(txg + t) & TXG_MASK]));
3979		}
3980		if (msp->ms_allocator != -1) {
3981			metaslab_passivate(msp, msp->ms_weight &
3982			    ~METASLAB_ACTIVE_MASK);
3983		}
3984
3985		if (!metaslab_debug_unload)
3986			metaslab_unload(msp);
3987	}
3988
3989	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3990	ASSERT0(range_tree_space(msp->ms_freeing));
3991	ASSERT0(range_tree_space(msp->ms_freed));
3992	ASSERT0(range_tree_space(msp->ms_checkpointing));
3993	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
3994	msp->ms_allocated_this_txg = 0;
3995	mutex_exit(&msp->ms_lock);
3996}
3997
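/*
 * Re-evaluate a metaslab group's allocation eligibility and fragmentation
 * after a sync, and preload the next candidate metaslabs if the group is
 * still active.
 */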
3998void
3999metaslab_sync_reassess(metaslab_group_t *mg)
4000{
4001	spa_t *spa = mg->mg_class->mc_spa;
4002
4003	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4004	metaslab_group_alloc_update(mg);
4005	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
4006
4007	/*
4008	 * Preload the next potential metaslabs but only on active
4009	 * metaslab groups. We can get into a state where the metaslab
4010	 * is no longer active since we dirty metaslabs as we remove a
	 * device, thus potentially making the metaslab group eligible
4012	 * for preloading.
4013	 */
4014	if (mg->mg_activation_count > 0) {
4015		metaslab_group_preload(mg);
4016	}
4017	spa_config_exit(spa, SCL_ALLOC, FTAG);
4018}
4019
4020/*
4021 * When writing a ditto block (i.e. more than one DVA for a given BP) on
4022 * the same vdev as an existing DVA of this BP, then try to allocate it
4023 * on a different metaslab than existing DVAs (i.e. a unique metaslab).
4024 */
4025static boolean_t
4026metaslab_is_unique(metaslab_t *msp, dva_t *dva)
4027{
4028	uint64_t dva_ms_id;
4029
4030	if (DVA_GET_ASIZE(dva) == 0)
4031		return (B_TRUE);
4032
4033	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
4034		return (B_TRUE);
4035
4036	dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4037
4038	return (msp->ms_id != dva_ms_id);
4039}
4040
4041/*
4042 * ==========================================================================
4043 * Metaslab allocation tracing facility
4044 * ==========================================================================
4045 */
4046kstat_t *metaslab_trace_ksp;
4047kstat_named_t metaslab_trace_over_limit;
4048
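/*
 * Set up the kmem cache and kstat used by the allocation tracing facility.
 */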
4049void
4050metaslab_alloc_trace_init(void)
4051{
4052	ASSERT(metaslab_alloc_trace_cache == NULL);
4053	metaslab_alloc_trace_cache = kmem_cache_create(
4054	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
4055	    0, NULL, NULL, NULL, NULL, NULL, 0);
4056	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
4057	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
4058	if (metaslab_trace_ksp != NULL) {
4059		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
4060		kstat_named_init(&metaslab_trace_over_limit,
4061		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
4062		kstat_install(metaslab_trace_ksp);
4063	}
4064}
4065
4066void
4067metaslab_alloc_trace_fini(void)
4068{
4069	if (metaslab_trace_ksp != NULL) {
4070		kstat_delete(metaslab_trace_ksp);
4071		metaslab_trace_ksp = NULL;
4072	}
4073	kmem_cache_destroy(metaslab_alloc_trace_cache);
4074	metaslab_alloc_trace_cache = NULL;
4075}
4076
4077/*
4078 * Add an allocation trace element to the allocation tracing list.
4079 */
4080static void
4081metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
4082    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
4083    int allocator)
4084{
4085	if (!metaslab_trace_enabled)
4086		return;
4087
4088	/*
4089	 * When the tracing list reaches its maximum we remove
4090	 * the second element in the list before adding a new one.
4091	 * By removing the second element we preserve the original
	 * entry as a clue to what allocation steps have already been
4093	 * performed.
4094	 */
4095	if (zal->zal_size == metaslab_trace_max_entries) {
4096		metaslab_alloc_trace_t *mat_next;
4097#ifdef DEBUG
4098		panic("too many entries in allocation list");
4099#endif
4100		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
4101		zal->zal_size--;
4102		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
4103		list_remove(&zal->zal_list, mat_next);
4104		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
4105	}
4106
4107	metaslab_alloc_trace_t *mat =
4108	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
4109	list_link_init(&mat->mat_list_node);
4110	mat->mat_mg = mg;
4111	mat->mat_msp = msp;
4112	mat->mat_size = psize;
4113	mat->mat_dva_id = dva_id;
4114	mat->mat_offset = offset;
4115	mat->mat_weight = 0;
4116	mat->mat_allocator = allocator;
4117
4118	if (msp != NULL)
4119		mat->mat_weight = msp->ms_weight;
4120
4121	/*
4122	 * The list is part of the zio so locking is not required. Only
4123	 * a single thread will perform allocations for a given zio.
4124	 */
4125	list_insert_tail(&zal->zal_list, mat);
4126	zal->zal_size++;
4127
4128	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
4129}
4130
4131void
4132metaslab_trace_init(zio_alloc_list_t *zal)
4133{
4134	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
4135	    offsetof(metaslab_alloc_trace_t, mat_list_node));
4136	zal->zal_size = 0;
4137}
4138
4139void
4140metaslab_trace_fini(zio_alloc_list_t *zal)
4141{
4142	metaslab_alloc_trace_t *mat;
4143
4144	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
4145		kmem_cache_free(metaslab_alloc_trace_cache, mat);
4146	list_destroy(&zal->zal_list);
4147	zal->zal_size = 0;
4148}
4149
4150/*
4151 * ==========================================================================
4152 * Metaslab block operations
4153 * ==========================================================================
4154 */
4155
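/*
 * Record an outstanding asynchronous allocation against the metaslab
 * group's per-allocator queue depth so the allocation throttle can track
 * it. This is a no-op for non-async or unthrottled allocations.
 */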
4156static void
4157metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
4158    int allocator)
4159{
4160	if (!(flags & METASLAB_ASYNC_ALLOC) ||
4161	    (flags & METASLAB_DONT_THROTTLE))
4162		return;
4163
4164	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4165	if (!mg->mg_class->mc_alloc_throttle_enabled)
4166		return;
4167
4168	(void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
4169}
4170
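/*
 * Atomically bump the per-allocator maximum queue depth toward
 * mg_max_alloc_queue_depth and credit an extra allocation slot to the
 * metaslab class, so the allowed queue depth ramps up as throttled I/Os
 * complete.
 */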
4171static void
4172metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
4173{
4174	uint64_t max = mg->mg_max_alloc_queue_depth;
4175	uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4176	while (cur < max) {
4177		if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
4178		    cur, cur + 1) == cur) {
4179			atomic_inc_64(
4180			    &mg->mg_class->mc_alloc_max_slots[allocator]);
4181			return;
4182		}
4183		cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4184	}
4185}
4186
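/*
 * Drop the queue-depth reference taken by metaslab_group_alloc_increment().
 * When the corresponding I/O has completed, also allow the group's maximum
 * queue depth to grow.
 */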
4187void
4188metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
4189    int allocator, boolean_t io_complete)
4190{
4191	if (!(flags & METASLAB_ASYNC_ALLOC) ||
4192	    (flags & METASLAB_DONT_THROTTLE))
4193		return;
4194
4195	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4196	if (!mg->mg_class->mc_alloc_throttle_enabled)
4197		return;
4198
4199	(void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
4200	if (io_complete)
4201		metaslab_group_increment_qdepth(mg, allocator);
4202}
4203
4204void
4205metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
4206    int allocator)
4207{
4208#ifdef ZFS_DEBUG
4209	const dva_t *dva = bp->blk_dva;
4210	int ndvas = BP_GET_NDVAS(bp);
4211
4212	for (int d = 0; d < ndvas; d++) {
4213		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
4214		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4215		VERIFY(zfs_refcount_not_held(
4216		    &mg->mg_alloc_queue_depth[allocator], tag));
4217	}
4218#endif
4219}
4220
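/*
 * Attempt to allocate 'size' bytes from this metaslab using the class's
 * allocator ops. On success the range is moved from ms_allocatable into
 * this txg's allocating tree and the metaslab is dirtied if this is its
 * first allocation of the txg. Returns the allocated offset, or -1ULL on
 * failure.
 */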
4221static uint64_t
4222metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
4223{
4224	uint64_t start;
4225	range_tree_t *rt = msp->ms_allocatable;
4226	metaslab_class_t *mc = msp->ms_group->mg_class;
4227
4228	ASSERT(MUTEX_HELD(&msp->ms_lock));
4229	VERIFY(!msp->ms_condensing);
4230	VERIFY0(msp->ms_disabled);
4231
4232	start = mc->mc_ops->msop_alloc(msp, size);
4233	if (start != -1ULL) {
4234		metaslab_group_t *mg = msp->ms_group;
4235		vdev_t *vd = mg->mg_vd;
4236
4237		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
4238		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4239		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
4240		range_tree_remove(rt, start, size);
4241		range_tree_clear(msp->ms_trim, start, size);
4242
4243		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4244			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4245
4246		range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4247		msp->ms_allocating_total += size;
4248
4249		/* Track the last successful allocation */
4250		msp->ms_alloc_txg = txg;
4251		metaslab_verify_space(msp, txg);
4252	}
4253
4254	/*
4255	 * Now that we've attempted the allocation we need to update the
4256	 * metaslab's maximum block size since it may have changed.
4257	 */
4258	msp->ms_max_size = metaslab_largest_allocatable(msp);
4259	return (start);
4260}
4261
4262/*
4263 * Find the metaslab with the highest weight that is less than what we've
4264 * already tried.  In the common case, this means that we will examine each
4265 * metaslab at most once. Note that concurrent callers could reorder metaslabs
4266 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4267 * activated by another thread, and we fail to allocate from the metaslab we
4268 * have selected, we may not try the newly-activated metaslab, and instead
4269 * activate another metaslab.  This is not optimal, but generally does not cause
4270 * any problems (a possible exception being if every metaslab is completely full
 * except for the newly-activated metaslab which we fail to examine).
4272 */
4273static metaslab_t *
4274find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
4275    dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
4276    boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4277    boolean_t *was_active)
4278{
4279	avl_index_t idx;
4280	avl_tree_t *t = &mg->mg_metaslab_tree;
4281	metaslab_t *msp = avl_find(t, search, &idx);
4282	if (msp == NULL)
4283		msp = avl_nearest(t, idx, AVL_AFTER);
4284
4285	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4286		int i;
4287		if (!metaslab_should_allocate(msp, asize, try_hard)) {
4288			metaslab_trace_add(zal, mg, msp, asize, d,
4289			    TRACE_TOO_SMALL, allocator);
4290			continue;
4291		}
4292
4293		/*
4294		 * If the selected metaslab is condensing or disabled,
4295		 * skip it.
4296		 */
4297		if (msp->ms_condensing || msp->ms_disabled > 0)
4298			continue;
4299
4300		*was_active = msp->ms_allocator != -1;
4301		/*
4302		 * If we're activating as primary, this is our first allocation
4303		 * from this disk, so we don't need to check how close we are.
4304		 * If the metaslab under consideration was already active,
4305		 * we're getting desperate enough to steal another allocator's
4306		 * metaslab, so we still don't care about distances.
4307		 */
4308		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
4309			break;
4310
4311		for (i = 0; i < d; i++) {
4312			if (want_unique &&
4313			    !metaslab_is_unique(msp, &dva[i]))
4314				break;  /* try another metaslab */
4315		}
4316		if (i == d)
4317			break;
4318	}
4319
4320	if (msp != NULL) {
4321		search->ms_weight = msp->ms_weight;
4322		search->ms_start = msp->ms_start + 1;
4323		search->ms_allocator = msp->ms_allocator;
4324		search->ms_primary = msp->ms_primary;
4325	}
4326	return (msp);
4327}
4328
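/*
 * Verify that the metaslab's activation weight bits (primary, secondary,
 * or claim) are consistent with its allocator assignment.
 */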
4329void
4330metaslab_active_mask_verify(metaslab_t *msp)
4331{
4332	ASSERT(MUTEX_HELD(&msp->ms_lock));
4333
4334	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
4335		return;
4336
4337	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4338		return;
4339
4340	if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4341		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4342		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4343		VERIFY3S(msp->ms_allocator, !=, -1);
4344		VERIFY(msp->ms_primary);
4345		return;
4346	}
4347
4348	if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4349		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4350		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4351		VERIFY3S(msp->ms_allocator, !=, -1);
4352		VERIFY(!msp->ms_primary);
4353		return;
4354	}
4355
4356	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4357		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4358		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4359		VERIFY3S(msp->ms_allocator, ==, -1);
4360		return;
4361	}
4362}
4363
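/*
 * The core allocation loop for a metaslab group: select a candidate
 * metaslab (the allocator's primary or secondary, or the best remaining
 * one via find_valid_metaslab()), activate it, and try to allocate asize
 * from it. On failure the metaslab's weight is recomputed and another
 * candidate is tried until the group is exhausted.
 */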
4364/* ARGSUSED */
4365static uint64_t
4366metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
4367    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4368    int allocator, boolean_t try_hard)
4369{
4370	metaslab_t *msp = NULL;
4371	uint64_t offset = -1ULL;
4372
4373	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
4374	for (int i = 0; i < d; i++) {
4375		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4376		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4377			activation_weight = METASLAB_WEIGHT_SECONDARY;
4378		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4379		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4380			activation_weight = METASLAB_WEIGHT_CLAIM;
4381			break;
4382		}
4383	}
4384
4385	/*
4386	 * If we don't have enough metaslabs active to fill the entire array, we
4387	 * just use the 0th slot.
4388	 */
4389	if (mg->mg_ms_ready < mg->mg_allocators * 3)
4390		allocator = 0;
4391
4392	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
4393
4394	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
4395	search->ms_weight = UINT64_MAX;
4396	search->ms_start = 0;
4397	/*
4398	 * At the end of the metaslab tree are the already-active metaslabs,
4399	 * first the primaries, then the secondaries. When we resume searching
4400	 * through the tree, we need to consider ms_allocator and ms_primary so
4401	 * we start in the location right after where we left off, and don't
4402	 * accidentally loop forever considering the same metaslabs.
4403	 */
4404	search->ms_allocator = -1;
4405	search->ms_primary = B_TRUE;
4406	for (;;) {
4407		boolean_t was_active = B_FALSE;
4408
4409		mutex_enter(&mg->mg_lock);
4410
4411		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4412		    mg->mg_primaries[allocator] != NULL) {
4413			msp = mg->mg_primaries[allocator];
4414
4415			/*
4416			 * Even though we don't hold the ms_lock for the
4417			 * primary metaslab, those fields should not
			 * change while we hold the mg_lock. Thus it is
4419			 * safe to make assertions on them.
4420			 */
4421			ASSERT(msp->ms_primary);
4422			ASSERT3S(msp->ms_allocator, ==, allocator);
4423			ASSERT(msp->ms_loaded);
4424
4425			was_active = B_TRUE;
4426			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4427		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4428		    mg->mg_secondaries[allocator] != NULL) {
4429			msp = mg->mg_secondaries[allocator];
4430
4431			/*
4432			 * See comment above about the similar assertions
4433			 * for the primary metaslab.
4434			 */
4435			ASSERT(!msp->ms_primary);
4436			ASSERT3S(msp->ms_allocator, ==, allocator);
4437			ASSERT(msp->ms_loaded);
4438
4439			was_active = B_TRUE;
4440			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4441		} else {
4442			msp = find_valid_metaslab(mg, activation_weight, dva, d,
4443			    want_unique, asize, allocator, try_hard, zal,
4444			    search, &was_active);
4445		}
4446
4447		mutex_exit(&mg->mg_lock);
4448		if (msp == NULL) {
4449			kmem_free(search, sizeof (*search));
4450			return (-1ULL);
4451		}
4452		mutex_enter(&msp->ms_lock);
4453
4454		metaslab_active_mask_verify(msp);
4455
4456		/*
		 * This code is disabled because of issues with
4458		 * tracepoints in non-gpl kernel modules.
4459		 */
4460#if 0
4461		DTRACE_PROBE3(ms__activation__attempt,
4462		    metaslab_t *, msp, uint64_t, activation_weight,
4463		    boolean_t, was_active);
4464#endif
4465
4466		/*
4467		 * Ensure that the metaslab we have selected is still
4468		 * capable of handling our request. It's possible that
4469		 * another thread may have changed the weight while we
4470		 * were blocked on the metaslab lock. We check the
		 * active status first to see if we need to select
4472		 * a new metaslab.
4473		 */
4474		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
4475			ASSERT3S(msp->ms_allocator, ==, -1);
4476			mutex_exit(&msp->ms_lock);
4477			continue;
4478		}
4479
4480		/*
4481		 * If the metaslab was activated for another allocator
4482		 * while we were waiting in the ms_lock above, or it's
4483		 * a primary and we're seeking a secondary (or vice versa),
4484		 * we go back and select a new metaslab.
4485		 */
4486		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
4487		    (msp->ms_allocator != -1) &&
4488		    (msp->ms_allocator != allocator || ((activation_weight ==
4489		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
4490			ASSERT(msp->ms_loaded);
4491			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
4492			    msp->ms_allocator != -1);
4493			mutex_exit(&msp->ms_lock);
4494			continue;
4495		}
4496
4497		/*
4498		 * This metaslab was used for claiming regions allocated
4499		 * by the ZIL during pool import. Once these regions are
4500		 * claimed we don't need to keep the CLAIM bit set
4501		 * anymore. Passivate this metaslab to zero its activation
4502		 * mask.
4503		 */
4504		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
4505		    activation_weight != METASLAB_WEIGHT_CLAIM) {
4506			ASSERT(msp->ms_loaded);
4507			ASSERT3S(msp->ms_allocator, ==, -1);
4508			metaslab_passivate(msp, msp->ms_weight &
4509			    ~METASLAB_WEIGHT_CLAIM);
4510			mutex_exit(&msp->ms_lock);
4511			continue;
4512		}
4513
4514		metaslab_set_selected_txg(msp, txg);
4515
4516		int activation_error =
4517		    metaslab_activate(msp, allocator, activation_weight);
4518		metaslab_active_mask_verify(msp);
4519
4520		/*
4521		 * If the metaslab was activated by another thread for
4522		 * another allocator or activation_weight (EBUSY), or it
4523		 * failed because another metaslab was assigned as primary
4524		 * for this allocator (EEXIST) we continue using this
4525		 * metaslab for our allocation, rather than going on to a
4526		 * worse metaslab (we waited for that metaslab to be loaded
4527		 * after all).
4528		 *
4529		 * If the activation failed due to an I/O error or ENOSPC we
4530		 * skip to the next metaslab.
4531		 */
4532		boolean_t activated;
4533		if (activation_error == 0) {
4534			activated = B_TRUE;
4535		} else if (activation_error == EBUSY ||
4536		    activation_error == EEXIST) {
4537			activated = B_FALSE;
4538		} else {
4539			mutex_exit(&msp->ms_lock);
4540			continue;
4541		}
4542		ASSERT(msp->ms_loaded);
4543
4544		/*
4545		 * Now that we have the lock, recheck to see if we should
4546		 * continue to use this metaslab for this allocation. The
		 * metaslab is now loaded so metaslab_should_allocate()
4548		 * can accurately determine if the allocation attempt should
4549		 * proceed.
4550		 */
4551		if (!metaslab_should_allocate(msp, asize, try_hard)) {
4552			/* Passivate this metaslab and select a new one. */
4553			metaslab_trace_add(zal, mg, msp, asize, d,
4554			    TRACE_TOO_SMALL, allocator);
4555			goto next;
4556		}
4557
4558		/*
4559		 * If this metaslab is currently condensing then pick again
4560		 * as we can't manipulate this metaslab until it's committed
4561		 * to disk. If this metaslab is being initialized, we shouldn't
4562		 * allocate from it since the allocated region might be
4563		 * overwritten after allocation.
4564		 */
4565		if (msp->ms_condensing) {
4566			metaslab_trace_add(zal, mg, msp, asize, d,
4567			    TRACE_CONDENSING, allocator);
4568			if (activated) {
4569				metaslab_passivate(msp, msp->ms_weight &
4570				    ~METASLAB_ACTIVE_MASK);
4571			}
4572			mutex_exit(&msp->ms_lock);
4573			continue;
4574		} else if (msp->ms_disabled > 0) {
4575			metaslab_trace_add(zal, mg, msp, asize, d,
4576			    TRACE_DISABLED, allocator);
4577			if (activated) {
4578				metaslab_passivate(msp, msp->ms_weight &
4579				    ~METASLAB_ACTIVE_MASK);
4580			}
4581			mutex_exit(&msp->ms_lock);
4582			continue;
4583		}
4584
4585		offset = metaslab_block_alloc(msp, asize, txg);
4586		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
4587
4588		if (offset != -1ULL) {
4589			/* Proactively passivate the metaslab, if needed */
4590			if (activated)
4591				metaslab_segment_may_passivate(msp);
4592			break;
4593		}
4594next:
4595		ASSERT(msp->ms_loaded);
4596
4597		/*
		 * This code is disabled because of issues with
4599		 * tracepoints in non-gpl kernel modules.
4600		 */
4601#if 0
4602		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
4603		    uint64_t, asize);
4604#endif
4605
4606		/*
4607		 * We were unable to allocate from this metaslab so determine
4608		 * a new weight for this metaslab. Now that we have loaded
4609		 * the metaslab we can provide a better hint to the metaslab
4610		 * selector.
4611		 *
4612		 * For space-based metaslabs, we use the maximum block size.
4613		 * This information is only available when the metaslab
4614		 * is loaded and is more accurate than the generic free
4615		 * space weight that was calculated by metaslab_weight().
4616		 * This information allows us to quickly compare the maximum
4617		 * available allocation in the metaslab to the allocation
4618		 * size being requested.
4619		 *
4620		 * For segment-based metaslabs, determine the new weight
4621		 * based on the highest bucket in the range tree. We
4622		 * explicitly use the loaded segment weight (i.e. the range
4623		 * tree histogram) since it contains the space that is
4624		 * currently available for allocation and is accurate
4625		 * even within a sync pass.
4626		 */
4627		uint64_t weight;
4628		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
4629			weight = metaslab_largest_allocatable(msp);
4630			WEIGHT_SET_SPACEBASED(weight);
4631		} else {
4632			weight = metaslab_weight_from_range_tree(msp);
4633		}
4634
4635		if (activated) {
4636			metaslab_passivate(msp, weight);
4637		} else {
4638			/*
4639			 * For the case where we use the metaslab that is
4640			 * active for another allocator we want to make
4641			 * sure that we retain the activation mask.
4642			 *
4643			 * Note that we could attempt to use something like
4644			 * metaslab_recalculate_weight_and_sort() that
4645			 * retains the activation mask here. That function
4646			 * uses metaslab_weight() to set the weight though
4647			 * which is not as accurate as the calculations
4648			 * above.
4649			 */
4650			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
4651			metaslab_group_sort(mg, msp, weight);
4652		}
4653		metaslab_active_mask_verify(msp);
4654
4655		/*
4656		 * We have just failed an allocation attempt, check
4657		 * that metaslab_should_allocate() agrees. Otherwise,
4658		 * we may end up in an infinite loop retrying the same
4659		 * metaslab.
4660		 */
4661		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
4662
4663		mutex_exit(&msp->ms_lock);
4664	}
4665	mutex_exit(&msp->ms_lock);
4666	kmem_free(search, sizeof (*search));
4667	return (offset);
4668}
4669
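/*
 * Wrapper around metaslab_group_alloc_normal() that updates the group's
 * allocation statistics and, if even a minimal gang block cannot be
 * allocated, flags the group as out of space so future allocation
 * attempts can skip it.
 */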
4670static uint64_t
4671metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
4672    uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4673    int allocator, boolean_t try_hard)
4674{
4675	uint64_t offset;
4676	ASSERT(mg->mg_initialized);
4677
4678	offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
4679	    dva, d, allocator, try_hard);
4680
4681	mutex_enter(&mg->mg_lock);
4682	if (offset == -1ULL) {
4683		mg->mg_failed_allocations++;
4684		metaslab_trace_add(zal, mg, NULL, asize, d,
4685		    TRACE_GROUP_FAILURE, allocator);
4686		if (asize == SPA_GANGBLOCKSIZE) {
4687			/*
4688			 * This metaslab group was unable to allocate
4689			 * the minimum gang block size so it must be out of
4690			 * space. We must notify the allocation throttle
4691			 * to start skipping allocation attempts to this
4692			 * metaslab group until more space becomes available.
4693			 * Note: this failure cannot be caused by the
4694			 * allocation throttle since the allocation throttle
4695			 * is only responsible for skipping devices and
4696			 * not failing block allocations.
4697			 */
4698			mg->mg_no_free_space = B_TRUE;
4699		}
4700	}
4701	mg->mg_allocations++;
4702	mutex_exit(&mg->mg_lock);
4703	return (offset);
4704}
4705
4706/*
4707 * Allocate a block for the specified i/o.
4708 */
4709int
4710metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
4711    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
4712    zio_alloc_list_t *zal, int allocator)
4713{
4714	metaslab_group_t *mg, *rotor;
4715	vdev_t *vd;
4716	boolean_t try_hard = B_FALSE;
4717
4718	ASSERT(!DVA_IS_VALID(&dva[d]));
4719
4720	/*
4721	 * For testing, make some blocks above a certain size be gang blocks.
4722	 * This will also test spilling from special to normal.
4723	 */
4724	if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
4725		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
4726		    allocator);
4727		return (SET_ERROR(ENOSPC));
4728	}
4729
4730	/*
4731	 * Start at the rotor and loop through all mgs until we find something.
4732	 * Note that there's no locking on mc_rotor or mc_aliquot because
4733	 * nothing actually breaks if we miss a few updates -- we just won't
4734	 * allocate quite as evenly.  It all balances out over time.
4735	 *
4736	 * If we are doing ditto or log blocks, try to spread them across
4737	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
4738	 * allocated all of our ditto blocks, then try and spread them out on
4739	 * that vdev as much as possible.  If it turns out to not be possible,
4740	 * gradually lower our standards until anything becomes acceptable.
4741	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
4742	 * gives us hope of containing our fault domains to something we're
4743	 * able to reason about.  Otherwise, any two top-level vdev failures
4744	 * will guarantee the loss of data.  With consecutive allocation,
4745	 * only two adjacent top-level vdev failures will result in data loss.
4746	 *
4747	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
4748	 * ourselves on the same vdev as our gang block header.  That
4749	 * way, we can hope for locality in vdev_cache, plus it makes our
4750	 * fault domains something tractable.
4751	 */
4752	if (hintdva) {
4753		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
4754
4755		/*
4756		 * It's possible the vdev we're using as the hint no
4757		 * longer exists or its mg has been closed (e.g. by
4758		 * device removal).  Consult the rotor when
4759		 * all else fails.
4760		 */
4761		if (vd != NULL && vd->vdev_mg != NULL) {
4762			mg = vd->vdev_mg;
4763
4764			if (flags & METASLAB_HINTBP_AVOID &&
4765			    mg->mg_next != NULL)
4766				mg = mg->mg_next;
4767		} else {
4768			mg = mc->mc_rotor;
4769		}
4770	} else if (d != 0) {
4771		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
4772		mg = vd->vdev_mg->mg_next;
4773	} else {
4774		ASSERT(mc->mc_rotor != NULL);
4775		mg = mc->mc_rotor;
4776	}
4777
4778	/*
4779	 * If the hint put us into the wrong metaslab class, or into a
4780	 * metaslab group that has been passivated, just follow the rotor.
4781	 */
4782	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
4783		mg = mc->mc_rotor;
4784
4785	rotor = mg;
4786top:
4787	do {
4788		boolean_t allocatable;
4789
4790		ASSERT(mg->mg_activation_count == 1);
4791		vd = mg->mg_vd;
4792
4793		/*
4794		 * Don't allocate from faulted devices.
4795		 */
4796		if (try_hard) {
4797			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
4798			allocatable = vdev_allocatable(vd);
4799			spa_config_exit(spa, SCL_ZIO, FTAG);
4800		} else {
4801			allocatable = vdev_allocatable(vd);
4802		}
4803
4804		/*
4805		 * Determine if the selected metaslab group is eligible
4806		 * for allocations. If we're ganging then don't allow
4807		 * this metaslab group to skip allocations since that would
4808		 * inadvertently return ENOSPC and suspend the pool
4809		 * even though space is still available.
4810		 */
4811		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
4812			allocatable = metaslab_group_allocatable(mg, rotor,
4813			    psize, allocator, d);
4814		}
4815
4816		if (!allocatable) {
4817			metaslab_trace_add(zal, mg, NULL, psize, d,
4818			    TRACE_NOT_ALLOCATABLE, allocator);
4819			goto next;
4820		}
4821
4822		ASSERT(mg->mg_initialized);
4823
4824		/*
4825		 * Avoid writing single-copy data to a failing,
4826		 * non-redundant vdev, unless we've already tried all
4827		 * other vdevs.
4828		 */
4829		if ((vd->vdev_stat.vs_write_errors > 0 ||
4830		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
4831		    d == 0 && !try_hard && vd->vdev_children == 0) {
4832			metaslab_trace_add(zal, mg, NULL, psize, d,
4833			    TRACE_VDEV_ERROR, allocator);
4834			goto next;
4835		}
4836
4837		ASSERT(mg->mg_class == mc);
4838
4839		uint64_t asize = vdev_psize_to_asize(vd, psize);
4840		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
4841
4842		/*
4843		 * If we don't need to try hard, then require that the
4844		 * block be on a different metaslab from any other DVAs
4845		 * in this BP (unique=true).  If we are trying hard, then
4846		 * allow any metaslab to be used (unique=false).
4847		 */
4848		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
4849		    !try_hard, dva, d, allocator, try_hard);
4850
4851		if (offset != -1ULL) {
4852			/*
4853			 * If we've just selected this metaslab group,
4854			 * figure out whether the corresponding vdev is
4855			 * over- or under-used relative to the pool,
4856			 * and set an allocation bias to even it out.
4857			 */
4858			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
4859				vdev_stat_t *vs = &vd->vdev_stat;
4860				int64_t vu, cu;
4861
4862				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
4863				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
4864
4865				/*
4866				 * Calculate how much more or less we should
4867				 * try to allocate from this device during
4868				 * this iteration around the rotor.
4869				 * For example, if a device is 80% full
4870				 * and the pool is 20% full then we should
4871				 * reduce allocations by 60% on this device.
4872				 *
4873				 * mg_bias = (20 - 80) * 512K / 100 = -307K
4874				 *
4875				 * This reduces allocations by 307K for this
4876				 * iteration.
4877				 */
4878				mg->mg_bias = ((cu - vu) *
4879				    (int64_t)mg->mg_aliquot) / 100;
4880			} else if (!metaslab_bias_enabled) {
4881				mg->mg_bias = 0;
4882			}
4883
4884			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
4885			    mg->mg_aliquot + mg->mg_bias) {
4886				mc->mc_rotor = mg->mg_next;
4887				mc->mc_aliquot = 0;
4888			}
4889
4890			DVA_SET_VDEV(&dva[d], vd->vdev_id);
4891			DVA_SET_OFFSET(&dva[d], offset);
4892			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
4893			DVA_SET_ASIZE(&dva[d], asize);
4894
4895			return (0);
4896		}
4897next:
4898		mc->mc_rotor = mg->mg_next;
4899		mc->mc_aliquot = 0;
4900	} while ((mg = mg->mg_next) != rotor);
4901
4902	/*
4903	 * If we haven't tried hard, do so now.
4904	 */
4905	if (!try_hard) {
4906		try_hard = B_TRUE;
4907		goto top;
4908	}
4909
4910	bzero(&dva[d], sizeof (dva_t));
4911
4912	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
4913	return (SET_ERROR(ENOSPC));
4914}
4915
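/*
 * Schedule a free of the given segment on a concrete vdev: look up the
 * metaslab containing the offset, dirty the vdev if this is the first
 * free for that metaslab in the syncing txg, and add the range to the
 * metaslab's ms_checkpointing or ms_freeing tree.
 */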
4916void
4917metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
4918    boolean_t checkpoint)
4919{
4920	metaslab_t *msp;
4921	spa_t *spa = vd->vdev_spa;
4922
4923	ASSERT(vdev_is_concrete(vd));
4924	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4925	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4926
4927	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4928
4929	VERIFY(!msp->ms_condensing);
4930	VERIFY3U(offset, >=, msp->ms_start);
4931	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
4932	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
4933	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
4934
4935	metaslab_check_free_impl(vd, offset, asize);
4936
4937	mutex_enter(&msp->ms_lock);
4938	if (range_tree_is_empty(msp->ms_freeing) &&
4939	    range_tree_is_empty(msp->ms_checkpointing)) {
4940		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
4941	}
4942
4943	if (checkpoint) {
4944		ASSERT(spa_has_checkpoint(spa));
4945		range_tree_add(msp->ms_checkpointing, offset, asize);
4946	} else {
4947		range_tree_add(msp->ms_freeing, offset, asize);
4948	}
4949	mutex_exit(&msp->ms_lock);
4950}
4951
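/*
 * vdev_op_remap() callback used by metaslab_free_impl(): ranges that still
 * map onto another indirect vdev are marked obsolete, while ranges that
 * resolved to a concrete vdev are freed by recursing into
 * metaslab_free_impl().
 */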
4952/* ARGSUSED */
4953void
4954metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
4955    uint64_t size, void *arg)
4956{
4957	boolean_t *checkpoint = arg;
4958
4959	ASSERT3P(checkpoint, !=, NULL);
4960
4961	if (vd->vdev_ops->vdev_op_remap != NULL)
4962		vdev_indirect_mark_obsolete(vd, offset, size);
4963	else
4964		metaslab_free_impl(vd, offset, size, *checkpoint);
4965}
4966
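/*
 * Free the given segment, dispatching on the state of the vdev: segments on
 * a device that is currently being removed are handed to the removal code,
 * segments on indirect vdevs are marked obsolete and followed through the
 * remap mapping, and segments on concrete vdevs are freed via
 * metaslab_free_concrete().
 */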
4967static void
4968metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
4969    boolean_t checkpoint)
4970{
4971	spa_t *spa = vd->vdev_spa;
4972
4973	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4974
4975	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
4976		return;
4977
4978	if (spa->spa_vdev_removal != NULL &&
4979	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
4980	    vdev_is_concrete(vd)) {
4981		/*
4982		 * Note: we check if the vdev is concrete because when
4983		 * we complete the removal, we first change the vdev to be
4984		 * an indirect vdev (in open context), and then (in syncing
4985		 * context) clear spa_vdev_removal.
4986		 */
4987		free_from_removing_vdev(vd, offset, size);
4988	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
4989		vdev_indirect_mark_obsolete(vd, offset, size);
4990		vd->vdev_ops->vdev_op_remap(vd, offset, size,
4991		    metaslab_free_impl_cb, &checkpoint);
4992	} else {
4993		metaslab_free_concrete(vd, offset, size, checkpoint);
4994	}
4995}
4996
4997typedef struct remap_blkptr_cb_arg {
4998	blkptr_t *rbca_bp;
4999	spa_remap_cb_t rbca_cb;
5000	vdev_t *rbca_remap_vd;
5001	uint64_t rbca_remap_offset;
5002	void *rbca_cb_arg;
5003} remap_blkptr_cb_arg_t;
5004
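/*
 * vdev_op_remap() callback used by spa_remap_blkptr(): rewrite dva[0] of
 * the block pointer to the (vdev, offset) this level of the mapping
 * resolved to, update the BP's physical birth time from the indirect
 * vdev's birth records, and, if a callback was supplied, notify it of each
 * old (indirect) location.  Split blocks are left untouched.
 */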
5005void
5006remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5007    uint64_t size, void *arg)
5008{
5009	remap_blkptr_cb_arg_t *rbca = arg;
5010	blkptr_t *bp = rbca->rbca_bp;
5011
5012	/* We cannot remap split blocks. */
5013	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
5014		return;
5015	ASSERT0(inner_offset);
5016
5017	if (rbca->rbca_cb != NULL) {
5018		/*
5019		 * At this point we know that we are not handling split
5020		 * blocks and we invoke the callback on the previous
5021		 * vdev which must be indirect.
5022		 */
5023		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
5024
5025		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
5026		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
5027
5028		/* set up remap_blkptr_cb_arg for the next call */
5029		rbca->rbca_remap_vd = vd;
5030		rbca->rbca_remap_offset = offset;
5031	}
5032
5033	/*
5034	 * The phys birth time is that of dva[0].  This ensures that we know
5035	 * when each dva was written, so that resilver can determine which
5036	 * blocks need to be scrubbed (i.e. those written during the time
5037	 * the vdev was offline).  It also ensures that the key used in
5038	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
5039	 * we didn't change the phys_birth, a lookup in the ARC for a
5040	 * remapped BP could find the data that was previously stored at
5041	 * this vdev + offset.
5042	 */
5043	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
5044	    DVA_GET_VDEV(&bp->blk_dva[0]));
5045	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
5046	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
5047	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
5048
5049	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
5050	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
5051}
5052
5053/*
5054 * If the block pointer contains any indirect DVAs, modify them to refer to
5055 * concrete DVAs.  Note that this will sometimes not be possible, leaving
5056 * the indirect DVA in place.  This happens if the indirect DVA spans multiple
5057 * segments in the mapping (i.e. it is a "split block").
5058 *
5059 * If the BP was remapped, this function calls the callback on the original
5060 * dva (note that the callback can be called multiple times if the original
5061 * indirect DVA refers to another indirect DVA, etc).
5062 *
5063 * Returns TRUE if the BP was remapped.
5064 */
5065boolean_t
5066spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
5067{
5068	remap_blkptr_cb_arg_t rbca;
5069
5070	if (!zfs_remap_blkptr_enable)
5071		return (B_FALSE);
5072
5073	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
5074		return (B_FALSE);
5075
5076	/*
5077	 * Dedup BPs cannot be remapped, because ddt_phys_select() depends
5078	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
5079	 */
5080	if (BP_GET_DEDUP(bp))
5081		return (B_FALSE);
5082
5083	/*
5084	 * Gang blocks cannot be remapped, because
5085	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
5086	 * the BP used to read the gang block header (GBH) being the same
5087	 * as the DVA[0] that we allocated for the GBH.
5088	 */
5089	if (BP_IS_GANG(bp))
5090		return (B_FALSE);
5091
5092	/*
5093	 * Embedded BPs have no DVA to remap.
5094	 */
5095	if (BP_GET_NDVAS(bp) < 1)
5096		return (B_FALSE);
5097
5098	/*
5099	 * Note: we only remap dva[0].  If we remapped other dvas, we
5100	 * would no longer know what their phys birth txg is.
5101	 */
5102	dva_t *dva = &bp->blk_dva[0];
5103
5104	uint64_t offset = DVA_GET_OFFSET(dva);
5105	uint64_t size = DVA_GET_ASIZE(dva);
5106	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
5107
5108	if (vd->vdev_ops->vdev_op_remap == NULL)
5109		return (B_FALSE);
5110
5111	rbca.rbca_bp = bp;
5112	rbca.rbca_cb = callback;
5113	rbca.rbca_remap_vd = vd;
5114	rbca.rbca_remap_offset = offset;
5115	rbca.rbca_cb_arg = arg;
5116
5117	/*
5118	 * remap_blkptr_cb() will be called in order for each level of
5119	 * indirection, until a concrete vdev is reached or a split block is
5120	 * encountered.  rbca_remap_vd and rbca_remap_offset are updated within
5121	 * the callback as we go from one indirect vdev to the next (either
5122	 * concrete or indirect again) in that order.
5123	 */
5124	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
5125
5126	/* Check if the DVA wasn't remapped because it is a split block */
5127	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
5128		return (B_FALSE);
5129
5130	return (B_TRUE);
5131}
5132
5133/*
5134 * Undo the allocation of a DVA which happened in the given transaction group.
5135 */
5136void
5137metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5138{
5139	metaslab_t *msp;
5140	vdev_t *vd;
5141	uint64_t vdev = DVA_GET_VDEV(dva);
5142	uint64_t offset = DVA_GET_OFFSET(dva);
5143	uint64_t size = DVA_GET_ASIZE(dva);
5144
5145	ASSERT(DVA_IS_VALID(dva));
5146	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5147
5148	if (txg > spa_freeze_txg(spa))
5149		return;
5150
5151	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
5152	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
5153		cmn_err(CE_WARN, "metaslab_unalloc_dva(): bad DVA %llu:%llu",
5154		    (u_longlong_t)vdev, (u_longlong_t)offset);
5155		ASSERT(0);
5156		return;
5157	}
5158
5159	ASSERT(!vd->vdev_removing);
5160	ASSERT(vdev_is_concrete(vd));
5161	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
5162	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
5163
5164	if (DVA_GET_GANG(dva))
5165		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5166
5167	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5168
5169	mutex_enter(&msp->ms_lock);
5170	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
5171	    offset, size);
5172	msp->ms_allocating_total -= size;
5173
5174	VERIFY(!msp->ms_condensing);
5175	VERIFY3U(offset, >=, msp->ms_start);
5176	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
5177	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
5178	    msp->ms_size);
5179	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5180	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5181	range_tree_add(msp->ms_allocatable, offset, size);
5182	mutex_exit(&msp->ms_lock);
5183}
5184
5185/*
5186 * Free the block represented by the given DVA.
5187 */
5188void
5189metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
5190{
5191	uint64_t vdev = DVA_GET_VDEV(dva);
5192	uint64_t offset = DVA_GET_OFFSET(dva);
5193	uint64_t size = DVA_GET_ASIZE(dva);
5194	vdev_t *vd = vdev_lookup_top(spa, vdev);
5195
5196	ASSERT(DVA_IS_VALID(dva));
5197	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5198
5199	if (DVA_GET_GANG(dva)) {
5200		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5201	}
5202
5203	metaslab_free_impl(vd, offset, size, checkpoint);
5204}
5205
5206/*
5207 * Reserve some allocation slots. The reservation system must be called
5208 * before we call into the allocator. If there aren't any available slots
5209 * then the I/O will be throttled until an I/O completes and its slots are
5210 * freed up. The function returns true if it was successful in placing
5211 * the reservation.
5212 */
5213boolean_t
5214metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
5215    zio_t *zio, int flags)
5216{
5217	uint64_t available_slots = 0;
5218	boolean_t slot_reserved = B_FALSE;
5219	uint64_t max = mc->mc_alloc_max_slots[allocator];
5220
5221	ASSERT(mc->mc_alloc_throttle_enabled);
5222	mutex_enter(&mc->mc_lock);
5223
5224	uint64_t reserved_slots =
5225	    zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
5226	if (reserved_slots < max)
5227		available_slots = max - reserved_slots;
5228
5229	if (slots <= available_slots || GANG_ALLOCATION(flags) ||
5230	    flags & METASLAB_MUST_RESERVE) {
5231		/*
5232		 * We reserve the slots individually so that we can unreserve
5233		 * them individually when an I/O completes.
5234		 */
5235		for (int d = 0; d < slots; d++) {
5236			reserved_slots =
5237			    zfs_refcount_add(&mc->mc_alloc_slots[allocator],
5238			    zio);
5239		}
5240		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
5241		slot_reserved = B_TRUE;
5242	}
5243
5244	mutex_exit(&mc->mc_lock);
5245	return (slot_reserved);
5246}
5247
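/*
 * Release the allocation slots that were reserved for this zio by
 * metaslab_class_throttle_reserve().
 */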
5248void
5249metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
5250    int allocator, zio_t *zio)
5251{
5252	ASSERT(mc->mc_alloc_throttle_enabled);
5253	mutex_enter(&mc->mc_lock);
5254	for (int d = 0; d < slots; d++) {
5255		(void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
5256		    zio);
5257	}
5258	mutex_exit(&mc->mc_lock);
5259}
5260
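/*
 * Claim the given segment on a concrete vdev: activate the containing
 * metaslab if necessary, verify that the segment is still free, and move
 * it from ms_allocatable into the allocating tree for this txg.  A txg of
 * 0 indicates a dry run that only checks whether the claim would succeed.
 */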
5261static int
5262metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
5263    uint64_t txg)
5264{
5265	metaslab_t *msp;
5266	spa_t *spa = vd->vdev_spa;
5267	int error = 0;
5268
5269	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
5270		return (ENXIO);
5271
5272	ASSERT3P(vd->vdev_ms, !=, NULL);
5273	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5274
5275	mutex_enter(&msp->ms_lock);
5276
5277	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
5278		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
5279	/*
5280	 * No need to fail in that case; someone else has activated the
5281	 * metaslab, but that doesn't preclude us from using it.
5282	 */
5283	if (error == EBUSY)
5284		error = 0;
5285
5286	if (error == 0 &&
5287	    !range_tree_contains(msp->ms_allocatable, offset, size))
5288		error = SET_ERROR(ENOENT);
5289
5290	if (error || txg == 0) {	/* txg == 0 indicates dry run */
5291		mutex_exit(&msp->ms_lock);
5292		return (error);
5293	}
5294
5295	VERIFY(!msp->ms_condensing);
5296	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5297	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5298	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
5299	    msp->ms_size);
5300	range_tree_remove(msp->ms_allocatable, offset, size);
5301	range_tree_clear(msp->ms_trim, offset, size);
5302
5303	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
5304		metaslab_class_t *mc = msp->ms_group->mg_class;
5305		multilist_sublist_t *mls =
5306		    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
5307		if (!multilist_link_active(&msp->ms_class_txg_node)) {
5308			msp->ms_selected_txg = txg;
5309			multilist_sublist_insert_head(mls, msp);
5310		}
5311		multilist_sublist_unlock(mls);
5312
5313		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
5314			vdev_dirty(vd, VDD_METASLAB, msp, txg);
5315		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
5316		    offset, size);
5317		msp->ms_allocating_total += size;
5318	}
5319
5320	mutex_exit(&msp->ms_lock);
5321
5322	return (0);
5323}
5324
5325typedef struct metaslab_claim_cb_arg_t {
5326	uint64_t	mcca_txg;
5327	int		mcca_error;
5328} metaslab_claim_cb_arg_t;
5329
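/*
 * vdev_op_remap() callback used by metaslab_claim_impl(): claim each
 * concrete segment that the original range maps to, stopping at the
 * first error.
 */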
5330/* ARGSUSED */
5331static void
5332metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5333    uint64_t size, void *arg)
5334{
5335	metaslab_claim_cb_arg_t *mcca_arg = arg;
5336
5337	if (mcca_arg->mcca_error == 0) {
5338		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
5339		    size, mcca_arg->mcca_txg);
5340	}
5341}
5342
5343int
5344metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
5345{
5346	if (vd->vdev_ops->vdev_op_remap != NULL) {
5347		metaslab_claim_cb_arg_t arg;
5348
5349		/*
5350		 * Only zdb(1M) can claim on indirect vdevs.  This is used
5351		 * to detect leaks of mapped space (that are not accounted
5352		 * for in the obsolete counts, spacemap, or bpobj).
5353		 */
5354		ASSERT(!spa_writeable(vd->vdev_spa));
5355		arg.mcca_error = 0;
5356		arg.mcca_txg = txg;
5357
5358		vd->vdev_ops->vdev_op_remap(vd, offset, size,
5359		    metaslab_claim_impl_cb, &arg);
5360
5361		if (arg.mcca_error == 0) {
5362			arg.mcca_error = metaslab_claim_concrete(vd,
5363			    offset, size, txg);
5364		}
5365		return (arg.mcca_error);
5366	} else {
5367		return (metaslab_claim_concrete(vd, offset, size, txg));
5368	}
5369}
5370
5371/*
5372 * Intent log support: upon opening the pool after a crash, notify the SPA
5373 * of blocks that the intent log has allocated for immediate write, but
5374 * which are still considered free by the SPA because the last transaction
5375 * group didn't commit yet.
5376 */
5377static int
5378metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5379{
5380	uint64_t vdev = DVA_GET_VDEV(dva);
5381	uint64_t offset = DVA_GET_OFFSET(dva);
5382	uint64_t size = DVA_GET_ASIZE(dva);
5383	vdev_t *vd;
5384
5385	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
5386		return (SET_ERROR(ENXIO));
5387	}
5388
5389	ASSERT(DVA_IS_VALID(dva));
5390
5391	if (DVA_GET_GANG(dva))
5392		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5393
5394	return (metaslab_claim_impl(vd, offset, size, txg));
5395}
5396
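/*
 * Allocate ndvas DVAs of psize bytes for the given block pointer.  On
 * success the BP's birth txg is set; on failure any DVAs that were already
 * allocated are unwound before returning the error.
 */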
5397int
5398metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
5399    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
5400    zio_alloc_list_t *zal, zio_t *zio, int allocator)
5401{
5402	dva_t *dva = bp->blk_dva;
5403	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
5404	int error = 0;
5405
5406	ASSERT(bp->blk_birth == 0);
5407	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
5408
5409	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5410
5411	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
5412		spa_config_exit(spa, SCL_ALLOC, FTAG);
5413		return (SET_ERROR(ENOSPC));
5414	}
5415
5416	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
5417	ASSERT(BP_GET_NDVAS(bp) == 0);
5418	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
5419	ASSERT3P(zal, !=, NULL);
5420
5421	for (int d = 0; d < ndvas; d++) {
5422		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
5423		    txg, flags, zal, allocator);
5424		if (error != 0) {
5425			for (d--; d >= 0; d--) {
5426				metaslab_unalloc_dva(spa, &dva[d], txg);
5427				metaslab_group_alloc_decrement(spa,
5428				    DVA_GET_VDEV(&dva[d]), zio, flags,
5429				    allocator, B_FALSE);
5430				bzero(&dva[d], sizeof (dva_t));
5431			}
5432			spa_config_exit(spa, SCL_ALLOC, FTAG);
5433			return (error);
5434		} else {
5435			/*
5436			 * Update the metaslab group's queue depth
5437			 * based on the newly allocated dva.
5438			 */
5439			metaslab_group_alloc_increment(spa,
5440			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
5441		}
5442
5443	}
5444	ASSERT(error == 0);
5445	ASSERT(BP_GET_NDVAS(bp) == ndvas);
5446
5447	spa_config_exit(spa, SCL_ALLOC, FTAG);
5448
5449	BP_SET_BIRTH(bp, txg, txg);
5450
5451	return (0);
5452}
5453
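/*
 * Free every DVA of the given block pointer.  If 'now' is set the block is
 * being freed in the txg that allocated it, so the allocation is simply
 * undone; otherwise the frees are scheduled for the syncing txg and, when
 * appropriate, accounted to the pool checkpoint.
 */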
5454void
5455metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
5456{
5457	const dva_t *dva = bp->blk_dva;
5458	int ndvas = BP_GET_NDVAS(bp);
5459
5460	ASSERT(!BP_IS_HOLE(bp));
5461	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
5462
5463	/*
5464	 * If we have a checkpoint for the pool, we need to make sure that
5465	 * any blocks we free that are part of the checkpoint won't be
5466	 * reused until the checkpoint is discarded or we revert to it.
5467	 *
5468	 * The checkpoint flag is passed down the metaslab_free code path
5469	 * and is set whenever we want to add a block to the checkpoint's
5470	 * accounting. That is, we "checkpoint" blocks that existed at the
5471	 * time the checkpoint was created and are therefore referenced by
5472	 * the checkpointed uberblock.
5473	 *
5474	 * Note that we don't checkpoint any blocks if the current
5475	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
5476	 * normally as they will be referenced by the checkpointed uberblock.
5477	 */
5478	boolean_t checkpoint = B_FALSE;
5479	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
5480	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
5481		/*
5482		 * At this point, if the block is part of the checkpoint
5483		 * there is no way it was created in the current txg.
5484		 */
5485		ASSERT(!now);
5486		ASSERT3U(spa_syncing_txg(spa), ==, txg);
5487		checkpoint = B_TRUE;
5488	}
5489
5490	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
5491
5492	for (int d = 0; d < ndvas; d++) {
5493		if (now) {
5494			metaslab_unalloc_dva(spa, &dva[d], txg);
5495		} else {
5496			ASSERT3U(txg, ==, spa_syncing_txg(spa));
5497			metaslab_free_dva(spa, &dva[d], checkpoint);
5498		}
5499	}
5500
5501	spa_config_exit(spa, SCL_FREE, FTAG);
5502}
5503
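/*
 * Claim every DVA of the given block pointer, performing a dry run first
 * so that we never have to unwind a partially successful claim.
 */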
5504int
5505metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
5506{
5507	const dva_t *dva = bp->blk_dva;
5508	int ndvas = BP_GET_NDVAS(bp);
5509	int error = 0;
5510
5511	ASSERT(!BP_IS_HOLE(bp));
5512
5513	if (txg != 0) {
5514		/*
5515		 * First do a dry run to make sure all DVAs are claimable,
5516		 * so we don't have to unwind from partial failures below.
5517		 */
5518		if ((error = metaslab_claim(spa, bp, 0)) != 0)
5519			return (error);
5520	}
5521
5522	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5523
5524	for (int d = 0; d < ndvas; d++) {
5525		error = metaslab_claim_dva(spa, &dva[d], txg);
5526		if (error != 0)
5527			break;
5528	}
5529
5530	spa_config_exit(spa, SCL_ALLOC, FTAG);
5531
5532	ASSERT(error == 0 || txg == 0);
5533
5534	return (error);
5535}
5536
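/*
 * vdev_op_remap() callback used by metaslab_check_free_impl(): verify each
 * concrete location the segment maps to, skipping locations that are still
 * on indirect vdevs.
 */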
5537/* ARGSUSED */
5538static void
5539metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
5540    uint64_t size, void *arg)
5541{
5542	if (vd->vdev_ops == &vdev_indirect_ops)
5543		return;
5544
5545	metaslab_check_free_impl(vd, offset, size);
5546}
5547
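/*
 * Debugging aid (enabled via ZFS_DEBUG_ZIO_FREE): verify that the segment
 * being freed is not currently allocatable and is not already present in
 * any tree of its metaslab's freeing pipeline or in its trim tree.
 */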
5548static void
5549metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
5550{
5551	metaslab_t *msp;
5552	spa_t *spa = vd->vdev_spa;
5553
5554	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5555		return;
5556
5557	if (vd->vdev_ops->vdev_op_remap != NULL) {
5558		vd->vdev_ops->vdev_op_remap(vd, offset, size,
5559		    metaslab_check_free_impl_cb, NULL);
5560		return;
5561	}
5562
5563	ASSERT(vdev_is_concrete(vd));
5564	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5565	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5566
5567	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5568
5569	mutex_enter(&msp->ms_lock);
5570	if (msp->ms_loaded) {
5571		range_tree_verify_not_present(msp->ms_allocatable,
5572		    offset, size);
5573	}
5574
5575	/*
5576	 * Check all segments that currently exist in the freeing pipeline.
5577	 *
5578	 * It would intuitively make sense to also check the current allocating
5579	 * tree since metaslab_unalloc_dva() exists for extents that are
5580	 * allocated and freed in the same sync pass within the same txg.
5581	 * Unfortunately there are places (e.g. the ZIL) where we allocate a
5582	 * segment but then we free part of it within the same txg
5583	 * [see zil_sync()]. Thus, we don't call range_tree_verify_not_present()
5584	 * in the current allocating tree.
5585	 */
5586	range_tree_verify_not_present(msp->ms_freeing, offset, size);
5587	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
5588	range_tree_verify_not_present(msp->ms_freed, offset, size);
5589	for (int j = 0; j < TXG_DEFER_SIZE; j++)
5590		range_tree_verify_not_present(msp->ms_defer[j], offset, size);
5591	range_tree_verify_not_present(msp->ms_trim, offset, size);
5592	mutex_exit(&msp->ms_lock);
5593}
5594
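/*
 * Debugging aid: run metaslab_check_free_impl() on every DVA of a block
 * pointer that is about to be freed.
 */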
5595void
5596metaslab_check_free(spa_t *spa, const blkptr_t *bp)
5597{
5598	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5599		return;
5600
5601	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
5602	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
5603		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
5604		vdev_t *vd = vdev_lookup_top(spa, vdev);
5605		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
5606		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
5607
5608		if (DVA_GET_GANG(&bp->blk_dva[i]))
5609			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5610
5611		ASSERT3P(vd, !=, NULL);
5612
5613		metaslab_check_free_impl(vd, offset, size);
5614	}
5615	spa_config_exit(spa, SCL_VDEV, FTAG);
5616}
5617
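/*
 * Wait until no other thread is updating the group's disabled-metaslab
 * count (i.e. until mg_disabled_updating is clear).  The caller must hold
 * mg_ms_disabled_lock.
 */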
5618static void
5619metaslab_group_disable_wait(metaslab_group_t *mg)
5620{
5621	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5622	while (mg->mg_disabled_updating) {
5623		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5624	}
5625}
5626
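/*
 * Increment the number of disabled metaslabs in the group, blocking while
 * the group is already at the max_disabled_ms limit.
 */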
5627static void
5628metaslab_group_disabled_increment(metaslab_group_t *mg)
5629{
5630	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5631	ASSERT(mg->mg_disabled_updating);
5632
5633	while (mg->mg_ms_disabled >= max_disabled_ms) {
5634		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5635	}
5636	mg->mg_ms_disabled++;
5637	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
5638}
5639
5640/*
5641 * Mark the metaslab as disabled to prevent any allocations on this metaslab.
5642 * We must also track how many metaslabs are currently disabled within a
5643 * metaslab group and limit them to prevent allocation failures from
5644 * occurring because all metaslabs are disabled.
5645 */
5646void
5647metaslab_disable(metaslab_t *msp)
5648{
5649	ASSERT(!MUTEX_HELD(&msp->ms_lock));
5650	metaslab_group_t *mg = msp->ms_group;
5651
5652	mutex_enter(&mg->mg_ms_disabled_lock);
5653
5654	/*
5655	 * To keep an accurate count of how many threads have disabled
5656	 * a specific metaslab group, we only allow one thread to mark
5657	 * the metaslab group at a time. This ensures that the value of
5658	 * ms_disabled will be accurate when we decide to mark a metaslab
5659	 * group as disabled. To do this we force all other threads
5660	 * to wait until the metaslab group's mg_disabled_updating flag
5661	 * is no longer set.
5662	 */
5663	metaslab_group_disable_wait(mg);
5664	mg->mg_disabled_updating = B_TRUE;
5665	if (msp->ms_disabled == 0) {
5666		metaslab_group_disabled_increment(mg);
5667	}
5668	mutex_enter(&msp->ms_lock);
5669	msp->ms_disabled++;
5670	mutex_exit(&msp->ms_lock);
5671
5672	mg->mg_disabled_updating = B_FALSE;
5673	cv_broadcast(&mg->mg_ms_disabled_cv);
5674	mutex_exit(&mg->mg_ms_disabled_lock);
5675}
5676
5677void
5678metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
5679{
5680	metaslab_group_t *mg = msp->ms_group;
5681	spa_t *spa = mg->mg_vd->vdev_spa;
5682
5683	/*
5684	 * Wait for the outstanding I/O to be synced to prevent newly
5685	 * allocated blocks from being overwritten.  This is used by
5686	 * initialize and TRIM, which are modifying unallocated space.
5687	 */
5688	if (sync)
5689		txg_wait_synced(spa_get_dsl(spa), 0);
5690
5691	mutex_enter(&mg->mg_ms_disabled_lock);
5692	mutex_enter(&msp->ms_lock);
5693	if (--msp->ms_disabled == 0) {
5694		mg->mg_ms_disabled--;
5695		cv_broadcast(&mg->mg_ms_disabled_cv);
5696		if (unload)
5697			metaslab_unload(msp);
5698	}
5699	mutex_exit(&msp->ms_lock);
5700	mutex_exit(&mg->mg_ms_disabled_lock);
5701}
5702
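/*
 * Persist the metaslab's unflushed txg: write its metaslab_unflushed_phys_t
 * entry (indexed by metaslab id) into the object referenced by the vdev's
 * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS ZAP entry, creating that object on
 * first use.
 */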
5703static void
5704metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
5705{
5706	vdev_t *vd = ms->ms_group->mg_vd;
5707	spa_t *spa = vd->vdev_spa;
5708	objset_t *mos = spa_meta_objset(spa);
5709
5710	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
5711
5712	metaslab_unflushed_phys_t entry = {
5713		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
5714	};
5715	uint64_t entry_size = sizeof (entry);
5716	uint64_t entry_offset = ms->ms_id * entry_size;
5717
5718	uint64_t object = 0;
5719	int err = zap_lookup(mos, vd->vdev_top_zap,
5720	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
5721	    &object);
5722	if (err == ENOENT) {
5723		object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
5724		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
5725		VERIFY0(zap_add(mos, vd->vdev_top_zap,
5726		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
5727		    &object, tx));
5728	} else {
5729		VERIFY0(err);
5730	}
5731
5732	dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
5733	    &entry, tx);
5734}
5735
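/*
 * Update the metaslab's unflushed txg both in core and on disk (see
 * metaslab_update_ondisk_flush_data()).  This is a no-op when the log
 * space map feature is not active.
 */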
5736void
5737metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
5738{
5739	spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
5740
5741	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
5742		return;
5743
5744	ms->ms_unflushed_txg = txg;
5745	metaslab_update_ondisk_flush_data(ms, tx);
5746}
5747
5748uint64_t
5749metaslab_unflushed_txg(metaslab_t *ms)
5750{
5751	return (ms->ms_unflushed_txg);
5752}
5753