xref: /illumos-gate/usr/src/uts/common/fs/zfs/metaslab.c (revision 0f7643c7376dd69a08acbfc9d1d7d548b10c846a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/dmu.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/space_map.h>
32 #include <sys/metaslab_impl.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/zio.h>
35 #include <sys/spa_impl.h>
36 #include <sys/zfeature.h>
37 
38 #define	GANG_ALLOCATION(flags) \
39 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
40 
41 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
42 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
43 #define	METASLAB_ACTIVE_MASK		\
44 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
45 
46 uint64_t metaslab_aliquot = 512ULL << 10;
47 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
48 
49 /*
50  * The in-core space map representation is more compact than its on-disk form.
51  * The zfs_condense_pct determines how much more compact the in-core
52  * space_map representation must be before we compact it on-disk.
53  * Values should be greater than or equal to 100.
54  */
55 int zfs_condense_pct = 200;
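/*
 * Worked example (illustrative, derived from metaslab_should_condense()
 * below): with the default zfs_condense_pct of 200, a space map whose
 * minimal in-core form would be 1MB is only considered for condensing
 * once its on-disk representation has grown to at least 2MB.
 */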
56 
57 /*
58  * Condensing a metaslab is not guaranteed to actually reduce the amount of
59  * space used on disk. In particular, a space map uses data in increments of
60  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
61  * same number of blocks after condensing. Since the goal of condensing is to
62  * reduce the number of IOPs required to read the space map, we only want to
63  * condense when we can be sure we will reduce the number of blocks used by the
64  * space map. Unfortunately, we cannot precisely compute whether or not this is
65  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
66  * we apply the following heuristic: do not condense a spacemap unless the
67  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
68  * blocks.
69  */
70 int zfs_metaslab_condense_block_threshold = 4;
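/*
 * Illustrative example: assuming the space map object uses 4KB blocks
 * (the actual record size is MAX(doi_data_block_size, 1 << vdev_ashift);
 * see metaslab_should_condense()), a threshold of 4 means a space map is
 * only condensed once it occupies more than 16KB on disk.
 */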
71 
72 /*
73  * The zfs_mg_noalloc_threshold defines which metaslab groups should
74  * be eligible for allocation. The value is defined as a percentage of
75  * free space. Metaslab groups that have more free space than
76  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
77  * a metaslab group's free space is less than or equal to the
78  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
79  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
80  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
81  * groups are allowed to accept allocations. Gang blocks are always
82  * eligible to allocate on any metaslab group. The default value of 0 means
83  * no metaslab group will be excluded based on this criterion.
84  */
85 int zfs_mg_noalloc_threshold = 0;
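/*
 * Hypothetical tuning example (not a recommendation): setting
 * zfs_mg_noalloc_threshold to 10 would cause metaslab groups with 10% or
 * less free space to be skipped for new allocations as long as at least
 * one group in the pool still has more than 10% free.
 */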
86 
87 /*
88  * Metaslab groups are considered eligible for allocations if their
89  * fragmentation metric (measured as a percentage) is less than or equal to
90  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
91  * then it will be skipped unless all metaslab groups within the metaslab
92  * class have also crossed this threshold.
93  */
94 int zfs_mg_fragmentation_threshold = 85;
95 
96 /*
97  * Allow metaslabs to keep their active state as long as their fragmentation
98  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
99  * active metaslab that exceeds this threshold will no longer keep its active
100  * status, allowing better metaslabs to be selected.
101  */
102 int zfs_metaslab_fragmentation_threshold = 70;
103 
104 /*
105  * When set will load all metaslabs when pool is first opened.
106  */
107 int metaslab_debug_load = 0;
108 
109 /*
110  * When set will prevent metaslabs from being unloaded.
111  */
112 int metaslab_debug_unload = 0;
113 
114 /*
115  * Minimum size which forces the dynamic allocator to change
116  * its allocation strategy.  Once the space map cannot satisfy
117  * an allocation of this size then it switches to using a more
118  * aggressive strategy (i.e. search by size rather than offset).
119  */
120 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
121 
122 /*
123  * The minimum free space, in percent, which must be available
124  * in a space map to continue allocations in a first-fit fashion.
125  * Once the space_map's free space drops below this level we dynamically
126  * switch to using best-fit allocations.
127  */
128 int metaslab_df_free_pct = 4;
129 
130 /*
131  * A metaslab is considered "free" if it contains a contiguous
132  * segment which is greater than metaslab_min_alloc_size.
133  */
134 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
135 
136 /*
137  * Percentage of all cpus that can be used by the metaslab taskq.
138  */
139 int metaslab_load_pct = 50;
140 
141 /*
142  * Determines how many txgs a metaslab may remain loaded without having any
143  * allocations from it. As long as a metaslab continues to be used we will
144  * keep it loaded.
145  */
146 int metaslab_unload_delay = TXG_SIZE * 2;
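/*
 * With TXG_SIZE == 4, the default keeps an idle metaslab loaded for
 * roughly eight txgs after its last use before it becomes eligible for
 * unloading at sync time.
 */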
147 
148 /*
149  * Max number of metaslabs per group to preload.
150  */
151 int metaslab_preload_limit = SPA_DVAS_PER_BP;
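/*
 * SPA_DVAS_PER_BP is 3, so by default at most three metaslabs per group
 * are preloaded each pass, plus any metaslabs flagged for condensing
 * (see metaslab_group_preload()).
 */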
152 
153 /*
154  * Enable/disable preloading of metaslabs.
155  */
156 boolean_t metaslab_preload_enabled = B_TRUE;
157 
158 /*
159  * Enable/disable fragmentation weighting on metaslabs.
160  */
161 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
162 
163 /*
164  * Enable/disable lba weighting (i.e. outer tracks are given preference).
165  */
166 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
167 
168 /*
169  * Enable/disable metaslab group biasing.
170  */
171 boolean_t metaslab_bias_enabled = B_TRUE;
172 
173 static uint64_t metaslab_fragmentation(metaslab_t *);
174 
175 /*
176  * ==========================================================================
177  * Metaslab classes
178  * ==========================================================================
179  */
180 metaslab_class_t *
181 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
182 {
183 	metaslab_class_t *mc;
184 
185 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
186 
187 	mc->mc_spa = spa;
188 	mc->mc_rotor = NULL;
189 	mc->mc_ops = ops;
190 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
191 	refcount_create_tracked(&mc->mc_alloc_slots);
192 
193 	return (mc);
194 }
195 
196 void
197 metaslab_class_destroy(metaslab_class_t *mc)
198 {
199 	ASSERT(mc->mc_rotor == NULL);
200 	ASSERT(mc->mc_alloc == 0);
201 	ASSERT(mc->mc_deferred == 0);
202 	ASSERT(mc->mc_space == 0);
203 	ASSERT(mc->mc_dspace == 0);
204 
205 	refcount_destroy(&mc->mc_alloc_slots);
206 	mutex_destroy(&mc->mc_lock);
207 	kmem_free(mc, sizeof (metaslab_class_t));
208 }
209 
210 int
211 metaslab_class_validate(metaslab_class_t *mc)
212 {
213 	metaslab_group_t *mg;
214 	vdev_t *vd;
215 
216 	/*
217 	 * Must hold one of the spa_config locks.
218 	 */
219 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
220 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
221 
222 	if ((mg = mc->mc_rotor) == NULL)
223 		return (0);
224 
225 	do {
226 		vd = mg->mg_vd;
227 		ASSERT(vd->vdev_mg != NULL);
228 		ASSERT3P(vd->vdev_top, ==, vd);
229 		ASSERT3P(mg->mg_class, ==, mc);
230 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
231 	} while ((mg = mg->mg_next) != mc->mc_rotor);
232 
233 	return (0);
234 }
235 
236 void
237 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
238     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
239 {
240 	atomic_add_64(&mc->mc_alloc, alloc_delta);
241 	atomic_add_64(&mc->mc_deferred, defer_delta);
242 	atomic_add_64(&mc->mc_space, space_delta);
243 	atomic_add_64(&mc->mc_dspace, dspace_delta);
244 }
245 
246 uint64_t
247 metaslab_class_get_alloc(metaslab_class_t *mc)
248 {
249 	return (mc->mc_alloc);
250 }
251 
252 uint64_t
253 metaslab_class_get_deferred(metaslab_class_t *mc)
254 {
255 	return (mc->mc_deferred);
256 }
257 
258 uint64_t
259 metaslab_class_get_space(metaslab_class_t *mc)
260 {
261 	return (mc->mc_space);
262 }
263 
264 uint64_t
265 metaslab_class_get_dspace(metaslab_class_t *mc)
266 {
267 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
268 }
269 
270 void
271 metaslab_class_histogram_verify(metaslab_class_t *mc)
272 {
273 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
274 	uint64_t *mc_hist;
275 	int i;
276 
277 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
278 		return;
279 
280 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
281 	    KM_SLEEP);
282 
283 	for (int c = 0; c < rvd->vdev_children; c++) {
284 		vdev_t *tvd = rvd->vdev_child[c];
285 		metaslab_group_t *mg = tvd->vdev_mg;
286 
287 		/*
288 		 * Skip any holes, uninitialized top-levels, or
289 		 * vdevs that are not in this metaslab class.
290 		 */
291 		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
292 		    mg->mg_class != mc) {
293 			continue;
294 		}
295 
296 		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
297 			mc_hist[i] += mg->mg_histogram[i];
298 	}
299 
300 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
301 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
302 
303 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
304 }
305 
306 /*
307  * Calculate the metaslab class's fragmentation metric. The metric
308  * is weighted based on the space contribution of each metaslab group.
309  * The return value will be a number between 0 and 100 (inclusive), or
310  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
311  * zfs_frag_table for more information about the metric.
312  */
313 uint64_t
314 metaslab_class_fragmentation(metaslab_class_t *mc)
315 {
316 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
317 	uint64_t fragmentation = 0;
318 
319 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
320 
321 	for (int c = 0; c < rvd->vdev_children; c++) {
322 		vdev_t *tvd = rvd->vdev_child[c];
323 		metaslab_group_t *mg = tvd->vdev_mg;
324 
325 		/*
326 		 * Skip any holes, uninitialized top-levels, or
327 		 * vdevs that are not in this metaslab class.
328 		 */
329 		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
330 		    mg->mg_class != mc) {
331 			continue;
332 		}
333 
334 		/*
335 		 * If a metaslab group does not contain a fragmentation
336 		 * metric then just bail out.
337 		 */
338 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
339 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
340 			return (ZFS_FRAG_INVALID);
341 		}
342 
343 		/*
344 		 * Determine how much this metaslab_group is contributing
345 		 * to the overall pool fragmentation metric.
346 		 */
347 		fragmentation += mg->mg_fragmentation *
348 		    metaslab_group_get_space(mg);
349 	}
350 	fragmentation /= metaslab_class_get_space(mc);
351 
352 	ASSERT3U(fragmentation, <=, 100);
353 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
354 	return (fragmentation);
355 }
356 
357 /*
358  * Calculate the amount of expandable space that is available in
359  * this metaslab class. If a device is expanded then its expandable
360  * space will be the amount of allocatable space that is currently not
361  * part of this metaslab class.
362  */
363 uint64_t
364 metaslab_class_expandable_space(metaslab_class_t *mc)
365 {
366 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
367 	uint64_t space = 0;
368 
369 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
370 	for (int c = 0; c < rvd->vdev_children; c++) {
371 		vdev_t *tvd = rvd->vdev_child[c];
372 		metaslab_group_t *mg = tvd->vdev_mg;
373 
374 		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
375 		    mg->mg_class != mc) {
376 			continue;
377 		}
378 
379 		/*
380 		 * Calculate if we have enough space to add additional
381 		 * metaslabs. We report the expandable space in terms
382 		 * of the metaslab size since that's the unit of expansion.
383 		 */
384 		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
385 		    1ULL << tvd->vdev_ms_shift);
386 	}
387 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
388 	return (space);
389 }
390 
391 /*
392  * ==========================================================================
393  * Metaslab groups
394  * ==========================================================================
395  */
396 static int
397 metaslab_compare(const void *x1, const void *x2)
398 {
399 	const metaslab_t *m1 = x1;
400 	const metaslab_t *m2 = x2;
401 
402 	if (m1->ms_weight < m2->ms_weight)
403 		return (1);
404 	if (m1->ms_weight > m2->ms_weight)
405 		return (-1);
406 
407 	/*
408 	 * If the weights are identical, use the offset to force uniqueness.
409 	 */
410 	if (m1->ms_start < m2->ms_start)
411 		return (-1);
412 	if (m1->ms_start > m2->ms_start)
413 		return (1);
414 
415 	ASSERT3P(m1, ==, m2);
416 
417 	return (0);
418 }
419 
420 /*
421  * Update the allocatable flag and the metaslab group's capacity.
422  * The allocatable flag is set to true if the group's free capacity is
423  * above zfs_mg_noalloc_threshold and its fragmentation metric, when
424  * valid, is at or below zfs_mg_fragmentation_threshold. If a metaslab
425  * group transitions from allocatable to non-allocatable or vice versa then the
426  * metaslab group's class is updated to reflect the transition.
427  */
428 static void
429 metaslab_group_alloc_update(metaslab_group_t *mg)
430 {
431 	vdev_t *vd = mg->mg_vd;
432 	metaslab_class_t *mc = mg->mg_class;
433 	vdev_stat_t *vs = &vd->vdev_stat;
434 	boolean_t was_allocatable;
435 	boolean_t was_initialized;
436 
437 	ASSERT(vd == vd->vdev_top);
438 
439 	mutex_enter(&mg->mg_lock);
440 	was_allocatable = mg->mg_allocatable;
441 	was_initialized = mg->mg_initialized;
442 
443 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
444 	    (vs->vs_space + 1);
445 
446 	mutex_enter(&mc->mc_lock);
447 
448 	/*
449 	 * If the metaslab group was just added then it won't
450 	 * have any space until we finish syncing out this txg.
451 	 * At that point we will consider it initialized and available
452 	 * for allocations.  We also don't consider non-activated
453 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
454 	 * to be initialized, because they can't be used for allocation.
455 	 */
456 	mg->mg_initialized = metaslab_group_initialized(mg);
457 	if (!was_initialized && mg->mg_initialized) {
458 		mc->mc_groups++;
459 	} else if (was_initialized && !mg->mg_initialized) {
460 		ASSERT3U(mc->mc_groups, >, 0);
461 		mc->mc_groups--;
462 	}
463 	if (mg->mg_initialized)
464 		mg->mg_no_free_space = B_FALSE;
465 
466 	/*
467 	 * A metaslab group is considered allocatable if it has plenty
468 	 * of free space or is not heavily fragmented. We only take
469 	 * fragmentation into account if the metaslab group has a valid
470 	 * fragmentation metric (i.e. a value between 0 and 100).
471 	 */
472 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
473 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
474 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
475 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
476 
477 	/*
478 	 * The mc_alloc_groups maintains a count of the number of
479 	 * groups in this metaslab class that are still above the
480 	 * zfs_mg_noalloc_threshold. This is used by the allocating
481 	 * threads to determine if they should avoid allocations to
482 	 * a given group. The allocator will avoid allocations to a group
483 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
484 	 * and there are still other groups that are above the threshold.
485 	 * When a group transitions from allocatable to non-allocatable or
486 	 * vice versa we update the metaslab class to reflect that change.
487 	 * When the mc_alloc_groups value drops to 0 that means that all
488 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
489 	 * eligible for allocations. This effectively means that all devices
490 	 * are balanced again.
491 	 */
492 	if (was_allocatable && !mg->mg_allocatable)
493 		mc->mc_alloc_groups--;
494 	else if (!was_allocatable && mg->mg_allocatable)
495 		mc->mc_alloc_groups++;
496 	mutex_exit(&mc->mc_lock);
497 
498 	mutex_exit(&mg->mg_lock);
499 }
500 
501 metaslab_group_t *
502 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
503 {
504 	metaslab_group_t *mg;
505 
506 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
507 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
508 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
509 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
510 	mg->mg_vd = vd;
511 	mg->mg_class = mc;
512 	mg->mg_activation_count = 0;
513 	mg->mg_initialized = B_FALSE;
514 	mg->mg_no_free_space = B_TRUE;
515 	refcount_create_tracked(&mg->mg_alloc_queue_depth);
516 
517 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
518 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
519 
520 	return (mg);
521 }
522 
523 void
524 metaslab_group_destroy(metaslab_group_t *mg)
525 {
526 	ASSERT(mg->mg_prev == NULL);
527 	ASSERT(mg->mg_next == NULL);
528 	/*
529 	 * We may have gone below zero with the activation count
530 	 * either because we never activated in the first place or
531 	 * because we're done, and possibly removing the vdev.
532 	 */
533 	ASSERT(mg->mg_activation_count <= 0);
534 
535 	taskq_destroy(mg->mg_taskq);
536 	avl_destroy(&mg->mg_metaslab_tree);
537 	mutex_destroy(&mg->mg_lock);
538 	refcount_destroy(&mg->mg_alloc_queue_depth);
539 	kmem_free(mg, sizeof (metaslab_group_t));
540 }
541 
542 void
543 metaslab_group_activate(metaslab_group_t *mg)
544 {
545 	metaslab_class_t *mc = mg->mg_class;
546 	metaslab_group_t *mgprev, *mgnext;
547 
548 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
549 
550 	ASSERT(mc->mc_rotor != mg);
551 	ASSERT(mg->mg_prev == NULL);
552 	ASSERT(mg->mg_next == NULL);
553 	ASSERT(mg->mg_activation_count <= 0);
554 
555 	if (++mg->mg_activation_count <= 0)
556 		return;
557 
558 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
559 	metaslab_group_alloc_update(mg);
560 
561 	if ((mgprev = mc->mc_rotor) == NULL) {
562 		mg->mg_prev = mg;
563 		mg->mg_next = mg;
564 	} else {
565 		mgnext = mgprev->mg_next;
566 		mg->mg_prev = mgprev;
567 		mg->mg_next = mgnext;
568 		mgprev->mg_next = mg;
569 		mgnext->mg_prev = mg;
570 	}
571 	mc->mc_rotor = mg;
572 }
573 
574 void
575 metaslab_group_passivate(metaslab_group_t *mg)
576 {
577 	metaslab_class_t *mc = mg->mg_class;
578 	metaslab_group_t *mgprev, *mgnext;
579 
580 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
581 
582 	if (--mg->mg_activation_count != 0) {
583 		ASSERT(mc->mc_rotor != mg);
584 		ASSERT(mg->mg_prev == NULL);
585 		ASSERT(mg->mg_next == NULL);
586 		ASSERT(mg->mg_activation_count < 0);
587 		return;
588 	}
589 
590 	taskq_wait(mg->mg_taskq);
591 	metaslab_group_alloc_update(mg);
592 
593 	mgprev = mg->mg_prev;
594 	mgnext = mg->mg_next;
595 
596 	if (mg == mgnext) {
597 		mc->mc_rotor = NULL;
598 	} else {
599 		mc->mc_rotor = mgnext;
600 		mgprev->mg_next = mgnext;
601 		mgnext->mg_prev = mgprev;
602 	}
603 
604 	mg->mg_prev = NULL;
605 	mg->mg_next = NULL;
606 }
607 
608 boolean_t
609 metaslab_group_initialized(metaslab_group_t *mg)
610 {
611 	vdev_t *vd = mg->mg_vd;
612 	vdev_stat_t *vs = &vd->vdev_stat;
613 
614 	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
615 }
616 
617 uint64_t
618 metaslab_group_get_space(metaslab_group_t *mg)
619 {
620 	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
621 }
622 
623 void
624 metaslab_group_histogram_verify(metaslab_group_t *mg)
625 {
626 	uint64_t *mg_hist;
627 	vdev_t *vd = mg->mg_vd;
628 	uint64_t ashift = vd->vdev_ashift;
629 	int i;
630 
631 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
632 		return;
633 
634 	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
635 	    KM_SLEEP);
636 
637 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
638 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
639 
640 	for (int m = 0; m < vd->vdev_ms_count; m++) {
641 		metaslab_t *msp = vd->vdev_ms[m];
642 
643 		if (msp->ms_sm == NULL)
644 			continue;
645 
646 		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
647 			mg_hist[i + ashift] +=
648 			    msp->ms_sm->sm_phys->smp_histogram[i];
649 	}
650 
651 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
652 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
653 
654 	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
655 }
656 
657 static void
658 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
659 {
660 	metaslab_class_t *mc = mg->mg_class;
661 	uint64_t ashift = mg->mg_vd->vdev_ashift;
662 
663 	ASSERT(MUTEX_HELD(&msp->ms_lock));
664 	if (msp->ms_sm == NULL)
665 		return;
666 
667 	mutex_enter(&mg->mg_lock);
668 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
669 		mg->mg_histogram[i + ashift] +=
670 		    msp->ms_sm->sm_phys->smp_histogram[i];
671 		mc->mc_histogram[i + ashift] +=
672 		    msp->ms_sm->sm_phys->smp_histogram[i];
673 	}
674 	mutex_exit(&mg->mg_lock);
675 }
676 
677 void
678 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
679 {
680 	metaslab_class_t *mc = mg->mg_class;
681 	uint64_t ashift = mg->mg_vd->vdev_ashift;
682 
683 	ASSERT(MUTEX_HELD(&msp->ms_lock));
684 	if (msp->ms_sm == NULL)
685 		return;
686 
687 	mutex_enter(&mg->mg_lock);
688 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
689 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
690 		    msp->ms_sm->sm_phys->smp_histogram[i]);
691 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
692 		    msp->ms_sm->sm_phys->smp_histogram[i]);
693 
694 		mg->mg_histogram[i + ashift] -=
695 		    msp->ms_sm->sm_phys->smp_histogram[i];
696 		mc->mc_histogram[i + ashift] -=
697 		    msp->ms_sm->sm_phys->smp_histogram[i];
698 	}
699 	mutex_exit(&mg->mg_lock);
700 }
701 
702 static void
703 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
704 {
705 	ASSERT(msp->ms_group == NULL);
706 	mutex_enter(&mg->mg_lock);
707 	msp->ms_group = mg;
708 	msp->ms_weight = 0;
709 	avl_add(&mg->mg_metaslab_tree, msp);
710 	mutex_exit(&mg->mg_lock);
711 
712 	mutex_enter(&msp->ms_lock);
713 	metaslab_group_histogram_add(mg, msp);
714 	mutex_exit(&msp->ms_lock);
715 }
716 
717 static void
718 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
719 {
720 	mutex_enter(&msp->ms_lock);
721 	metaslab_group_histogram_remove(mg, msp);
722 	mutex_exit(&msp->ms_lock);
723 
724 	mutex_enter(&mg->mg_lock);
725 	ASSERT(msp->ms_group == mg);
726 	avl_remove(&mg->mg_metaslab_tree, msp);
727 	msp->ms_group = NULL;
728 	mutex_exit(&mg->mg_lock);
729 }
730 
731 static void
732 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
733 {
734 	/*
735 	 * Although in principle the weight can be any value, in
736 	 * practice we do not use values in the range [1, 511].
737 	 */
738 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
739 	ASSERT(MUTEX_HELD(&msp->ms_lock));
740 
741 	mutex_enter(&mg->mg_lock);
742 	ASSERT(msp->ms_group == mg);
743 	avl_remove(&mg->mg_metaslab_tree, msp);
744 	msp->ms_weight = weight;
745 	avl_add(&mg->mg_metaslab_tree, msp);
746 	mutex_exit(&mg->mg_lock);
747 }
748 
749 /*
750  * Calculate the fragmentation for a given metaslab group. We can use
751  * a simple average here since all metaslabs within the group must have
752  * the same size. The return value will be a value between 0 and 100
753  * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
754  * group have a fragmentation metric.
755  */
756 uint64_t
757 metaslab_group_fragmentation(metaslab_group_t *mg)
758 {
759 	vdev_t *vd = mg->mg_vd;
760 	uint64_t fragmentation = 0;
761 	uint64_t valid_ms = 0;
762 
763 	for (int m = 0; m < vd->vdev_ms_count; m++) {
764 		metaslab_t *msp = vd->vdev_ms[m];
765 
766 		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
767 			continue;
768 
769 		valid_ms++;
770 		fragmentation += msp->ms_fragmentation;
771 	}
772 
773 	if (valid_ms <= vd->vdev_ms_count / 2)
774 		return (ZFS_FRAG_INVALID);
775 
776 	fragmentation /= valid_ms;
777 	ASSERT3U(fragmentation, <=, 100);
778 	return (fragmentation);
779 }
780 
781 /*
782  * Determine if a given metaslab group should skip allocations. A metaslab
783  * group should avoid allocations if its free capacity is less than the
784  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
785  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
786  * that can still handle allocations. If the allocation throttle is enabled
787  * then we skip allocations to devices that have reached their maximum
788  * allocation queue depth unless the selected metaslab group is the only
789  * eligible group remaining.
790  */
791 static boolean_t
792 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
793     uint64_t psize)
794 {
795 	spa_t *spa = mg->mg_vd->vdev_spa;
796 	metaslab_class_t *mc = mg->mg_class;
797 
798 	/*
799 	 * We can only consider skipping this metaslab group if it's
800 	 * in the normal metaslab class and there are other metaslab
801 	 * groups to select from. Otherwise, we always consider it eligible
802 	 * for allocations.
803 	 */
804 	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
805 		return (B_TRUE);
806 
807 	/*
808 	 * If the metaslab group's mg_allocatable flag is set (see comments
809 	 * in metaslab_group_alloc_update() for more information) and
810 	 * the allocation throttle is disabled then allow allocations to this
811 	 * device. However, if the allocation throttle is enabled then
812 	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
813 	 * to determine if we should allow allocations to this metaslab group.
814 	 * If all metaslab groups are no longer considered allocatable
815 	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
816 	 * gang block size then we allow allocations on this metaslab group
817 	 * regardless of the mg_allocatable or throttle settings.
818 	 */
819 	if (mg->mg_allocatable) {
820 		metaslab_group_t *mgp;
821 		int64_t qdepth;
822 		uint64_t qmax = mg->mg_max_alloc_queue_depth;
823 
824 		if (!mc->mc_alloc_throttle_enabled)
825 			return (B_TRUE);
826 
827 		/*
828 		 * If this metaslab group does not have any free space, then
829 		 * there is no point in looking further.
830 		 */
831 		if (mg->mg_no_free_space)
832 			return (B_FALSE);
833 
834 		qdepth = refcount_count(&mg->mg_alloc_queue_depth);
835 
836 		/*
837 		 * If this metaslab group is below its qmax or it's
838 		 * the only allocatable metaslab group, then attempt
839 		 * to allocate from it.
840 		 */
841 		if (qdepth < qmax || mc->mc_alloc_groups == 1)
842 			return (B_TRUE);
843 		ASSERT3U(mc->mc_alloc_groups, >, 1);
844 
845 		/*
846 		 * Since this metaslab group is at or over its qmax, we
847 		 * need to determine if there are metaslab groups after this
848 		 * one that might be able to handle this allocation. This is
849 		 * racy since we can't hold the locks for all metaslab
850 		 * groups at the same time when we make this check.
851 		 */
852 		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
853 			qmax = mgp->mg_max_alloc_queue_depth;
854 
855 			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
856 
857 			/*
858 			 * If there is another metaslab group that
859 			 * might be able to handle the allocation, then
860 			 * we return false so that we skip this group.
861 			 */
862 			if (qdepth < qmax && !mgp->mg_no_free_space)
863 				return (B_FALSE);
864 		}
865 
866 		/*
867 		 * We didn't find another group to handle the allocation
868 		 * so we can't skip this metaslab group even though
869 		 * we are at or over our qmax.
870 		 */
871 		return (B_TRUE);
872 
873 	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
874 		return (B_TRUE);
875 	}
876 	return (B_FALSE);
877 }
878 
879 /*
880  * ==========================================================================
881  * Range tree callbacks
882  * ==========================================================================
883  */
884 
885 /*
886  * Comparison function for the private size-ordered tree. Tree is sorted
887  * by size, larger sizes at the end of the tree.
888  */
889 static int
890 metaslab_rangesize_compare(const void *x1, const void *x2)
891 {
892 	const range_seg_t *r1 = x1;
893 	const range_seg_t *r2 = x2;
894 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
895 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
896 
897 	if (rs_size1 < rs_size2)
898 		return (-1);
899 	if (rs_size1 > rs_size2)
900 		return (1);
901 
902 	if (r1->rs_start < r2->rs_start)
903 		return (-1);
904 
905 	if (r1->rs_start > r2->rs_start)
906 		return (1);
907 
908 	return (0);
909 }
910 
911 /*
912  * Create any block allocator specific components. The current allocators
913  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
914  */
915 static void
916 metaslab_rt_create(range_tree_t *rt, void *arg)
917 {
918 	metaslab_t *msp = arg;
919 
920 	ASSERT3P(rt->rt_arg, ==, msp);
921 	ASSERT(msp->ms_tree == NULL);
922 
923 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
924 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
925 }
926 
927 /*
928  * Destroy the block allocator specific components.
929  */
930 static void
931 metaslab_rt_destroy(range_tree_t *rt, void *arg)
932 {
933 	metaslab_t *msp = arg;
934 
935 	ASSERT3P(rt->rt_arg, ==, msp);
936 	ASSERT3P(msp->ms_tree, ==, rt);
937 	ASSERT0(avl_numnodes(&msp->ms_size_tree));
938 
939 	avl_destroy(&msp->ms_size_tree);
940 }
941 
942 static void
943 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
944 {
945 	metaslab_t *msp = arg;
946 
947 	ASSERT3P(rt->rt_arg, ==, msp);
948 	ASSERT3P(msp->ms_tree, ==, rt);
949 	VERIFY(!msp->ms_condensing);
950 	avl_add(&msp->ms_size_tree, rs);
951 }
952 
953 static void
954 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
955 {
956 	metaslab_t *msp = arg;
957 
958 	ASSERT3P(rt->rt_arg, ==, msp);
959 	ASSERT3P(msp->ms_tree, ==, rt);
960 	VERIFY(!msp->ms_condensing);
961 	avl_remove(&msp->ms_size_tree, rs);
962 }
963 
964 static void
965 metaslab_rt_vacate(range_tree_t *rt, void *arg)
966 {
967 	metaslab_t *msp = arg;
968 
969 	ASSERT3P(rt->rt_arg, ==, msp);
970 	ASSERT3P(msp->ms_tree, ==, rt);
971 
972 	/*
973 	 * Normally one would walk the tree freeing nodes along the way.
974 	 * Since the nodes are shared with the range trees we can avoid
975 	 * walking all nodes and just reinitialize the avl tree. The nodes
976 	 * will be freed by the range tree, so we don't want to free them here.
977 	 */
978 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
979 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
980 }
981 
982 static range_tree_ops_t metaslab_rt_ops = {
983 	metaslab_rt_create,
984 	metaslab_rt_destroy,
985 	metaslab_rt_add,
986 	metaslab_rt_remove,
987 	metaslab_rt_vacate
988 };
989 
990 /*
991  * ==========================================================================
992  * Metaslab block operations
993  * ==========================================================================
994  */
995 
996 /*
997  * Return the maximum contiguous segment within the metaslab.
998  */
999 uint64_t
1000 metaslab_block_maxsize(metaslab_t *msp)
1001 {
1002 	avl_tree_t *t = &msp->ms_size_tree;
1003 	range_seg_t *rs;
1004 
1005 	if (t == NULL || (rs = avl_last(t)) == NULL)
1006 		return (0ULL);
1007 
1008 	return (rs->rs_end - rs->rs_start);
1009 }
1010 
1011 uint64_t
1012 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
1013 {
1014 	uint64_t start;
1015 	range_tree_t *rt = msp->ms_tree;
1016 
1017 	VERIFY(!msp->ms_condensing);
1018 
1019 	start = msp->ms_ops->msop_alloc(msp, size);
1020 	if (start != -1ULL) {
1021 		vdev_t *vd = msp->ms_group->mg_vd;
1022 
1023 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
1024 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
1025 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
1026 		range_tree_remove(rt, start, size);
1027 	}
1028 	return (start);
1029 }
1030 
1031 /*
1032  * ==========================================================================
1033  * Common allocator routines
1034  * ==========================================================================
1035  */
1036 
1037 /*
1038  * This is a helper function that can be used by the allocator to find
1039  * a suitable block to allocate. This will search the specified AVL
1040  * tree looking for a block that matches the specified criteria.
1041  */
1042 static uint64_t
1043 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
1044     uint64_t align)
1045 {
1046 	range_seg_t *rs, rsearch;
1047 	avl_index_t where;
1048 
1049 	rsearch.rs_start = *cursor;
1050 	rsearch.rs_end = *cursor + size;
1051 
1052 	rs = avl_find(t, &rsearch, &where);
1053 	if (rs == NULL)
1054 		rs = avl_nearest(t, where, AVL_AFTER);
1055 
1056 	while (rs != NULL) {
1057 		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1058 
1059 		if (offset + size <= rs->rs_end) {
1060 			*cursor = offset + size;
1061 			return (offset);
1062 		}
1063 		rs = AVL_NEXT(t, rs);
1064 	}
1065 
1066 	/*
1067 	 * If we know we've searched the whole map (*cursor == 0), give up.
1068 	 * Otherwise, reset the cursor to the beginning and try again.
1069 	 */
1070 	if (*cursor == 0)
1071 		return (-1ULL);
1072 
1073 	*cursor = 0;
1074 	return (metaslab_block_picker(t, cursor, size, align));
1075 }
1076 
1077 /*
1078  * ==========================================================================
1079  * The first-fit block allocator
1080  * ==========================================================================
1081  */
1082 static uint64_t
1083 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1084 {
1085 	/*
1086 	 * Find the largest power of 2 block size that evenly divides the
1087 	 * requested size. This is used to try to allocate blocks with similar
1088 	 * alignment from the same area of the metaslab (i.e. same cursor
1089 	 * bucket), but it does not prevent allocations of other sizes from
1090 	 * being made in the same region.
1091 	 */
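	/*
	 * Worked example (illustrative): for a 24K request, align becomes
	 * 8K (the largest power of two dividing 24K), so the search starts
	 * at the cursor for the 8K bucket, i.e.
	 * ms_lbas[highbit64(8K) - 1] == ms_lbas[13].
	 */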
1092 	uint64_t align = size & -size;
1093 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1094 	avl_tree_t *t = &msp->ms_tree->rt_root;
1095 
1096 	return (metaslab_block_picker(t, cursor, size, align));
1097 }
1098 
1099 static metaslab_ops_t metaslab_ff_ops = {
1100 	metaslab_ff_alloc
1101 };
1102 
1103 /*
1104  * ==========================================================================
1105  * Dynamic block allocator -
1106  * Uses the first fit allocation scheme until space gets low and then
1107  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1108  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1109  * ==========================================================================
1110  */
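/*
 * For example, with the defaults defined above (metaslab_df_alloc_threshold
 * of SPA_OLD_MAXBLOCKSIZE, i.e. 128K, and metaslab_df_free_pct of 4), the
 * switch to best-fit happens once the largest free segment in the metaslab
 * drops below 128K or less than 4% of the metaslab remains free.
 */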
1111 static uint64_t
1112 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1113 {
1114 	/*
1115 	 * Find the largest power of 2 block size that evenly divides the
1116 	 * requested size. This is used to try to allocate blocks with similar
1117 	 * alignment from the same area of the metaslab (i.e. same cursor
1118 	 * bucket), but it does not prevent allocations of other sizes from
1119 	 * being made in the same region.
1120 	 */
1121 	uint64_t align = size & -size;
1122 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1123 	range_tree_t *rt = msp->ms_tree;
1124 	avl_tree_t *t = &rt->rt_root;
1125 	uint64_t max_size = metaslab_block_maxsize(msp);
1126 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1127 
1128 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1129 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1130 
1131 	if (max_size < size)
1132 		return (-1ULL);
1133 
1134 	/*
1135 	 * If we're running low on space switch to using the size
1136 	 * sorted AVL tree (best-fit).
1137 	 */
1138 	if (max_size < metaslab_df_alloc_threshold ||
1139 	    free_pct < metaslab_df_free_pct) {
1140 		t = &msp->ms_size_tree;
1141 		*cursor = 0;
1142 	}
1143 
1144 	return (metaslab_block_picker(t, cursor, size, 1ULL));
1145 }
1146 
1147 static metaslab_ops_t metaslab_df_ops = {
1148 	metaslab_df_alloc
1149 };
1150 
1151 /*
1152  * ==========================================================================
1153  * Cursor fit block allocator -
1154  * Select the largest region in the metaslab, set the cursor to the beginning
1155  * of the range and the cursor_end to the end of the range. As allocations
1156  * are made advance the cursor. Continue allocating from the cursor until
1157  * the range is exhausted and then find a new range.
1158  * ==========================================================================
1159  */
1160 static uint64_t
1161 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1162 {
1163 	range_tree_t *rt = msp->ms_tree;
1164 	avl_tree_t *t = &msp->ms_size_tree;
1165 	uint64_t *cursor = &msp->ms_lbas[0];
1166 	uint64_t *cursor_end = &msp->ms_lbas[1];
1167 	uint64_t offset = 0;
1168 
1169 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1170 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
1171 
1172 	ASSERT3U(*cursor_end, >=, *cursor);
1173 
1174 	if ((*cursor + size) > *cursor_end) {
1175 		range_seg_t *rs;
1176 
1177 		rs = avl_last(&msp->ms_size_tree);
1178 		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
1179 			return (-1ULL);
1180 
1181 		*cursor = rs->rs_start;
1182 		*cursor_end = rs->rs_end;
1183 	}
1184 
1185 	offset = *cursor;
1186 	*cursor += size;
1187 
1188 	return (offset);
1189 }
1190 
1191 static metaslab_ops_t metaslab_cf_ops = {
1192 	metaslab_cf_alloc
1193 };
1194 
1195 /*
1196  * ==========================================================================
1197  * New dynamic fit allocator -
1198  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1199  * contiguous blocks. If no region is found then just use the largest segment
1200  * that remains.
1201  * ==========================================================================
1202  */
1203 
1204 /*
1205  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1206  * to request from the allocator.
1207  */
1208 uint64_t metaslab_ndf_clump_shift = 4;
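/*
 * Example: with the default clump shift of 4, an 8K request
 * (highbit64(8K) == 14) makes metaslab_ndf_alloc() look for a free
 * region of at least MIN(max_size, 1 << (14 + 4)) = 256K, so that
 * later requests of the same size class can be satisfied from the
 * same clump.
 */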
1209 
1210 static uint64_t
1211 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1212 {
1213 	avl_tree_t *t = &msp->ms_tree->rt_root;
1214 	avl_index_t where;
1215 	range_seg_t *rs, rsearch;
1216 	uint64_t hbit = highbit64(size);
1217 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1218 	uint64_t max_size = metaslab_block_maxsize(msp);
1219 
1220 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1221 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1222 
1223 	if (max_size < size)
1224 		return (-1ULL);
1225 
1226 	rsearch.rs_start = *cursor;
1227 	rsearch.rs_end = *cursor + size;
1228 
1229 	rs = avl_find(t, &rsearch, &where);
1230 	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
1231 		t = &msp->ms_size_tree;
1232 
1233 		rsearch.rs_start = 0;
1234 		rsearch.rs_end = MIN(max_size,
1235 		    1ULL << (hbit + metaslab_ndf_clump_shift));
1236 		rs = avl_find(t, &rsearch, &where);
1237 		if (rs == NULL)
1238 			rs = avl_nearest(t, where, AVL_AFTER);
1239 		ASSERT(rs != NULL);
1240 	}
1241 
1242 	if ((rs->rs_end - rs->rs_start) >= size) {
1243 		*cursor = rs->rs_start + size;
1244 		return (rs->rs_start);
1245 	}
1246 	return (-1ULL);
1247 }
1248 
1249 static metaslab_ops_t metaslab_ndf_ops = {
1250 	metaslab_ndf_alloc
1251 };
1252 
1253 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1254 
1255 /*
1256  * ==========================================================================
1257  * Metaslabs
1258  * ==========================================================================
1259  */
1260 
1261 /*
1262  * Wait for any in-progress metaslab loads to complete.
1263  */
1264 void
1265 metaslab_load_wait(metaslab_t *msp)
1266 {
1267 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1268 
1269 	while (msp->ms_loading) {
1270 		ASSERT(!msp->ms_loaded);
1271 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1272 	}
1273 }
1274 
1275 int
1276 metaslab_load(metaslab_t *msp)
1277 {
1278 	int error = 0;
1279 
1280 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1281 	ASSERT(!msp->ms_loaded);
1282 	ASSERT(!msp->ms_loading);
1283 
1284 	msp->ms_loading = B_TRUE;
1285 
1286 	/*
1287 	 * If the space map has not been allocated yet, then treat
1288 	 * all the space in the metaslab as free and add it to the
1289 	 * ms_tree.
1290 	 */
1291 	if (msp->ms_sm != NULL)
1292 		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
1293 	else
1294 		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
1295 
1296 	msp->ms_loaded = (error == 0);
1297 	msp->ms_loading = B_FALSE;
1298 
1299 	if (msp->ms_loaded) {
1300 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1301 			range_tree_walk(msp->ms_defertree[t],
1302 			    range_tree_remove, msp->ms_tree);
1303 		}
1304 	}
1305 	cv_broadcast(&msp->ms_load_cv);
1306 	return (error);
1307 }
1308 
1309 void
1310 metaslab_unload(metaslab_t *msp)
1311 {
1312 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1313 	range_tree_vacate(msp->ms_tree, NULL, NULL);
1314 	msp->ms_loaded = B_FALSE;
1315 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1316 }
1317 
1318 int
1319 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1320     metaslab_t **msp)
1321 {
1322 	vdev_t *vd = mg->mg_vd;
1323 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
1324 	metaslab_t *ms;
1325 	int error;
1326 
1327 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1328 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1329 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1330 	ms->ms_id = id;
1331 	ms->ms_start = id << vd->vdev_ms_shift;
1332 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
1333 
1334 	/*
1335 	 * We only open space map objects that already exist. All others
1336 	 * will be opened when we finally allocate an object for them.
1337 	 */
1338 	if (object != 0) {
1339 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1340 		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1341 
1342 		if (error != 0) {
1343 			kmem_free(ms, sizeof (metaslab_t));
1344 			return (error);
1345 		}
1346 
1347 		ASSERT(ms->ms_sm != NULL);
1348 	}
1349 
1350 	/*
1351 	 * We create the main range tree here, but we don't create the
1352 	 * alloctree and freetree until metaslab_sync_done().  This serves
1353 	 * two purposes: it allows metaslab_sync_done() to detect the
1354 	 * addition of new space; and for debugging, it ensures that we'd
1355 	 * data fault on any attempt to use this metaslab before it's ready.
1356 	 */
1357 	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1358 	metaslab_group_add(mg, ms);
1359 
1360 	ms->ms_fragmentation = metaslab_fragmentation(ms);
1361 	ms->ms_ops = mg->mg_class->mc_ops;
1362 
1363 	/*
1364 	 * If we're opening an existing pool (txg == 0) or creating
1365 	 * a new one (txg == TXG_INITIAL), all space is available now.
1366 	 * If we're adding space to an existing pool, the new space
1367 	 * does not become available until after this txg has synced.
1368 	 */
1369 	if (txg <= TXG_INITIAL)
1370 		metaslab_sync_done(ms, 0);
1371 
1372 	/*
1373 	 * If metaslab_debug_load is set and we're initializing a metaslab
1374 	 * that has an allocated space_map object then load its space
1375 	 * map so that we can verify frees.
1376 	 */
1377 	if (metaslab_debug_load && ms->ms_sm != NULL) {
1378 		mutex_enter(&ms->ms_lock);
1379 		VERIFY0(metaslab_load(ms));
1380 		mutex_exit(&ms->ms_lock);
1381 	}
1382 
1383 	if (txg != 0) {
1384 		vdev_dirty(vd, 0, NULL, txg);
1385 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
1386 	}
1387 
1388 	*msp = ms;
1389 
1390 	return (0);
1391 }
1392 
1393 void
1394 metaslab_fini(metaslab_t *msp)
1395 {
1396 	metaslab_group_t *mg = msp->ms_group;
1397 
1398 	metaslab_group_remove(mg, msp);
1399 
1400 	mutex_enter(&msp->ms_lock);
1401 
1402 	VERIFY(msp->ms_group == NULL);
1403 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1404 	    0, -msp->ms_size);
1405 	space_map_close(msp->ms_sm);
1406 
1407 	metaslab_unload(msp);
1408 	range_tree_destroy(msp->ms_tree);
1409 
1410 	for (int t = 0; t < TXG_SIZE; t++) {
1411 		range_tree_destroy(msp->ms_alloctree[t]);
1412 		range_tree_destroy(msp->ms_freetree[t]);
1413 	}
1414 
1415 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1416 		range_tree_destroy(msp->ms_defertree[t]);
1417 	}
1418 
1419 	ASSERT0(msp->ms_deferspace);
1420 
1421 	mutex_exit(&msp->ms_lock);
1422 	cv_destroy(&msp->ms_load_cv);
1423 	mutex_destroy(&msp->ms_lock);
1424 
1425 	kmem_free(msp, sizeof (metaslab_t));
1426 }
1427 
1428 #define	FRAGMENTATION_TABLE_SIZE	17
1429 
1430 /*
1431  * This table defines a segment size based fragmentation metric that will
1432  * allow each metaslab to derive its own fragmentation value. This is done
1433  * by calculating the space in each bucket of the spacemap histogram and
1434  * multiplying that by the fragmentation metric in this table. Doing
1435  * this for all buckets and dividing it by the total amount of free
1436  * space in this metaslab (i.e. the total free space in all buckets) gives
1437  * us the fragmentation metric. This means that a high fragmentation metric
1438  * equates to most of the free space being comprised of small segments.
1439  * Conversely, if the metric is low, then most of the free space is in
1440  * large segments. A 10% change in fragmentation equates to approximately
1441  * double the number of segments.
1442  *
1443  * This table defines 0% fragmented space using 16MB segments. Testing has
1444  * shown that segments that are greater than or equal to 16MB do not suffer
1445  * from drastic performance problems. Using this value, we derive the rest
1446  * of the table. Since the fragmentation value is never stored on disk, it
1447  * is possible to change these calculations in the future.
1448  */
1449 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1450 	100,	/* 512B	*/
1451 	100,	/* 1K	*/
1452 	98,	/* 2K	*/
1453 	95,	/* 4K	*/
1454 	90,	/* 8K	*/
1455 	80,	/* 16K	*/
1456 	70,	/* 32K	*/
1457 	60,	/* 64K	*/
1458 	50,	/* 128K	*/
1459 	40,	/* 256K	*/
1460 	30,	/* 512K	*/
1461 	20,	/* 1M	*/
1462 	15,	/* 2M	*/
1463 	10,	/* 4M	*/
1464 	5,	/* 8M	*/
1465 	0	/* 16M	*/
1466 };
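/*
 * Worked example (illustrative): a metaslab whose free space consists of
 * 512MB in 1MB segments (factor 20) and 512MB in 128K segments (factor 50)
 * gets a fragmentation metric of (512M * 20 + 512M * 50) / 1G = 35.
 */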
1467 
1468 /*
1469  * Calculate the metaslab's fragmentation metric. A return value
1470  * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1471  * not support this metric. Otherwise, the return value should be in the
1472  * range [0, 100].
1473  */
1474 static uint64_t
1475 metaslab_fragmentation(metaslab_t *msp)
1476 {
1477 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1478 	uint64_t fragmentation = 0;
1479 	uint64_t total = 0;
1480 	boolean_t feature_enabled = spa_feature_is_enabled(spa,
1481 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
1482 
1483 	if (!feature_enabled)
1484 		return (ZFS_FRAG_INVALID);
1485 
1486 	/*
1487 	 * A null space map means that the entire metaslab is free
1488 	 * and thus is not fragmented.
1489 	 */
1490 	if (msp->ms_sm == NULL)
1491 		return (0);
1492 
1493 	/*
1494 	 * If this metaslab's space_map has not been upgraded, flag it
1495 	 * so that we upgrade next time we encounter it.
1496 	 */
1497 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1498 		uint64_t txg = spa_syncing_txg(spa);
1499 		vdev_t *vd = msp->ms_group->mg_vd;
1500 
1501 		if (spa_writeable(spa)) {
1502 			msp->ms_condense_wanted = B_TRUE;
1503 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1504 			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1505 			    "msp %p, vd %p", txg, msp, vd);
1506 		}
1507 		return (ZFS_FRAG_INVALID);
1508 	}
1509 
1510 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1511 		uint64_t space = 0;
1512 		uint8_t shift = msp->ms_sm->sm_shift;
1513 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1514 		    FRAGMENTATION_TABLE_SIZE - 1);
1515 
1516 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1517 			continue;
1518 
1519 		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1520 		total += space;
1521 
1522 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1523 		fragmentation += space * zfs_frag_table[idx];
1524 	}
1525 
1526 	if (total > 0)
1527 		fragmentation /= total;
1528 	ASSERT3U(fragmentation, <=, 100);
1529 	return (fragmentation);
1530 }
1531 
1532 /*
1533  * Compute a weight -- a selection preference value -- for the given metaslab.
1534  * This is based on the amount of free space, the level of fragmentation,
1535  * the LBA range, and whether the metaslab is loaded.
1536  */
1537 static uint64_t
1538 metaslab_weight(metaslab_t *msp)
1539 {
1540 	metaslab_group_t *mg = msp->ms_group;
1541 	vdev_t *vd = mg->mg_vd;
1542 	uint64_t weight, space;
1543 
1544 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1545 
1546 	/*
1547 	 * This vdev is in the process of being removed so there is nothing
1548 	 * for us to do here.
1549 	 */
1550 	if (vd->vdev_removing) {
1551 		ASSERT0(space_map_allocated(msp->ms_sm));
1552 		ASSERT0(vd->vdev_ms_shift);
1553 		return (0);
1554 	}
1555 
1556 	/*
1557 	 * The baseline weight is the metaslab's free space.
1558 	 */
1559 	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1560 
1561 	msp->ms_fragmentation = metaslab_fragmentation(msp);
1562 	if (metaslab_fragmentation_factor_enabled &&
1563 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1564 		/*
1565 		 * Use the fragmentation information to inversely scale
1566 		 * down the baseline weight. We need to ensure that we
1567 		 * don't exclude this metaslab completely when it's 100%
1568 		 * fragmented. To avoid this we reduce the fragmented value
1569 		 * by 1.
1570 		 */
1571 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1572 
1573 		/*
1574 		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1575 		 * this metaslab again. The fragmentation metric may have
1576 		 * decreased the space to something smaller than
1577 		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1578 		 * so that we can consume any remaining space.
1579 		 */
1580 		if (space > 0 && space < SPA_MINBLOCKSIZE)
1581 			space = SPA_MINBLOCKSIZE;
1582 	}
1583 	weight = space;
1584 
1585 	/*
1586 	 * Modern disks have uniform bit density and constant angular velocity.
1587 	 * Therefore, the outer recording zones are faster (higher bandwidth)
1588 	 * than the inner zones by the ratio of outer to inner track diameter,
1589 	 * which is typically around 2:1.  We account for this by assigning
1590 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1591 	 * In effect, this means that we'll select the metaslab with the most
1592 	 * free bandwidth rather than simply the one with the most free space.
1593 	 */
1594 	if (metaslab_lba_weighting_enabled) {
1595 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1596 		ASSERT(weight >= space && weight <= 2 * space);
1597 	}
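
	/*
	 * Worked example (illustrative): on a vdev with 200 metaslabs, a
	 * metaslab with 10GB free is weighted at 20GB if it is metaslab 0
	 * (outermost), 15GB if it is metaslab 100, and roughly 10GB if it
	 * is metaslab 199 (innermost).
	 */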
1598 
1599 	/*
1600 	 * If this metaslab is one we're actively using, adjust its
1601 	 * weight to make it preferable to any inactive metaslab so
1602 	 * we'll polish it off. If the fragmentation on this metaslab
1603 	 * has exceeded our threshold, then don't mark it active.
1604 	 */
1605 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
1606 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
1607 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1608 	}
1609 
1610 	return (weight);
1611 }
1612 
1613 static int
1614 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1615 {
1616 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1617 
1618 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1619 		metaslab_load_wait(msp);
1620 		if (!msp->ms_loaded) {
1621 			int error = metaslab_load(msp);
1622 			if (error) {
1623 				metaslab_group_sort(msp->ms_group, msp, 0);
1624 				return (error);
1625 			}
1626 		}
1627 
1628 		metaslab_group_sort(msp->ms_group, msp,
1629 		    msp->ms_weight | activation_weight);
1630 	}
1631 	ASSERT(msp->ms_loaded);
1632 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1633 
1634 	return (0);
1635 }
1636 
1637 static void
1638 metaslab_passivate(metaslab_t *msp, uint64_t size)
1639 {
1640 	/*
1641 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1642 	 * this metaslab again.  In that case, it had better be empty,
1643 	 * or we would be leaving space on the table.
1644 	 */
1645 	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1646 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1647 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1648 }
1649 
1650 static void
1651 metaslab_preload(void *arg)
1652 {
1653 	metaslab_t *msp = arg;
1654 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1655 
1656 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1657 
1658 	mutex_enter(&msp->ms_lock);
1659 	metaslab_load_wait(msp);
1660 	if (!msp->ms_loaded)
1661 		(void) metaslab_load(msp);
1662 
1663 	/*
1664 	 * Set the ms_access_txg value so that we don't unload it right away.
1665 	 */
1666 	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1667 	mutex_exit(&msp->ms_lock);
1668 }
1669 
1670 static void
1671 metaslab_group_preload(metaslab_group_t *mg)
1672 {
1673 	spa_t *spa = mg->mg_vd->vdev_spa;
1674 	metaslab_t *msp;
1675 	avl_tree_t *t = &mg->mg_metaslab_tree;
1676 	int m = 0;
1677 
1678 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1679 		taskq_wait(mg->mg_taskq);
1680 		return;
1681 	}
1682 
1683 	mutex_enter(&mg->mg_lock);
1684 	/*
1685 	 * Load the next potential metaslabs
1686 	 */
1687 	msp = avl_first(t);
1688 	while (msp != NULL) {
1689 		metaslab_t *msp_next = AVL_NEXT(t, msp);
1690 
1691 		/*
1692 		 * We preload only the maximum number of metaslabs specified
1693 		 * by metaslab_preload_limit. If a metaslab is being forced
1694 		 * to condense then we preload it too. This will ensure
1695 		 * that force condensing happens in the next txg.
1696 		 */
1697 		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1698 			msp = msp_next;
1699 			continue;
1700 		}
1701 
1702 		/*
1703 		 * We must drop the metaslab group lock here to preserve
1704 		 * lock ordering with the ms_lock (when grabbing both
1705 		 * the mg_lock and the ms_lock, the ms_lock must be taken
1706 		 * first).  As a result, it is possible that the ordering
1707 		 * of the metaslabs within the avl tree may change before
1708 		 * we reacquire the lock. The metaslab cannot be removed from
1709 		 * the tree while we're in syncing context so it is safe to
1710 		 * drop the mg_lock here. If the metaslabs are reordered
1711 		 * nothing will break -- we just may end up loading a
1712 		 * less than optimal one.
1713 		 */
1714 		mutex_exit(&mg->mg_lock);
1715 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1716 		    msp, TQ_SLEEP) != NULL);
1717 		mutex_enter(&mg->mg_lock);
1718 		msp = msp_next;
1719 	}
1720 	mutex_exit(&mg->mg_lock);
1721 }
1722 
1723 /*
1724  * Determine if the space map's on-disk footprint is past our tolerance
1725  * for inefficiency. We would like to use the following criteria to make
1726  * our decision:
1727  *
1728  * 1. The size of the space map object should not dramatically increase as a
1729  * result of writing out the free space range tree.
1730  *
1731  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1732  * times the size of the free space range tree representation
1733  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1734  *
1735  * 3. The on-disk size of the space map should actually decrease.
1736  *
1737  * Checking the first condition is tricky since we don't want to walk
1738  * the entire AVL tree calculating the estimated on-disk size. Instead we
1739  * use the size-ordered range tree in the metaslab and calculate the
1740  * size required to write out the largest segment in our free tree. If the
1741  * size required to represent that segment on disk is larger than the space
1742  * map object then we avoid condensing this map.
1743  *
1744  * To determine the second criterion we use a best-case estimate and assume
1745  * each segment can be represented on-disk as a single 64-bit entry. We refer
1746  * to this best-case estimate as the space map's minimal form.
1747  *
1748  * Unfortunately, we cannot compute the on-disk size of the space map in this
1749  * context because we cannot accurately compute the effects of compression, etc.
1750  * Instead, we apply the heuristic described in the block comment for
1751  * zfs_metaslab_condense_block_threshold - we only condense if the space used
1752  * is greater than a threshold number of blocks.
1753  */
1754 static boolean_t
1755 metaslab_should_condense(metaslab_t *msp)
1756 {
1757 	space_map_t *sm = msp->ms_sm;
1758 	range_seg_t *rs;
1759 	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1760 	dmu_object_info_t doi;
1761 	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1762 
1763 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1764 	ASSERT(msp->ms_loaded);
1765 
1766 	/*
1767 	 * Use the ms_size_tree range tree, which is ordered by size, to
1768 	 * obtain the largest segment in the free tree. We always condense
1769 	 * metaslabs that are empty and metaslabs for which a condense
1770 	 * request has been made.
1771 	 */
1772 	rs = avl_last(&msp->ms_size_tree);
1773 	if (rs == NULL || msp->ms_condense_wanted)
1774 		return (B_TRUE);
1775 
1776 	/*
1777 	 * Calculate the number of 64-bit entries this segment would
1778 	 * require when written to disk. If this single segment would be
1779 	 * larger on-disk than the entire current on-disk structure, then
1780 	 * clearly condensing will increase the on-disk structure size.
1781 	 */
1782 	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1783 	entries = size / (MIN(size, SM_RUN_MAX));
1784 	segsz = entries * sizeof (uint64_t);
1785 
1786 	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1787 	object_size = space_map_length(msp->ms_sm);
1788 
1789 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
1790 	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1791 
1792 	return (segsz <= object_size &&
1793 	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
1794 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
1795 }
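
/*
 * Worked example of the heuristic above (every figure here is an
 * illustrative assumption, not a measurement): suppose the loaded free
 * tree holds 10,000 segments, so its minimal form is 10,000 * 8B = 80KB;
 * the on-disk space map (object_size) is 200KB; the largest free segment
 * fits in a single 64-bit entry (segsz = 8B); and record_size is 4KB.
 * With the defaults zfs_condense_pct = 200 and
 * zfs_metaslab_condense_block_threshold = 4:
 *
 *	segsz (8B) <= object_size (200KB)			passes
 *	object_size (200KB) >= 80KB * 200 / 100 = 160KB		passes
 *	object_size (200KB) > 4 * 4KB = 16KB			passes
 *
 * so metaslab_should_condense() returns B_TRUE.
 */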
1796 
1797 /*
1798  * Condense the on-disk space map representation to its minimized form.
1799  * The minimized form consists of a small number of allocations followed by
1800  * the entries of the free range tree.
1801  */
1802 static void
1803 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1804 {
1805 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1806 	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1807 	range_tree_t *condense_tree;
1808 	space_map_t *sm = msp->ms_sm;
1809 
1810 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1811 	ASSERT3U(spa_sync_pass(spa), ==, 1);
1812 	ASSERT(msp->ms_loaded);
1813 
1814 
1815 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
1816 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
1817 	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
1818 	    msp->ms_group->mg_vd->vdev_spa->spa_name,
1819 	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
1820 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
1821 
1822 	msp->ms_condense_wanted = B_FALSE;
1823 
1824 	/*
1825 	 * Create a range tree that is 100% allocated. We remove segments
1826 	 * that have been freed in this txg, any deferred frees that exist,
1827 	 * and any allocations in future txgs. Removing segments should be
1828 	 * a relatively inexpensive operation since we expect these trees to
1829 	 * have a small number of nodes.
1830 	 */
1831 	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1832 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1833 
1834 	/*
1835 	 * Remove what's been freed in this txg from the condense_tree.
1836 	 * Since we're in sync_pass 1, we know that all the frees from
1837 	 * this txg are in the freetree.
1838 	 */
1839 	range_tree_walk(freetree, range_tree_remove, condense_tree);
1840 
1841 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1842 		range_tree_walk(msp->ms_defertree[t],
1843 		    range_tree_remove, condense_tree);
1844 	}
1845 
1846 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1847 		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1848 		    range_tree_remove, condense_tree);
1849 	}
1850 
1851 	/*
1852 	 * We're about to drop the metaslab's lock, thus allowing
1853 	 * other consumers to change its content. Set the
1854 	 * metaslab's ms_condensing flag to ensure that
1855 	 * allocations on this metaslab do not occur while we're
1856 	 * in the middle of committing it to disk. This is only critical
1857 	 * for the ms_tree, as all other range trees use per-txg
1858 	 * views of their content.
1859 	 */
1860 	msp->ms_condensing = B_TRUE;
1861 
1862 	mutex_exit(&msp->ms_lock);
1863 	space_map_truncate(sm, tx);
1864 	mutex_enter(&msp->ms_lock);
1865 
1866 	/*
1867 	 * While we would ideally like to create a space_map representation
1868 	 * that consists only of allocation records, doing so can be
1869 	 * prohibitively expensive because the in-core free tree can be
1870 	 * large, and therefore computationally expensive to subtract
1871 	 * from the condense_tree. Instead we sync out two trees, a cheap
1872 	 * allocation only tree followed by the in-core free tree. While not
1873 	 * optimal, this is typically close to optimal, and much cheaper to
1874 	 * compute.
1875 	 */
1876 	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1877 	range_tree_vacate(condense_tree, NULL, NULL);
1878 	range_tree_destroy(condense_tree);
1879 
1880 	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1881 	msp->ms_condensing = B_FALSE;
1882 }
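
/*
 * A minimal sketch of what the condensed representation looks like
 * (assumed values): with ms_start = 0 and ms_size = 1GB, a freetree for
 * this txg of [100MB, 101MB), a single deferred free of [200MB, 201MB),
 * and empty future alloctrees, condense_tree ends up as
 *
 *	[0, 100MB) + [101MB, 200MB) + [201MB, 1GB)
 *
 * Those segments are written as SM_ALLOC records (very long runs may be
 * split across several entries), followed by one SM_FREE record per
 * segment of the in-core ms_tree.  Replaying the rewritten space map
 * therefore reconstructs the metaslab's state without the accumulated
 * alloc/free history that the old object carried.
 */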
1883 
1884 /*
1885  * Write a metaslab to disk in the context of the specified transaction group.
1886  */
1887 void
1888 metaslab_sync(metaslab_t *msp, uint64_t txg)
1889 {
1890 	metaslab_group_t *mg = msp->ms_group;
1891 	vdev_t *vd = mg->mg_vd;
1892 	spa_t *spa = vd->vdev_spa;
1893 	objset_t *mos = spa_meta_objset(spa);
1894 	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1895 	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1896 	range_tree_t **freed_tree =
1897 	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1898 	dmu_tx_t *tx;
1899 	uint64_t object = space_map_object(msp->ms_sm);
1900 
1901 	ASSERT(!vd->vdev_ishole);
1902 
1903 	/*
1904 	 * This metaslab has just been added so there's no work to do now.
1905 	 */
1906 	if (*freetree == NULL) {
1907 		ASSERT3P(alloctree, ==, NULL);
1908 		return;
1909 	}
1910 
1911 	ASSERT3P(alloctree, !=, NULL);
1912 	ASSERT3P(*freetree, !=, NULL);
1913 	ASSERT3P(*freed_tree, !=, NULL);
1914 
1915 	/*
1916 	 * Normally, we don't want to process a metaslab if there
1917 	 * are no allocations or frees to perform. However, if the metaslab
1918 	 * is being forced to condense we need to let it through.
1919 	 */
1920 	if (range_tree_space(alloctree) == 0 &&
1921 	    range_tree_space(*freetree) == 0 &&
1922 	    !msp->ms_condense_wanted)
1923 		return;
1924 
1925 	/*
1926 	 * The only state that can actually be changing concurrently with
1927 	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1928 	 * be modifying this txg's alloctree, freetree, freed_tree, or
1929 	 * space_map_phys_t. Therefore, we only hold ms_lock to satify
1930 	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1931 	 * because the DMU can call down to us (e.g. via zio_free()) at
1932 	 * any time.
1933 	 */
1934 
1935 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1936 
1937 	if (msp->ms_sm == NULL) {
1938 		uint64_t new_object;
1939 
1940 		new_object = space_map_alloc(mos, tx);
1941 		VERIFY3U(new_object, !=, 0);
1942 
1943 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1944 		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1945 		    &msp->ms_lock));
1946 		ASSERT(msp->ms_sm != NULL);
1947 	}
1948 
1949 	mutex_enter(&msp->ms_lock);
1950 
1951 	/*
1952 	 * Note: metaslab_condense() clears the space_map's histogram.
1953 	 * Therefore we must verify and remove this histogram before
1954 	 * condensing.
1955 	 */
1956 	metaslab_group_histogram_verify(mg);
1957 	metaslab_class_histogram_verify(mg->mg_class);
1958 	metaslab_group_histogram_remove(mg, msp);
1959 
1960 	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1961 	    metaslab_should_condense(msp)) {
1962 		metaslab_condense(msp, txg, tx);
1963 	} else {
1964 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1965 		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1966 	}
1967 
1968 	if (msp->ms_loaded) {
1969 		/*
1970 		 * When the space map is loaded, we have an accurate
1971 		 * histogram in the range tree. This gives us an opportunity
1972 		 * to bring the space map's histogram up-to-date, so we clear
1973 		 * it before updating it.
1974 		 */
1975 		space_map_histogram_clear(msp->ms_sm);
1976 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1977 	} else {
1978 		/*
1979 		 * Since the space map is not loaded, we simply update the
1980 		 * existing histogram with what was freed in this txg. This
1981 		 * means that the on-disk histogram may not have an accurate
1982 		 * view of the free space but it's close enough to allow
1983 		 * us to make allocation decisions.
1984 		 */
1985 		space_map_histogram_add(msp->ms_sm, *freetree, tx);
1986 	}
1987 	metaslab_group_histogram_add(mg, msp);
1988 	metaslab_group_histogram_verify(mg);
1989 	metaslab_class_histogram_verify(mg->mg_class);
1990 
1991 	/*
1992 	 * For sync pass 1, we avoid traversing this txg's free range tree
1993 	 * and instead just swap the pointers for freetree and
1994 	 * freed_tree. We can safely do this since the freed_tree is
1995 	 * guaranteed to be empty on the initial pass.
1996 	 */
1997 	if (spa_sync_pass(spa) == 1) {
1998 		range_tree_swap(freetree, freed_tree);
1999 	} else {
2000 		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
2001 	}
2002 	range_tree_vacate(alloctree, NULL, NULL);
2003 
2004 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2005 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2006 
2007 	mutex_exit(&msp->ms_lock);
2008 
2009 	if (object != space_map_object(msp->ms_sm)) {
2010 		object = space_map_object(msp->ms_sm);
2011 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2012 		    msp->ms_id, sizeof (uint64_t), &object, tx);
2013 	}
2014 	dmu_tx_commit(tx);
2015 }
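
/*
 * Indexing sketch for the pass-1 swap above, assuming the stock
 * TXG_SIZE of 4 (TXG_MASK == 3) and TXG_CLEAN(txg) == txg - 1: for
 * txg 100, alloctree and freetree use slot 100 & 3 == 0 while
 * freed_tree uses slot 99 & 3 == 3.  Sync pass 1 simply exchanges the
 * pointers in slots 0 and 3; frees arriving during later passes of the
 * same txg accumulate in slot 0 again and are vacated into the
 * freed_tree rather than swapped.
 */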
2016 
2017 /*
2018  * Called after a transaction group has completely synced to mark
2019  * all of the metaslab's free space as usable.
2020  */
2021 void
2022 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2023 {
2024 	metaslab_group_t *mg = msp->ms_group;
2025 	vdev_t *vd = mg->mg_vd;
2026 	range_tree_t **freed_tree;
2027 	range_tree_t **defer_tree;
2028 	int64_t alloc_delta, defer_delta;
2029 
2030 	ASSERT(!vd->vdev_ishole);
2031 
2032 	mutex_enter(&msp->ms_lock);
2033 
2034 	/*
2035 	 * If this metaslab is just becoming available, initialize its
2036 	 * alloctrees, freetrees, and defertrees, and add its capacity to
2037 	 * the vdev.
2038 	 */
2039 	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
2040 		for (int t = 0; t < TXG_SIZE; t++) {
2041 			ASSERT(msp->ms_alloctree[t] == NULL);
2042 			ASSERT(msp->ms_freetree[t] == NULL);
2043 
2044 			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2045 			    &msp->ms_lock);
2046 			msp->ms_freetree[t] = range_tree_create(NULL, msp,
2047 			    &msp->ms_lock);
2048 		}
2049 
2050 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2051 			ASSERT(msp->ms_defertree[t] == NULL);
2052 
2053 			msp->ms_defertree[t] = range_tree_create(NULL, msp,
2054 			    &msp->ms_lock);
2055 		}
2056 
2057 		vdev_space_update(vd, 0, 0, msp->ms_size);
2058 	}
2059 
2060 	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2061 	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2062 
2063 	alloc_delta = space_map_alloc_delta(msp->ms_sm);
2064 	defer_delta = range_tree_space(*freed_tree) -
2065 	    range_tree_space(*defer_tree);
2066 
2067 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2068 
2069 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2070 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2071 
2072 	/*
2073 	 * If there's a metaslab_load() in progress, wait for it to complete
2074 	 * so that we have a consistent view of the in-core space map.
2075 	 */
2076 	metaslab_load_wait(msp);
2077 
2078 	/*
2079 	 * Move the frees from the defer_tree back to the free
2080 	 * range tree (if it's loaded). Swap the freed_tree and the
2081 	 * defer_tree -- this is safe to do because we've just emptied out
2082 	 * the defer_tree.
2083 	 */
2084 	range_tree_vacate(*defer_tree,
2085 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
2086 	range_tree_swap(freed_tree, defer_tree);
2087 
2088 	space_map_update(msp->ms_sm);
2089 
2090 	msp->ms_deferspace += defer_delta;
2091 	ASSERT3S(msp->ms_deferspace, >=, 0);
2092 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2093 	if (msp->ms_deferspace != 0) {
2094 		/*
2095 		 * Keep syncing this metaslab until all deferred frees
2096 		 * are back in circulation.
2097 		 */
2098 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2099 	}
2100 
2101 	if (msp->ms_loaded && msp->ms_access_txg < txg) {
2102 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2103 			VERIFY0(range_tree_space(
2104 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
2105 		}
2106 
2107 		if (!metaslab_debug_unload)
2108 			metaslab_unload(msp);
2109 	}
2110 
2111 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
2112 	mutex_exit(&msp->ms_lock);
2113 }
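
/*
 * Timeline sketch for the defer machinery above, assuming the stock
 * TXG_DEFER_SIZE of 2: space freed in txg N is swapped from the
 * freed_tree into ms_defertree[N % 2] when txg N finishes syncing.
 * That same slot is selected again in txg N + 2, whose sync_done
 * vacates it back into ms_tree (if the metaslab is loaded) before
 * reusing it.  Blocks freed in txg N therefore only become allocatable
 * again once txg N + 2 has synced, which keeps recently freed space
 * from being reused too quickly.
 */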
2114 
2115 void
2116 metaslab_sync_reassess(metaslab_group_t *mg)
2117 {
2118 	metaslab_group_alloc_update(mg);
2119 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2120 
2121 	/*
2122 	 * Preload the next potential metaslabs
2123 	 */
2124 	metaslab_group_preload(mg);
2125 }
2126 
2127 static uint64_t
2128 metaslab_distance(metaslab_t *msp, dva_t *dva)
2129 {
2130 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2131 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2132 	uint64_t start = msp->ms_id;
2133 
2134 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2135 		return (1ULL << 63);
2136 
2137 	if (offset < start)
2138 		return ((start - offset) << ms_shift);
2139 	if (offset > start)
2140 		return ((offset - start) << ms_shift);
2141 	return (0);
2142 }
2143 
2144 /*
2145  * ==========================================================================
2146  * Metaslab block operations
2147  * ==========================================================================
2148  */
2149 
2150 static void
2151 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2152 {
2153 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2154 	    flags & METASLAB_DONT_THROTTLE)
2155 		return;
2156 
2157 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2158 	if (!mg->mg_class->mc_alloc_throttle_enabled)
2159 		return;
2160 
2161 	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2162 }
2163 
2164 void
2165 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2166 {
2167 	if (!(flags & METASLAB_ASYNC_ALLOC) ||
2168 	    flags & METASLAB_DONT_THROTTLE)
2169 		return;
2170 
2171 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2172 	if (!mg->mg_class->mc_alloc_throttle_enabled)
2173 		return;
2174 
2175 	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2176 }
2177 
2178 void
2179 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
2180 {
2181 #ifdef ZFS_DEBUG
2182 	const dva_t *dva = bp->blk_dva;
2183 	int ndvas = BP_GET_NDVAS(bp);
2184 
2185 	for (int d = 0; d < ndvas; d++) {
2186 		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
2187 		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2188 		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
2189 	}
2190 #endif
2191 }
2192 
2193 static uint64_t
2194 metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
2195     uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2196 {
2197 	spa_t *spa = mg->mg_vd->vdev_spa;
2198 	metaslab_t *msp = NULL;
2199 	uint64_t offset = -1ULL;
2200 	avl_tree_t *t = &mg->mg_metaslab_tree;
2201 	uint64_t activation_weight;
2202 	uint64_t target_distance;
2203 	int i;
2204 
2205 	activation_weight = METASLAB_WEIGHT_PRIMARY;
2206 	for (i = 0; i < d; i++) {
2207 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2208 			activation_weight = METASLAB_WEIGHT_SECONDARY;
2209 			break;
2210 		}
2211 	}
2212 
2213 	for (;;) {
2214 		boolean_t was_active;
2215 
2216 		mutex_enter(&mg->mg_lock);
2217 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
2218 			if (msp->ms_weight < asize) {
2219 				spa_dbgmsg(spa, "%s: failed to meet weight "
2220 				    "requirement: vdev %llu, txg %llu, mg %p, "
2221 				    "msp %p, asize %llu, "
2222 				    "weight %llu", spa_name(spa),
2223 				    mg->mg_vd->vdev_id, txg,
2224 				    mg, msp, asize, msp->ms_weight);
2225 				mutex_exit(&mg->mg_lock);
2226 				return (-1ULL);
2227 			}
2228 
2229 			/*
2230 			 * If the selected metaslab is condensing, skip it.
2231 			 */
2232 			if (msp->ms_condensing)
2233 				continue;
2234 
2235 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2236 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2237 				break;
2238 
2239 			target_distance = min_distance +
2240 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
2241 			    min_distance >> 1);
2242 
2243 			for (i = 0; i < d; i++)
2244 				if (metaslab_distance(msp, &dva[i]) <
2245 				    target_distance)
2246 					break;
2247 			if (i == d)
2248 				break;
2249 		}
2250 		mutex_exit(&mg->mg_lock);
2251 		if (msp == NULL)
2252 			return (-1ULL);
2253 
2254 		mutex_enter(&msp->ms_lock);
2255 
2256 		/*
2257 		 * Ensure that the metaslab we have selected is still
2258 		 * capable of handling our request. It's possible that
2259 		 * another thread may have changed the weight while we
2260 		 * were blocked on the metaslab lock.
2261 		 */
2262 		if (msp->ms_weight < asize || (was_active &&
2263 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
2264 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
2265 			mutex_exit(&msp->ms_lock);
2266 			continue;
2267 		}
2268 
2269 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2270 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
2271 			metaslab_passivate(msp,
2272 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2273 			mutex_exit(&msp->ms_lock);
2274 			continue;
2275 		}
2276 
2277 		if (metaslab_activate(msp, activation_weight) != 0) {
2278 			mutex_exit(&msp->ms_lock);
2279 			continue;
2280 		}
2281 
2282 		/*
2283 		 * If this metaslab is currently condensing then pick again as
2284 		 * we can't manipulate this metaslab until it's committed
2285 		 * to disk.
2286 		 */
2287 		if (msp->ms_condensing) {
2288 			mutex_exit(&msp->ms_lock);
2289 			continue;
2290 		}
2291 
2292 		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
2293 			break;
2294 
2295 		metaslab_passivate(msp, metaslab_block_maxsize(msp));
2296 		mutex_exit(&msp->ms_lock);
2297 	}
2298 
2299 	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2300 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2301 
2302 	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
2303 	msp->ms_access_txg = txg + metaslab_unload_delay;
2304 
2305 	mutex_exit(&msp->ms_lock);
2306 	return (offset);
2307 }
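
/*
 * Activation example for the loop above: when allocating dva[1] of a
 * block whose dva[0] already lives on this vdev, activation_weight
 * switches to METASLAB_WEIGHT_SECONDARY, so instead of taking the
 * first (heaviest) eligible metaslab the scan also requires each
 * candidate to be at least target_distance away from the earlier DVA,
 * keeping copies of the same block spread apart on the device.
 */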
2308 
2309 /*
2310  * Allocate a block for the specified i/o.
2311  */
2312 static int
2313 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
2314     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
2315 {
2316 	metaslab_group_t *mg, *rotor;
2317 	vdev_t *vd;
2318 	int dshift = 3;
2319 	int all_zero;
2320 	int zio_lock = B_FALSE;
2321 	boolean_t allocatable;
2322 	uint64_t asize;
2323 	uint64_t distance;
2324 
2325 	ASSERT(!DVA_IS_VALID(&dva[d]));
2326 
2327 	/*
2328 	 * For testing, make some blocks above a certain size be gang blocks.
2329 	 */
2330 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
2331 		return (SET_ERROR(ENOSPC));
2332 
2333 	/*
2334 	 * Start at the rotor and loop through all mgs until we find something.
2335 	 * Note that there's no locking on mc_rotor or mc_aliquot because
2336 	 * nothing actually breaks if we miss a few updates -- we just won't
2337 	 * allocate quite as evenly.  It all balances out over time.
2338 	 *
2339 	 * If we are doing ditto or log blocks, try to spread them across
2340 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
2341 	 * allocated all of our ditto blocks, then try and spread them out on
2342 	 * that vdev as much as possible.  If it turns out to not be possible,
2343 	 * gradually lower our standards until anything becomes acceptable.
2344 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
2345 	 * gives us hope of containing our fault domains to something we're
2346 	 * able to reason about.  Otherwise, any two top-level vdev failures
2347 	 * will guarantee the loss of data.  With consecutive allocation,
2348 	 * only two adjacent top-level vdev failures will result in data loss.
2349 	 *
2350 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2351 	 * ourselves on the same vdev as our gang block header.  That
2352 	 * way, we can hope for locality in vdev_cache, plus it makes our
2353 	 * fault domains something tractable.
2354 	 */
2355 	if (hintdva) {
2356 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2357 
2358 		/*
2359 		 * It's possible the vdev we're using as the hint no
2360 		 * longer exists (i.e. it was removed). In that case,
2361 		 * fall back to the rotor.
2362 		 */
2363 		if (vd != NULL) {
2364 			mg = vd->vdev_mg;
2365 
2366 			if (flags & METASLAB_HINTBP_AVOID &&
2367 			    mg->mg_next != NULL)
2368 				mg = mg->mg_next;
2369 		} else {
2370 			mg = mc->mc_rotor;
2371 		}
2372 	} else if (d != 0) {
2373 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
2374 		mg = vd->vdev_mg->mg_next;
2375 	} else {
2376 		mg = mc->mc_rotor;
2377 	}
2378 
2379 	/*
2380 	 * If the hint put us into the wrong metaslab class, or into a
2381 	 * metaslab group that has been passivated, just follow the rotor.
2382 	 */
2383 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2384 		mg = mc->mc_rotor;
2385 
2386 	rotor = mg;
2387 top:
2388 	all_zero = B_TRUE;
2389 	do {
2390 		ASSERT(mg->mg_activation_count == 1);
2391 		vd = mg->mg_vd;
2392 
2393 		/*
2394 		 * Don't allocate from faulted devices.
2395 		 */
2396 		if (zio_lock) {
2397 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2398 			allocatable = vdev_allocatable(vd);
2399 			spa_config_exit(spa, SCL_ZIO, FTAG);
2400 		} else {
2401 			allocatable = vdev_allocatable(vd);
2402 		}
2403 
2404 		/*
2405 		 * Determine if the selected metaslab group is eligible
2406 		 * for allocations. If we're ganging then don't allow
2407 		 * this metaslab group to skip allocations since that would
2408 		 * inadvertently return ENOSPC and suspend the pool
2409 		 * even though space is still available.
2410 		 */
2411 		if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
2412 			allocatable = metaslab_group_allocatable(mg, rotor,
2413 			    psize);
2414 		}
2415 
2416 		if (!allocatable)
2417 			goto next;
2418 
2419 		ASSERT(mg->mg_initialized);
2420 
2421 		/*
2422 		 * Avoid writing single-copy data to a failing vdev.
2423 		 */
2424 		if ((vd->vdev_stat.vs_write_errors > 0 ||
2425 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
2426 		    d == 0 && dshift == 3 && vd->vdev_children == 0) {
2427 			all_zero = B_FALSE;
2428 			goto next;
2429 		}
2430 
2431 		ASSERT(mg->mg_class == mc);
2432 
2433 		distance = vd->vdev_asize >> dshift;
2434 		if (distance <= (1ULL << vd->vdev_ms_shift))
2435 			distance = 0;
2436 		else
2437 			all_zero = B_FALSE;
2438 
2439 		asize = vdev_psize_to_asize(vd, psize);
2440 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2441 
2442 		uint64_t offset = metaslab_group_alloc(mg, asize, txg,
2443 		    distance, dva, d);
2444 
2445 		mutex_enter(&mg->mg_lock);
2446 		if (offset == -1ULL) {
2447 			mg->mg_failed_allocations++;
2448 			if (asize == SPA_GANGBLOCKSIZE) {
2449 				/*
2450 				 * This metaslab group was unable to allocate
2451 				 * the minimum gang block size so it must be
2452 				 * out of space. We must notify the allocation
2453 				 * throttle to start skipping allocation
2454 				 * attempts to this metaslab group until more
2455 				 * space becomes available.
2456 				 *
2457 				 * Note: this failure cannot be caused by the
2458 				 * allocation throttle since the allocation
2459 				 * throttle is only responsible for skipping
2460 				 * devices and not failing block allocations.
2461 				 */
2462 				mg->mg_no_free_space = B_TRUE;
2463 			}
2464 		}
2465 		mg->mg_allocations++;
2466 		mutex_exit(&mg->mg_lock);
2467 
2468 		if (offset != -1ULL) {
2469 			/*
2470 			 * If we've just selected this metaslab group,
2471 			 * figure out whether the corresponding vdev is
2472 			 * over- or under-used relative to the pool,
2473 			 * and set an allocation bias to even it out.
2474 			 */
2475 			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
2476 				vdev_stat_t *vs = &vd->vdev_stat;
2477 				int64_t vu, cu;
2478 
2479 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
2480 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
2481 
2482 				/*
2483 				 * Calculate how much more or less we should
2484 				 * try to allocate from this device during
2485 				 * this iteration around the rotor.
2486 				 * For example, if a device is 80% full
2487 				 * and the pool is 20% full then we should
2488 				 * reduce allocations by 60% on this device.
2489 				 *
2490 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
2491 				 *
2492 				 * This reduces allocations by 307K for this
2493 				 * iteration.
2494 				 */
2495 				mg->mg_bias = ((cu - vu) *
2496 				    (int64_t)mg->mg_aliquot) / 100;
2497 			} else if (!metaslab_bias_enabled) {
2498 				mg->mg_bias = 0;
2499 			}
2500 
2501 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
2502 			    mg->mg_aliquot + mg->mg_bias) {
2503 				mc->mc_rotor = mg->mg_next;
2504 				mc->mc_aliquot = 0;
2505 			}
2506 
2507 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
2508 			DVA_SET_OFFSET(&dva[d], offset);
2509 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
2510 			DVA_SET_ASIZE(&dva[d], asize);
2511 
2512 			return (0);
2513 		}
2514 next:
2515 		mc->mc_rotor = mg->mg_next;
2516 		mc->mc_aliquot = 0;
2517 	} while ((mg = mg->mg_next) != rotor);
2518 
2519 	if (!all_zero) {
2520 		dshift++;
2521 		ASSERT(dshift < 64);
2522 		goto top;
2523 	}
2524 
2525 	if (!allocatable && !zio_lock) {
2526 		dshift = 3;
2527 		zio_lock = B_TRUE;
2528 		goto top;
2529 	}
2530 
2531 	bzero(&dva[d], sizeof (dva_t));
2532 
2533 	return (SET_ERROR(ENOSPC));
2534 }
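
/*
 * Sketch of the distance relaxation above (assumed geometry): on a 1TB
 * top-level vdev the first pass asks metaslab_group_alloc() for a
 * minimum distance of 1TB >> 3 = 128GB between DVAs of the same block
 * (metaslab_group_alloc() adds another 50% for metaslabs with nothing
 * allocated yet).  Each time the rotor completes a full loop without an
 * allocation, dshift is incremented and the requirement halves (64GB,
 * 32GB, and so on) until it is no larger than a single metaslab on
 * every vdev (all_zero); after that, one more pass may be made with
 * SCL_ZIO held to re-check devices that appeared unallocatable.
 */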
2535 
2536 /*
2537  * Free the block represented by DVA in the context of the specified
2538  * transaction group.
2539  */
2540 static void
2541 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
2542 {
2543 	uint64_t vdev = DVA_GET_VDEV(dva);
2544 	uint64_t offset = DVA_GET_OFFSET(dva);
2545 	uint64_t size = DVA_GET_ASIZE(dva);
2546 	vdev_t *vd;
2547 	metaslab_t *msp;
2548 
2549 	ASSERT(DVA_IS_VALID(dva));
2550 
2551 	if (txg > spa_freeze_txg(spa))
2552 		return;
2553 
2554 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2555 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2556 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2557 		    (u_longlong_t)vdev, (u_longlong_t)offset);
2558 		ASSERT(0);
2559 		return;
2560 	}
2561 
2562 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2563 
2564 	if (DVA_GET_GANG(dva))
2565 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2566 
2567 	mutex_enter(&msp->ms_lock);
2568 
2569 	if (now) {
2570 		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2571 		    offset, size);
2572 
2573 		VERIFY(!msp->ms_condensing);
2574 		VERIFY3U(offset, >=, msp->ms_start);
2575 		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2576 		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2577 		    msp->ms_size);
2578 		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2579 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2580 		range_tree_add(msp->ms_tree, offset, size);
2581 	} else {
2582 		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2583 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2584 		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2585 		    offset, size);
2586 	}
2587 
2588 	mutex_exit(&msp->ms_lock);
2589 }
2590 
2591 /*
2592  * Intent log support: upon opening the pool after a crash, notify the SPA
2593  * of blocks that the intent log has allocated for immediate write, but
2594  * which are still considered free by the SPA because the last transaction
2595  * group didn't commit yet.
2596  */
2597 static int
2598 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2599 {
2600 	uint64_t vdev = DVA_GET_VDEV(dva);
2601 	uint64_t offset = DVA_GET_OFFSET(dva);
2602 	uint64_t size = DVA_GET_ASIZE(dva);
2603 	vdev_t *vd;
2604 	metaslab_t *msp;
2605 	int error = 0;
2606 
2607 	ASSERT(DVA_IS_VALID(dva));
2608 
2609 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2610 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2611 		return (SET_ERROR(ENXIO));
2612 
2613 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2614 
2615 	if (DVA_GET_GANG(dva))
2616 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2617 
2618 	mutex_enter(&msp->ms_lock);
2619 
2620 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2621 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2622 
2623 	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2624 		error = SET_ERROR(ENOENT);
2625 
2626 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2627 		mutex_exit(&msp->ms_lock);
2628 		return (error);
2629 	}
2630 
2631 	VERIFY(!msp->ms_condensing);
2632 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2633 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2634 	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2635 	range_tree_remove(msp->ms_tree, offset, size);
2636 
2637 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2638 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2639 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2640 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2641 	}
2642 
2643 	mutex_exit(&msp->ms_lock);
2644 
2645 	return (0);
2646 }
2647 
2648 /*
2649  * Reserve some allocation slots. The reservation system must be called
2650  * before we call into the allocator. If there aren't any available slots
2651  * then the I/O will be throttled until an I/O completes and its slots are
2652  * freed up. The function returns true if it was successful in placing
2653  * the reservation.
2654  */
2655 boolean_t
2656 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
2657     int flags)
2658 {
2659 	uint64_t available_slots = 0;
2660 	boolean_t slot_reserved = B_FALSE;
2661 
2662 	ASSERT(mc->mc_alloc_throttle_enabled);
2663 	mutex_enter(&mc->mc_lock);
2664 
2665 	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
2666 	if (reserved_slots < mc->mc_alloc_max_slots)
2667 		available_slots = mc->mc_alloc_max_slots - reserved_slots;
2668 
2669 	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
2670 		/*
2671 		 * We reserve the slots individually so that we can unreserve
2672 		 * them individually when an I/O completes.
2673 		 */
2674 		for (int d = 0; d < slots; d++) {
2675 			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
2676 		}
2677 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
2678 		slot_reserved = B_TRUE;
2679 	}
2680 
2681 	mutex_exit(&mc->mc_lock);
2682 	return (slot_reserved);
2683 }
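
/*
 * Hypothetical caller pattern (a simplified sketch, not the actual
 * zio pipeline code): an async allocator would reserve one slot per
 * DVA before allocating and release them once the allocating I/O
 * completes:
 *
 *	if (metaslab_class_throttle_reserve(mc, ndvas, zio, flags)) {
 *		error = metaslab_alloc(spa, mc, psize, bp, ndvas,
 *		    txg, NULL, flags, zio);
 *		...
 *		(later, when the zio completes)
 *		metaslab_class_throttle_unreserve(mc, ndvas, zio);
 *	} else {
 *		(no slots free: throttle the zio and retry after
 *		another allocating I/O completes)
 *	}
 *
 * The refcount on mc_alloc_slots is bumped once per slot so that
 * completion can drop exactly the slots this zio is holding.
 */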
2684 
2685 void
2686 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
2687 {
2688 	ASSERT(mc->mc_alloc_throttle_enabled);
2689 	mutex_enter(&mc->mc_lock);
2690 	for (int d = 0; d < slots; d++) {
2691 		(void) refcount_remove(&mc->mc_alloc_slots, zio);
2692 	}
2693 	mutex_exit(&mc->mc_lock);
2694 }
2695 
2696 int
2697 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2698     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
2699 {
2700 	dva_t *dva = bp->blk_dva;
2701 	dva_t *hintdva = hintbp->blk_dva;
2702 	int error = 0;
2703 
2704 	ASSERT(bp->blk_birth == 0);
2705 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2706 
2707 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2708 
2709 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2710 		spa_config_exit(spa, SCL_ALLOC, FTAG);
2711 		return (SET_ERROR(ENOSPC));
2712 	}
2713 
2714 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2715 	ASSERT(BP_GET_NDVAS(bp) == 0);
2716 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2717 
2718 	for (int d = 0; d < ndvas; d++) {
2719 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2720 		    txg, flags);
2721 		if (error != 0) {
2722 			for (d--; d >= 0; d--) {
2723 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2724 				metaslab_group_alloc_decrement(spa,
2725 				    DVA_GET_VDEV(&dva[d]), zio, flags);
2726 				bzero(&dva[d], sizeof (dva_t));
2727 			}
2728 			spa_config_exit(spa, SCL_ALLOC, FTAG);
2729 			return (error);
2730 		} else {
2731 			/*
2732 			 * Update the metaslab group's queue depth
2733 			 * based on the newly allocated dva.
2734 			 */
2735 			metaslab_group_alloc_increment(spa,
2736 			    DVA_GET_VDEV(&dva[d]), zio, flags);
2737 		}
2738 
2739 	}
2740 	ASSERT(error == 0);
2741 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2742 
2743 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2744 
2745 	BP_SET_BIRTH(bp, txg, txg);
2746 
2747 	return (0);
2748 }
2749 
2750 void
2751 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2752 {
2753 	const dva_t *dva = bp->blk_dva;
2754 	int ndvas = BP_GET_NDVAS(bp);
2755 
2756 	ASSERT(!BP_IS_HOLE(bp));
2757 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2758 
2759 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2760 
2761 	for (int d = 0; d < ndvas; d++)
2762 		metaslab_free_dva(spa, &dva[d], txg, now);
2763 
2764 	spa_config_exit(spa, SCL_FREE, FTAG);
2765 }
2766 
2767 int
2768 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2769 {
2770 	const dva_t *dva = bp->blk_dva;
2771 	int ndvas = BP_GET_NDVAS(bp);
2772 	int error = 0;
2773 
2774 	ASSERT(!BP_IS_HOLE(bp));
2775 
2776 	if (txg != 0) {
2777 		/*
2778 		 * First do a dry run to make sure all DVAs are claimable,
2779 		 * so we don't have to unwind from partial failures below.
2780 		 */
2781 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2782 			return (error);
2783 	}
2784 
2785 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2786 
2787 	for (int d = 0; d < ndvas; d++)
2788 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2789 			break;
2790 
2791 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2792 
2793 	ASSERT(error == 0 || txg == 0);
2794 
2795 	return (error);
2796 }
2797 
2798 void
2799 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2800 {
2801 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2802 		return;
2803 
2804 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2805 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2806 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2807 		vdev_t *vd = vdev_lookup_top(spa, vdev);
2808 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2809 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2810 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2811 
2812 		if (msp->ms_loaded)
2813 			range_tree_verify(msp->ms_tree, offset, size);
2814 
2815 		for (int j = 0; j < TXG_SIZE; j++)
2816 			range_tree_verify(msp->ms_freetree[j], offset, size);
2817 		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2818 			range_tree_verify(msp->ms_defertree[j], offset, size);
2819 	}
2820 	spa_config_exit(spa, SCL_VDEV, FTAG);
2821 }
2822