/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>

/* True when the allocation flags indicate a gang block (header or child). */
#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

/*
 * Amount of data (in bytes) allocated from a metaslab group before the
 * allocation rotor advances to the next group (scaled by vdev_children
 * in metaslab_group_activate()).
 */
uint64_t metaslab_aliquot = 512ULL << 10;

/*
 * Allocations of at least this size are forced to gang.  The default of
 * SPA_MAXBLOCKSIZE + 1 means no allocation is ever forced.
 */
uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * Since we can touch multiple metaslabs (and their respective space maps)
 * with each transaction group, we benefit from having a smaller space map
 * block size since it allows us to issue more I/O operations scattered
 * around the disk.
 */
int zfs_metaslab_sm_blksz = (1 << 12);

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using more
 * aggressive strategy (i.e., search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslab.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;

/*
 * Enable/disable segment-based metaslab selection.
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);

/* Cache for metaslab_alloc_trace_t records (see allocation tracing above). */
kmem_cache_t *metaslab_alloc_trace_cache;

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */

/*
 * Allocate and initialize a new metaslab class for the given spa using the
 * supplied allocator ops vector.  The rotor starts out empty; groups are
 * added via metaslab_group_activate().
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	refcount_create_tracked(&mc->mc_alloc_slots);

	return (mc);
}

/*
 * Destroy a metaslab class.  The class must be empty: no groups on the
 * rotor and no outstanding space accounting.
 */
void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	refcount_destroy(&mc->mc_alloc_slots);
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

/*
 * Sanity-check the class's group rotor: every group must belong to this
 * class and hang off a concrete top-level vdev.  Always returns 0; the
 * checks themselves are ASSERTs.
 */
int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

/*
 * Apply signed deltas to the class-wide space accounting.  Uses atomics so
 * callers need not hold mc_lock.
 */
void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

/* Current allocated space in this class (bytes). */
uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

/* Space pending deferred frees in this class (bytes). */
uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

/* Total space in this class (bytes). */
uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

/* Deflated space if the pool uses deflation, otherwise raw space. */
uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

/*
 * Debug-only check (gated on ZFS_DEBUG_HISTOGRAM_VERIFY) that the class
 * histogram equals the sum of the histograms of its member groups.
 */
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		uint64_t tspace;
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 * Adjust by efi system partition size.
		 */
		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
		if (tspace > mc->mc_spa->spa_bootsize) {
			tspace -= mc->mc_spa->spa_bootsize;
		}
		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

/*
 * AVL comparator ordering metaslabs by descending weight, with the start
 * offset as a tiebreaker so entries are totally ordered.
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_start < m2->ms_start)
		return (-1);
	if (m1->ms_start > m2->ms_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocated = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an allocated
	 * space map. Calling this in non-syncing context does not
	 * provide a consistent view of the metaslab since we're performing
	 * allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
	    space_map_alloc_delta(msp->ms_sm);

	/*
	 * Account for future allocations since we would have already
	 * deducted that space from the allocatable tree.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocated +=
		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
	}

	msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
	    msp->ms_deferspace + range_tree_space(msp->ms_freed);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true when the group is still activated,
 * its free capacity is above the zfs_mg_noalloc_threshold, and its
 * fragmentation metric (when valid) is at or below the
 * zfs_mg_fragmentation_threshold (see the mg_allocatable computation
 * below). If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	/* Percentage of free space; +1 avoids dividing by zero space. */
	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

/*
 * Allocate and initialize a metaslab group for top-level vdev vd in class
 * mc.  The group starts deactivated (activation count 0) and is placed on
 * the class rotor later by metaslab_group_activate().
 */
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	refcount_create_tracked(&mg->mg_alloc_queue_depth);

	/* Per-group taskq used to preload metaslabs asynchronously. */
	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

/*
 * Tear down a metaslab group.  The group must already be off the rotor
 * (mg_prev/mg_next NULL) and passivated.
 */
void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	refcount_destroy(&mg->mg_alloc_queue_depth);
	kmem_free(mg, sizeof (metaslab_group_t));
}

/*
 * Activate a metaslab group and insert it into the class's allocation
 * rotor.  A non-positive activation count after the increment means the
 * group was previously passivated more times than activated, so it stays
 * off the rotor.
 */
void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	/* Link the group into the circular rotor list. */
	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	    (SCL_ALLOC | SCL_ZIO));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/*
	 * The spa_config_lock is an array of rwlocks, ordered as
	 * follows (from highest to lowest):
	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
	 * (For more information about the spa_config_lock see spa_misc.c)
	 * The higher the lock, the broader its coverage. When we passivate
	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	 * config locks. However, the metaslab group's taskq might be trying
	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
	 * lower locks to allow the I/O to complete. At a minimum,
	 * we continue to hold the SCL_ALLOC lock, which prevents any future
	 * allocations from taking place and any changes to the vdev tree.
	 */
	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait(mg->mg_taskq);
	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);

	/* Unlink the group from the circular rotor list. */
	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

/*
 * A group is initialized once its vdev reports non-zero space and the
 * group has been activated.
 */
boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

/* Total space managed by this group: metaslab size times metaslab count. */
uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

/*
 * Debug-only check (gated on ZFS_DEBUG_HISTOGRAM_VERIFY) that the group
 * histogram equals the sum of its metaslabs' space map histograms, shifted
 * by the vdev's ashift.
 */
void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Fold msp's space map histogram into the group and class histograms
 * (shifted by ashift).  No-op for metaslabs without a space map.
 */
static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

/*
 * Subtract msp's space map histogram from the group and class histograms;
 * the inverse of metaslab_group_histogram_add().
 */
void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

/*
 * Add a metaslab to its group's weight-sorted tree (with weight 0) and
 * fold its histogram into the group/class totals.
 */
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

/*
 * Remove a metaslab from its group; the inverse of metaslab_group_add().
 */
static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

/*
 * Re-sort msp within the group's weight-ordered tree by removing it,
 * updating its weight, and re-inserting it.
 */
static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in
 * this group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_max_alloc_queue_depth;

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = refcount_count(&mg->mg_alloc_queue_depth);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_max_alloc_queue_depth;

			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
1007 */ 1008 static int 1009 metaslab_rangesize_compare(const void *x1, const void *x2) 1010 { 1011 const range_seg_t *r1 = x1; 1012 const range_seg_t *r2 = x2; 1013 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1014 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1015 1016 if (rs_size1 < rs_size2) 1017 return (-1); 1018 if (rs_size1 > rs_size2) 1019 return (1); 1020 1021 if (r1->rs_start < r2->rs_start) 1022 return (-1); 1023 1024 if (r1->rs_start > r2->rs_start) 1025 return (1); 1026 1027 return (0); 1028 } 1029 1030 /* 1031 * Create any block allocator specific components. The current allocators 1032 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 1033 */ 1034 static void 1035 metaslab_rt_create(range_tree_t *rt, void *arg) 1036 { 1037 metaslab_t *msp = arg; 1038 1039 ASSERT3P(rt->rt_arg, ==, msp); 1040 ASSERT(msp->ms_allocatable == NULL); 1041 1042 avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, 1043 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 1044 } 1045 1046 /* 1047 * Destroy the block allocator specific components. 
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	/* All segments must have been removed before teardown. */
	ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));

	avl_destroy(&msp->ms_allocatable_by_size);
}

/* Mirror an insertion into the range tree in the size-ordered tree. */
static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	/* The trees must not change while the metaslab is being condensed. */
	VERIFY(!msp->ms_condensing);
	avl_add(&msp->ms_allocatable_by_size, rs);
}

/* Mirror a removal from the range tree in the size-ordered tree. */
static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_remove(&msp->ms_allocatable_by_size, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_allocatable, ==, rt);

	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/* Callback table wired into range_tree_create() in metaslab_init(). */
static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_allocatable_by_size;
	range_seg_t *rs;

	/*
	 * NOTE(review): t is the address of an embedded member and so can
	 * never be NULL here; the check appears to be defensive only.
	 */
	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	/* The largest segment sorts last in the size-ordered tree. */
	return (rs->rs_end - rs->rs_start);
}

/*
 * Return the first segment at or after [start, start + size) in the
 * offset-ordered tree, or NULL if no such segment exists.
 */
static range_seg_t *
metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = start;
	rsearch.rs_end = start + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL) {
		/* No exact match: take the next segment past the cursor. */
		rs = avl_nearest(t, where, AVL_AFTER);
	}

	return (rs);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
1151 */ 1152 static uint64_t 1153 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1154 uint64_t align) 1155 { 1156 range_seg_t *rs = metaslab_block_find(t, *cursor, size); 1157 1158 while (rs != NULL) { 1159 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1160 1161 if (offset + size <= rs->rs_end) { 1162 *cursor = offset + size; 1163 return (offset); 1164 } 1165 rs = AVL_NEXT(t, rs); 1166 } 1167 1168 /* 1169 * If we know we've searched the whole map (*cursor == 0), give up. 1170 * Otherwise, reset the cursor to the beginning and try again. 1171 */ 1172 if (*cursor == 0) 1173 return (-1ULL); 1174 1175 *cursor = 0; 1176 return (metaslab_block_picker(t, cursor, size, align)); 1177 } 1178 1179 /* 1180 * ========================================================================== 1181 * The first-fit block allocator 1182 * ========================================================================== 1183 */ 1184 static uint64_t 1185 metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1186 { 1187 /* 1188 * Find the largest power of 2 block size that evenly divides the 1189 * requested size. This is used to try to allocate blocks with similar 1190 * alignment from the same area of the metaslab (i.e. same cursor 1191 * bucket) but it does not guarantee that other allocations sizes 1192 * may exist in the same region. 1193 */ 1194 uint64_t align = size & -size; 1195 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1196 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1197 1198 return (metaslab_block_picker(t, cursor, size, align)); 1199 } 1200 1201 static metaslab_ops_t metaslab_ff_ops = { 1202 metaslab_ff_alloc 1203 }; 1204 1205 /* 1206 * ========================================================================== 1207 * Dynamic block allocator - 1208 * Uses the first fit allocation scheme until space get low and then 1209 * adjusts to a best fit allocation method. 
Uses metaslab_df_alloc_threshold 1210 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1211 * ========================================================================== 1212 */ 1213 static uint64_t 1214 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1215 { 1216 /* 1217 * Find the largest power of 2 block size that evenly divides the 1218 * requested size. This is used to try to allocate blocks with similar 1219 * alignment from the same area of the metaslab (i.e. same cursor 1220 * bucket) but it does not guarantee that other allocations sizes 1221 * may exist in the same region. 1222 */ 1223 uint64_t align = size & -size; 1224 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1225 range_tree_t *rt = msp->ms_allocatable; 1226 avl_tree_t *t = &rt->rt_root; 1227 uint64_t max_size = metaslab_block_maxsize(msp); 1228 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1229 1230 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1231 ASSERT3U(avl_numnodes(t), ==, 1232 avl_numnodes(&msp->ms_allocatable_by_size)); 1233 1234 if (max_size < size) 1235 return (-1ULL); 1236 1237 /* 1238 * If we're running low on space switch to using the size 1239 * sorted AVL tree (best-fit). 1240 */ 1241 if (max_size < metaslab_df_alloc_threshold || 1242 free_pct < metaslab_df_free_pct) { 1243 t = &msp->ms_allocatable_by_size; 1244 *cursor = 0; 1245 } 1246 1247 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1248 } 1249 1250 static metaslab_ops_t metaslab_df_ops = { 1251 metaslab_df_alloc 1252 }; 1253 1254 /* 1255 * ========================================================================== 1256 * Cursor fit block allocator - 1257 * Select the largest region in the metaslab, set the cursor to the beginning 1258 * of the range and the cursor_end to the end of the range. As allocations 1259 * are made advance the cursor. Continue allocating from the cursor until 1260 * the range is exhausted and then find a new range. 
1261 * ========================================================================== 1262 */ 1263 static uint64_t 1264 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1265 { 1266 range_tree_t *rt = msp->ms_allocatable; 1267 avl_tree_t *t = &msp->ms_allocatable_by_size; 1268 uint64_t *cursor = &msp->ms_lbas[0]; 1269 uint64_t *cursor_end = &msp->ms_lbas[1]; 1270 uint64_t offset = 0; 1271 1272 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1273 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1274 1275 ASSERT3U(*cursor_end, >=, *cursor); 1276 1277 if ((*cursor + size) > *cursor_end) { 1278 range_seg_t *rs; 1279 1280 rs = avl_last(&msp->ms_allocatable_by_size); 1281 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1282 return (-1ULL); 1283 1284 *cursor = rs->rs_start; 1285 *cursor_end = rs->rs_end; 1286 } 1287 1288 offset = *cursor; 1289 *cursor += size; 1290 1291 return (offset); 1292 } 1293 1294 static metaslab_ops_t metaslab_cf_ops = { 1295 metaslab_cf_alloc 1296 }; 1297 1298 /* 1299 * ========================================================================== 1300 * New dynamic fit allocator - 1301 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1302 * contiguous blocks. If no region is found then just use the largest segment 1303 * that remains. 1304 * ========================================================================== 1305 */ 1306 1307 /* 1308 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1309 * to request from the allocator. 
 */
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *t = &msp->ms_allocatable->rt_root;
	avl_index_t where;
	range_seg_t *rs, rsearch;
	uint64_t hbit = highbit64(size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t max_size = metaslab_block_maxsize(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==,
	    avl_numnodes(&msp->ms_allocatable_by_size));

	if (max_size < size)
		return (-1ULL);

	/* First try the segment at the per-size cursor. */
	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
		/*
		 * The cursor segment can't hold the request; fall back to
		 * the size-ordered tree and look for a clump of up to
		 * 2^(hbit + metaslab_ndf_clump_shift) bytes, capped at the
		 * largest free segment in the metaslab.
		 */
		t = &msp->ms_allocatable_by_size;

		rsearch.rs_start = 0;
		rsearch.rs_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		rs = avl_find(t, &rsearch, &where);
		if (rs == NULL)
			rs = avl_nearest(t, where, AVL_AFTER);
		/* max_size >= size guarantees a fitting segment exists. */
		ASSERT(rs != NULL);
	}

	if ((rs->rs_end - rs->rs_start) >= size) {
		*cursor = rs->rs_start + size;
		return (rs->rs_start);
	}
	return (-1ULL);
}

static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};

/* Allocator used for new metaslabs: the dynamic (first/best) fit. */
metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

/*
 * Wait for any in-progress metaslab loads to complete.
 */
void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}

/*
 * Load the metaslab's free space into ms_allocatable, either from its
 * space map or (if none exists) by treating the whole metaslab as free.
 * Returns 0 on success or the error from space_map_load().
 */
int
metaslab_load(metaslab_t *msp)
{
	int error = 0;
	boolean_t success = B_FALSE;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_loaded);
	ASSERT(!msp->ms_loading);

	msp->ms_loading = B_TRUE;
	/*
	 * Nobody else can manipulate a loading metaslab, so it's now safe
	 * to drop the lock. This way we don't have to hold the lock while
	 * reading the spacemap from disk.
	 */
	mutex_exit(&msp->ms_lock);

	/*
	 * If the space map has not been allocated yet, then treat
	 * all the space in the metaslab as free and add it to ms_allocatable.
	 */
	if (msp->ms_sm != NULL) {
		error = space_map_load(msp->ms_sm, msp->ms_allocatable,
		    SM_FREE);
	} else {
		range_tree_add(msp->ms_allocatable,
		    msp->ms_start, msp->ms_size);
	}

	success = (error == 0);

	mutex_enter(&msp->ms_lock);
	msp->ms_loading = B_FALSE;

	if (success) {
		ASSERT3P(msp->ms_group, !=, NULL);
		msp->ms_loaded = B_TRUE;

		/*
		 * If the metaslab already has a spacemap, then we need to
		 * remove all segments from the defer tree; otherwise, the
		 * metaslab is completely empty and we can skip this.
		 */
		if (msp->ms_sm != NULL) {
			for (int t = 0; t < TXG_DEFER_SIZE; t++) {
				range_tree_walk(msp->ms_defer[t],
				    range_tree_remove, msp->ms_allocatable);
			}
		}
		msp->ms_max_size = metaslab_block_maxsize(msp);
	}
	/* Wake any threads blocked in metaslab_load_wait(). */
	cv_broadcast(&msp->ms_load_cv);
	return (error);
}

/* Discard the in-core free-space state and deactivate the metaslab. */
void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
	msp->ms_max_size = 0;
}

/*
 * Allocate and initialize metaslab `id` of group `mg`, opening its space
 * map object if one already exists. On success *msp receives the new
 * metaslab and 0 is returned; otherwise the space_map_open() error.
 */
int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
    metaslab_t **msp)
{
	vdev_t *vd = mg->mg_vd;
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	metaslab_t *ms;
	int error;

	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
	ms->ms_id = id;
	ms->ms_start = id << vd->vdev_ms_shift;
	ms->ms_size = 1ULL << vd->vdev_ms_shift;

	/*
	 * We only open space map objects that already exist. All others
	 * will be opened when we finally allocate an object for it.
	 */
	if (object != 0) {
		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
		    ms->ms_size, vd->vdev_ashift);

		if (error != 0) {
			kmem_free(ms, sizeof (metaslab_t));
			return (error);
		}

		ASSERT(ms->ms_sm != NULL);
	}

	/*
	 * We create the main range tree here, but we don't create the
	 * other range trees until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
	metaslab_group_add(mg, ms);

	metaslab_set_fragmentation(ms);

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * The metaslab's weight will also be initialized when we sync
	 * out this txg. This ensures that we don't attempt to allocate
	 * from it before we have initialized it completely.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(ms, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
	 * that has an allocated space map object then load its space
	 * map so that we can verify frees.
	 */
	if (metaslab_debug_load && ms->ms_sm != NULL) {
		mutex_enter(&ms->ms_lock);
		VERIFY0(metaslab_load(ms));
		mutex_exit(&ms->ms_lock);
	}

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, ms, txg);
	}

	*msp = ms;

	return (0);
}

/*
 * Tear down a metaslab: detach it from its group, return its space
 * accounting to the vdev, and destroy its trees, locks, and memory.
 */
void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);
	VERIFY(msp->ms_group == NULL);
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	    0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	metaslab_unload(msp);
	range_tree_destroy(msp->ms_allocatable);
	range_tree_destroy(msp->ms_freeing);
	range_tree_destroy(msp->ms_freed);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_allocating[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defer[t]);
	}
	ASSERT0(msp->ms_deferspace);

	range_tree_destroy(msp->ms_checkpointing);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);
	mutex_destroy(&msp->ms_sync_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0	/* 16M	*/
};

/*
 * Calculate the metaslab's fragmentation metric.
 * A stored value of
 * ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
 * not support this metric. Otherwise, the value written to
 * msp->ms_fragmentation is in the range [0, 100].
 */
static void
metaslab_set_fragmentation(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t fragmentation = 0;
	uint64_t total = 0;
	boolean_t feature_enabled = spa_feature_is_enabled(spa,
	    SPA_FEATURE_SPACEMAP_HISTOGRAM);

	if (!feature_enabled) {
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	/*
	 * A null space map means that the entire metaslab is free
	 * and thus is not fragmented.
	 */
	if (msp->ms_sm == NULL) {
		msp->ms_fragmentation = 0;
		return;
	}

	/*
	 * If this metaslab's space map has not been upgraded, flag it
	 * so that we upgrade next time we encounter it.
	 */
	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
		uint64_t txg = spa_syncing_txg(spa);
		vdev_t *vd = msp->ms_group->mg_vd;

		/*
		 * If we've reached the final dirty txg, then we must
		 * be shutting down the pool. We don't want to dirty
		 * any data past this point so skip setting the condense
		 * flag. We can retry this action the next time the pool
		 * is imported.
		 */
		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
			msp->ms_condense_wanted = B_TRUE;
			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
			    vd->vdev_id);
		}
		msp->ms_fragmentation = ZFS_FRAG_INVALID;
		return;
	}

	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t space = 0;
		uint8_t shift = msp->ms_sm->sm_shift;

		/* Buckets beyond the table clamp to the last (0%) entry. */
		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
			continue;

		/* Histogram bucket i counts segments of size 2^(i + shift). */
		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
		total += space;

		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
		fragmentation += space * zfs_frag_table[idx];
	}

	if (total > 0)
		fragmentation /= total;
	ASSERT3U(fragmentation, <=, 100);

	msp->ms_fragmentation = fragmentation;
}

/*
 * Compute a weight -- a selection preference value -- for the given metaslab.
 * This is based on the amount of free space, the level of fragmentation,
 * the LBA range, and whether the metaslab is loaded.
 */
static uint64_t
metaslab_space_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!vd->vdev_removing);

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = msp->ms_size - space_map_allocated(msp->ms_sm);

	if (metaslab_fragmentation_factor_enabled &&
	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
		/*
		 * Use the fragmentation information to inversely scale
		 * down the baseline weight. We need to ensure that we
		 * don't exclude this metaslab completely when it's 100%
		 * fragmented. To avoid this we reduce the fragmented value
		 * by 1.
		 */
		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

		/*
		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
		 * this metaslab again. The fragmentation metric may have
		 * decreased the space to something smaller than
		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
		 * so that we can consume any remaining space.
		 */
		if (space > 0 && space < SPA_MINBLOCKSIZE)
			space = SPA_MINBLOCKSIZE;
	}
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	if (metaslab_lba_weighting_enabled) {
		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
		ASSERT(weight >= space && weight <= 2 * space);
	}

	/*
	 * If this metaslab is one we're actively using, adjust its
	 * weight to make it preferable to any inactive metaslab so
	 * we'll polish it off. If the fragmentation on this metaslab
	 * has exceeded our threshold, then don't mark it active.
	 */
	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}

	WEIGHT_SET_SPACEBASED(weight);
	return (weight);
}

/*
 * Return the weight of the specified metaslab, according to the segment-based
 * weighting algorithm. The metaslab must be loaded.
This function can 1754 * be called within a sync pass since it relies only on the metaslab's 1755 * range tree which is always accurate when the metaslab is loaded. 1756 */ 1757 static uint64_t 1758 metaslab_weight_from_range_tree(metaslab_t *msp) 1759 { 1760 uint64_t weight = 0; 1761 uint32_t segments = 0; 1762 1763 ASSERT(msp->ms_loaded); 1764 1765 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 1766 i--) { 1767 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 1768 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1769 1770 segments <<= 1; 1771 segments += msp->ms_allocatable->rt_histogram[i]; 1772 1773 /* 1774 * The range tree provides more precision than the space map 1775 * and must be downgraded so that all values fit within the 1776 * space map's histogram. This allows us to compare loaded 1777 * vs. unloaded metaslabs to determine which metaslab is 1778 * considered "best". 1779 */ 1780 if (i > max_idx) 1781 continue; 1782 1783 if (segments != 0) { 1784 WEIGHT_SET_COUNT(weight, segments); 1785 WEIGHT_SET_INDEX(weight, i); 1786 WEIGHT_SET_ACTIVE(weight, 0); 1787 break; 1788 } 1789 } 1790 return (weight); 1791 } 1792 1793 /* 1794 * Calculate the weight based on the on-disk histogram. This should only 1795 * be called after a sync pass has completely finished since the on-disk 1796 * information is updated in metaslab_sync(). 1797 */ 1798 static uint64_t 1799 metaslab_weight_from_spacemap(metaslab_t *msp) 1800 { 1801 uint64_t weight = 0; 1802 1803 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 1804 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { 1805 WEIGHT_SET_COUNT(weight, 1806 msp->ms_sm->sm_phys->smp_histogram[i]); 1807 WEIGHT_SET_INDEX(weight, i + 1808 msp->ms_sm->sm_shift); 1809 WEIGHT_SET_ACTIVE(weight, 0); 1810 break; 1811 } 1812 } 1813 return (weight); 1814 } 1815 1816 /* 1817 * Compute a segment-based weight for the specified metaslab. The weight 1818 * is determined by highest bucket in the histogram. 
 * The information
 * for the highest bucket is encoded into the weight value.
 */
static uint64_t
metaslab_segment_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	uint64_t weight = 0;
	uint8_t shift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The metaslab is completely free.
	 */
	if (space_map_allocated(msp->ms_sm) == 0) {
		int idx = highbit64(msp->ms_size) - 1;
		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

		if (idx < max_idx) {
			WEIGHT_SET_COUNT(weight, 1ULL);
			WEIGHT_SET_INDEX(weight, idx);
		} else {
			/*
			 * The free segment is larger than the largest
			 * representable bucket, so express it as multiple
			 * segments of the maximum bucket size.
			 */
			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
			WEIGHT_SET_INDEX(weight, max_idx);
		}
		WEIGHT_SET_ACTIVE(weight, 0);
		ASSERT(!WEIGHT_IS_SPACEBASED(weight));

		return (weight);
	}

	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));

	/*
	 * If the metaslab is fully allocated then just make the weight 0.
	 */
	if (space_map_allocated(msp->ms_sm) == msp->ms_size)
		return (0);
	/*
	 * If the metaslab is already loaded, then use the range tree to
	 * determine the weight. Otherwise, we rely on the space map information
	 * to generate the weight.
	 */
	if (msp->ms_loaded) {
		weight = metaslab_weight_from_range_tree(msp);
	} else {
		weight = metaslab_weight_from_spacemap(msp);
	}

	/*
	 * If the metaslab was active the last time we calculated its weight
	 * then keep it active. We want to consume the entire region that
	 * is associated with this weight.
	 */
	if (msp->ms_activation_weight != 0 && weight != 0)
		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
	return (weight);
}

/*
 * Determine if we should attempt to allocate from this metaslab.
 * If the
 * metaslab has a maximum size then we can quickly determine if the desired
 * allocation size can be satisfied. Otherwise, if we're using segment-based
 * weighting then we can determine the maximum allocation that this metaslab
 * can accommodate based on the index encoded in the weight. If we're using
 * space-based weights then rely on the entire weight (excluding the weight
 * type bit).
 */
boolean_t
metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
{
	boolean_t should_allocate;

	if (msp->ms_max_size != 0)
		return (msp->ms_max_size >= asize);

	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
		/*
		 * The metaslab segment weight indicates segments in the
		 * range [2^i, 2^(i+1)), where i is the index in the weight.
		 * Since the asize might be in the middle of the range, we
		 * should attempt the allocation if asize < 2^(i+1).
		 */
		should_allocate = (asize <
		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
	} else {
		should_allocate = (asize <=
		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
	}
	return (should_allocate);
}

/*
 * Compute this metaslab's weight, choosing segment-based or space-based
 * weighting depending on feature support and space map format.
 */
static uint64_t
metaslab_weight(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	uint64_t weight;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * If this vdev is in the process of being removed, there is nothing
	 * for us to do here.
	 */
	if (vd->vdev_removing)
		return (0);

	metaslab_set_fragmentation(msp);

	/*
	 * Update the maximum size if the metaslab is loaded. This will
	 * ensure that we get an accurate maximum size if newly freed space
	 * has been added back into the free tree.
	 */
	if (msp->ms_loaded)
		msp->ms_max_size = metaslab_block_maxsize(msp);

	/*
	 * Segment-based weighting requires space map histogram support.
	 */
	if (zfs_metaslab_segment_weight_enabled &&
	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
	    sizeof (space_map_phys_t))) {
		weight = metaslab_segment_weight(msp);
	} else {
		weight = metaslab_space_weight(msp);
	}
	return (weight);
}

/*
 * Mark the metaslab active with the given activation weight, loading it
 * first if necessary. Returns 0 on success or the metaslab_load() error.
 */
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		metaslab_load_wait(msp);
		if (!msp->ms_loaded) {
			int error = metaslab_load(msp);
			if (error) {
				/* Demote so this metaslab isn't retried. */
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
		}

		msp->ms_activation_weight = msp->ms_weight;
		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(msp->ms_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

/*
 * Deactivate the metaslab, re-sorting it within its group at the given
 * (inactive) weight.
 */
static void
metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;

	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again. In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE ||
	    range_tree_is_empty(msp->ms_allocatable));
	ASSERT0(weight & METASLAB_ACTIVE_MASK);

	msp->ms_activation_weight = 0;
	metaslab_group_sort(msp->ms_group, msp, weight);
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}

/*
 * Segment-based metaslabs are activated once and remain active until
 * we either fail an allocation attempt (similar to space-based metaslabs)
 * or have exhausted the free space in zfs_metaslab_switch_threshold
 * buckets since the metaslab was activated.
This function checks to see 2000 * if we've exhaused the zfs_metaslab_switch_threshold buckets in the 2001 * metaslab and passivates it proactively. This will allow us to select a 2002 * metaslabs with larger contiguous region if any remaining within this 2003 * metaslab group. If we're in sync pass > 1, then we continue using this 2004 * metaslab so that we don't dirty more block and cause more sync passes. 2005 */ 2006 void 2007 metaslab_segment_may_passivate(metaslab_t *msp) 2008 { 2009 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2010 2011 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) 2012 return; 2013 2014 /* 2015 * Since we are in the middle of a sync pass, the most accurate 2016 * information that is accessible to us is the in-core range tree 2017 * histogram; calculate the new weight based on that information. 2018 */ 2019 uint64_t weight = metaslab_weight_from_range_tree(msp); 2020 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2021 int current_idx = WEIGHT_GET_INDEX(weight); 2022 2023 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2024 metaslab_passivate(msp, weight); 2025 } 2026 2027 static void 2028 metaslab_preload(void *arg) 2029 { 2030 metaslab_t *msp = arg; 2031 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2032 2033 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2034 2035 mutex_enter(&msp->ms_lock); 2036 metaslab_load_wait(msp); 2037 if (!msp->ms_loaded) 2038 (void) metaslab_load(msp); 2039 msp->ms_selected_txg = spa_syncing_txg(spa); 2040 mutex_exit(&msp->ms_lock); 2041 } 2042 2043 static void 2044 metaslab_group_preload(metaslab_group_t *mg) 2045 { 2046 spa_t *spa = mg->mg_vd->vdev_spa; 2047 metaslab_t *msp; 2048 avl_tree_t *t = &mg->mg_metaslab_tree; 2049 int m = 0; 2050 2051 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2052 taskq_wait(mg->mg_taskq); 2053 return; 2054 } 2055 2056 mutex_enter(&mg->mg_lock); 2057 2058 /* 2059 * Load the next potential metaslabs 2060 */ 2061 
for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2062 ASSERT3P(msp->ms_group, ==, mg); 2063 2064 /* 2065 * We preload only the maximum number of metaslabs specified 2066 * by metaslab_preload_limit. If a metaslab is being forced 2067 * to condense then we preload it too. This will ensure 2068 * that force condensing happens in the next txg. 2069 */ 2070 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2071 continue; 2072 } 2073 2074 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2075 msp, TQ_SLEEP) != NULL); 2076 } 2077 mutex_exit(&mg->mg_lock); 2078 } 2079 2080 /* 2081 * Determine if the space map's on-disk footprint is past our tolerance 2082 * for inefficiency. We would like to use the following criteria to make 2083 * our decision: 2084 * 2085 * 1. The size of the space map object should not dramatically increase as a 2086 * result of writing out the free space range tree. 2087 * 2088 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 2089 * times the size of the free space range tree representation 2090 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2091 * 2092 * 3. The on-disk size of the space map should actually decrease. 2093 * 2094 * Unfortunately, we cannot compute the on-disk size of the space map in this 2095 * context because we cannot accurately compute the effects of compression, etc. 2096 * Instead, we apply the heuristic described in the block comment for 2097 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2098 * is greater than a threshold number of blocks.
2099 */ 2100 static boolean_t 2101 metaslab_should_condense(metaslab_t *msp) 2102 { 2103 space_map_t *sm = msp->ms_sm; 2104 vdev_t *vd = msp->ms_group->mg_vd; 2105 uint64_t vdev_blocksize = 1 << vd->vdev_ashift; 2106 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); 2107 2108 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2109 ASSERT(msp->ms_loaded); 2110 2111 /* 2112 * Allocations and frees in early passes are generally more space 2113 * efficient (in terms of blocks described in space map entries) 2114 * than the ones in later passes (e.g. we don't compress after 2115 * sync pass 5) and condensing a metaslab multiple times in a txg 2116 * could degrade performance. 2117 * 2118 * Thus we prefer condensing each metaslab at most once every txg at 2119 * the earliest sync pass possible. If a metaslab is eligible for 2120 * condensing again after being considered for condensing within the 2121 * same txg, it will hopefully be dirty in the next txg where it will 2122 * be condensed at an earlier pass. 2123 */ 2124 if (msp->ms_condense_checked_txg == current_txg) 2125 return (B_FALSE); 2126 msp->ms_condense_checked_txg = current_txg; 2127 2128 /* 2129 * We always condense metaslabs that are empty and metaslabs for 2130 * which a condense request has been made. 2131 */ 2132 if (avl_is_empty(&msp->ms_allocatable_by_size) || 2133 msp->ms_condense_wanted) 2134 return (B_TRUE); 2135 2136 uint64_t object_size = space_map_length(msp->ms_sm); 2137 uint64_t optimal_size = space_map_estimate_optimal_size(sm, 2138 msp->ms_allocatable, SM_NO_VDEVID); 2139 2140 dmu_object_info_t doi; 2141 dmu_object_info_from_db(sm->sm_dbuf, &doi); 2142 uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 2143 /* * Condense only when the on-disk map has grown to at least * zfs_condense_pct/100 of its optimal size AND spans more than the * block threshold, so that rewriting it will actually save blocks. */ 2144 return (object_size >= (optimal_size * zfs_condense_pct / 100) && 2145 object_size > zfs_metaslab_condense_block_threshold * record_size); 2146 } 2147 2148 /* 2149 * Condense the on-disk space map representation to its minimized form.
2150 * The minimized form consists of a small number of allocations followed by 2151 * the entries of the free range tree. 2152 */ 2153 static void 2154 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 2155 { 2156 range_tree_t *condense_tree; 2157 space_map_t *sm = msp->ms_sm; 2158 2159 ASSERT(MUTEX_HELD(&msp->ms_lock)); 2160 ASSERT(msp->ms_loaded); 2161 2162 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 2163 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 2164 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 2165 msp->ms_group->mg_vd->vdev_spa->spa_name, 2166 space_map_length(msp->ms_sm), 2167 avl_numnodes(&msp->ms_allocatable->rt_root), 2168 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 2169 2170 msp->ms_condense_wanted = B_FALSE; 2171 2172 /* 2173 * Create a range tree that is 100% allocated. We remove segments 2174 * that have been freed in this txg, any deferred frees that exist, 2175 * and any allocation in the future. Removing segments should be 2176 * a relatively inexpensive operation since we expect these trees to 2177 * have a small number of nodes. 2178 */ 2179 condense_tree = range_tree_create(NULL, NULL); 2180 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 2181 2182 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree); 2183 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree); 2184 /* Deferred frees are still allocated on disk, so keep them out. */ 2185 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2186 range_tree_walk(msp->ms_defer[t], 2187 range_tree_remove, condense_tree); 2188 } 2189 /* Allocations from future (concurrent) txgs are not committed yet. */ 2190 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2191 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], 2192 range_tree_remove, condense_tree); 2193 } 2194 2195 /* 2196 * We're about to drop the metaslab's lock thus allowing 2197 * other consumers to change its content. Set the 2198 * metaslab's ms_condensing flag to ensure that 2199 * allocations on this metaslab do not occur while we're 2200 * in the middle of committing it to disk.
This is only critical 2201 * for ms_allocatable as all other range trees use per txg 2202 * views of their content. 2203 */ 2204 msp->ms_condensing = B_TRUE; 2205 2206 mutex_exit(&msp->ms_lock); 2207 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx); 2208 2209 /* 2210 * While we would ideally like to create a space map representation 2211 * that consists only of allocation records, doing so can be 2212 * prohibitively expensive because the in-core free tree can be 2213 * large, and therefore computationally expensive to subtract 2214 * from the condense_tree. Instead we sync out two trees, a cheap 2215 * allocation only tree followed by the in-core free tree. While not 2216 * optimal, this is typically close to optimal, and much cheaper to 2217 * compute. 2218 */ 2219 space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); 2220 range_tree_vacate(condense_tree, NULL, NULL); 2221 range_tree_destroy(condense_tree); 2222 2223 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); 2224 mutex_enter(&msp->ms_lock); 2225 msp->ms_condensing = B_FALSE; 2226 } 2227 2228 /* 2229 * Write a metaslab to disk in the context of the specified transaction group. * Syncs this txg's allocations and frees into the metaslab's space map * (condensing it first when metaslab_should_condense() says it is * worthwhile) and keeps the on-disk space map histograms up to date. 2230 */ 2231 void 2232 metaslab_sync(metaslab_t *msp, uint64_t txg) 2233 { 2234 metaslab_group_t *mg = msp->ms_group; 2235 vdev_t *vd = mg->mg_vd; 2236 spa_t *spa = vd->vdev_spa; 2237 objset_t *mos = spa_meta_objset(spa); 2238 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 2239 dmu_tx_t *tx; 2240 uint64_t object = space_map_object(msp->ms_sm); 2241 2242 ASSERT(!vd->vdev_ishole); 2243 2244 /* 2245 * This metaslab has just been added so there's no work to do now.
2246 */ 2247 if (msp->ms_freeing == NULL) { 2248 ASSERT3P(alloctree, ==, NULL); 2249 return; 2250 } 2251 2252 ASSERT3P(alloctree, !=, NULL); 2253 ASSERT3P(msp->ms_freeing, !=, NULL); 2254 ASSERT3P(msp->ms_freed, !=, NULL); 2255 ASSERT3P(msp->ms_checkpointing, !=, NULL); 2256 2257 /* 2258 * Normally, we don't want to process a metaslab if there are no 2259 * allocations or frees to perform. However, if the metaslab is being 2260 * forced to condense and it's loaded, we need to let it through. 2261 */ 2262 if (range_tree_is_empty(alloctree) && 2263 range_tree_is_empty(msp->ms_freeing) && 2264 range_tree_is_empty(msp->ms_checkpointing) && 2265 !(msp->ms_loaded && msp->ms_condense_wanted)) 2266 return; 2267 2268 2269 VERIFY(txg <= spa_final_dirty_txg(spa)); 2270 2271 /* 2272 * The only state that can actually be changing concurrently with 2273 * metaslab_sync() is the metaslab's ms_allocatable. No other 2274 * thread can be modifying this txg's alloc, freeing, 2275 * freed, or space_map_phys_t. We drop ms_lock whenever we 2276 * could call into the DMU, because the DMU can call down to us 2277 * (e.g. via zio_free()) at any time. 2278 * 2279 * The spa_vdev_remove_thread() can be reading metaslab state 2280 * concurrently, and it is locked out by the ms_sync_lock. Note 2281 * that the ms_lock is insufficient for this, because it is dropped 2282 * by space_map_write().
2283 */ 2284 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2285 /* Lazily create the metaslab's space map object on first sync. */ 2286 if (msp->ms_sm == NULL) { 2287 uint64_t new_object; 2288 2289 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2290 VERIFY3U(new_object, !=, 0); 2291 2292 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2293 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2294 ASSERT(msp->ms_sm != NULL); 2295 } 2296 2297 if (!range_tree_is_empty(msp->ms_checkpointing) && 2298 vd->vdev_checkpoint_sm == NULL) { 2299 ASSERT(spa_has_checkpoint(spa)); 2300 2301 uint64_t new_object = space_map_alloc(mos, 2302 vdev_standard_sm_blksz, tx); 2303 VERIFY3U(new_object, !=, 0); 2304 2305 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2306 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2307 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2308 2309 /* 2310 * We save the space map object as an entry in vdev_top_zap 2311 * so it can be retrieved when the pool is reopened after an 2312 * export or through zdb. 2313 */ 2314 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2315 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2316 sizeof (new_object), 1, &new_object, tx)); 2317 } 2318 /* Lock order: ms_sync_lock before ms_lock (see block comment above). */ 2319 mutex_enter(&msp->ms_sync_lock); 2320 mutex_enter(&msp->ms_lock); 2321 2322 /* 2323 * Note: metaslab_condense() clears the space map's histogram. 2324 * Therefore we must verify and remove this histogram before 2325 * condensing.
2326 */ 2327 metaslab_group_histogram_verify(mg); 2328 metaslab_class_histogram_verify(mg->mg_class); 2329 metaslab_group_histogram_remove(mg, msp); 2330 /* * Either rewrite the entire space map (condense) or append this txg's * allocations and frees to it; ms_lock is dropped around the DMU calls. */ 2331 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2332 metaslab_condense(msp, txg, tx); 2333 } else { 2334 mutex_exit(&msp->ms_lock); 2335 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, 2336 SM_NO_VDEVID, tx); 2337 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, 2338 SM_NO_VDEVID, tx); 2339 mutex_enter(&msp->ms_lock); 2340 } 2341 2342 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2343 ASSERT(spa_has_checkpoint(spa)); 2344 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2345 2346 /* 2347 * Since we are doing writes to disk and the ms_checkpointing 2348 * tree won't be changing during that time, we drop the 2349 * ms_lock while writing to the checkpoint space map. 2350 */ 2351 mutex_exit(&msp->ms_lock); 2352 space_map_write(vd->vdev_checkpoint_sm, 2353 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); 2354 mutex_enter(&msp->ms_lock); 2355 space_map_update(vd->vdev_checkpoint_sm); 2356 2357 spa->spa_checkpoint_info.sci_dspace += 2358 range_tree_space(msp->ms_checkpointing); 2359 vd->vdev_stat.vs_checkpoint_space += 2360 range_tree_space(msp->ms_checkpointing); 2361 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2362 -vd->vdev_checkpoint_sm->sm_alloc); 2363 2364 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2365 } 2366 2367 if (msp->ms_loaded) { 2368 /* 2369 * When the space map is loaded, we have an accurate 2370 * histogram in the range tree. This gives us an opportunity 2371 * to bring the space map's histogram up-to-date so we clear 2372 * it first before updating it. 2373 */ 2374 space_map_histogram_clear(msp->ms_sm); 2375 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2376 2377 /* 2378 * Since we've cleared the histogram we need to add back 2379 * any free space that has already been processed, plus 2380 * any deferred space.
This allows the on-disk histogram 2381 * to accurately reflect all free space even if some space 2382 * is not yet available for allocation (i.e. deferred). 2383 */ 2384 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2385 2386 /* 2387 * Add back any deferred free space that has not been 2388 * added back into the in-core free tree yet. This will 2389 * ensure that we don't end up with a space map histogram 2390 * that is completely empty unless the metaslab is fully 2391 * allocated. 2392 */ 2393 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2394 space_map_histogram_add(msp->ms_sm, 2395 msp->ms_defer[t], tx); 2396 } 2397 } 2398 2399 /* 2400 * Always add the free space from this sync pass to the space 2401 * map histogram. We want to make sure that the on-disk histogram 2402 * accounts for all free space. If the space map is not loaded, 2403 * then we will lose some accuracy but will correct it the next 2404 * time we load the space map. 2405 */ 2406 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2407 2408 metaslab_group_histogram_add(mg, msp); 2409 metaslab_group_histogram_verify(mg); 2410 metaslab_class_histogram_verify(mg->mg_class); 2411 2412 /* 2413 * For sync pass 1, we avoid traversing this txg's free range tree 2414 * and instead will just swap the pointers for freeing and 2415 * freed. We can safely do this since the freed_tree is 2416 * guaranteed to be empty on the initial pass.
2417 */ 2418 if (spa_sync_pass(spa) == 1) { 2419 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2420 } else { 2421 range_tree_vacate(msp->ms_freeing, 2422 range_tree_add, msp->ms_freed); 2423 } 2424 range_tree_vacate(alloctree, NULL, NULL); 2425 2426 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2427 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2428 & TXG_MASK])); 2429 ASSERT0(range_tree_space(msp->ms_freeing)); 2430 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2431 2432 mutex_exit(&msp->ms_lock); 2433 /* * If a new space map object was allocated this txg, persist its object * number in the vdev's metaslab array so it is found on reopen. */ 2434 if (object != space_map_object(msp->ms_sm)) { 2435 object = space_map_object(msp->ms_sm); 2436 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2437 msp->ms_id, sizeof (uint64_t), &object, tx); 2438 } 2439 mutex_exit(&msp->ms_sync_lock); 2440 dmu_tx_commit(tx); 2441 } 2442 2443 /* 2444 * Called after a transaction group has completely synced to mark 2445 * all of the metaslab's free space as usable. 2446 */ 2447 void 2448 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2449 { 2450 metaslab_group_t *mg = msp->ms_group; 2451 vdev_t *vd = mg->mg_vd; 2452 spa_t *spa = vd->vdev_spa; 2453 range_tree_t **defer_tree; 2454 int64_t alloc_delta, defer_delta; 2455 boolean_t defer_allowed = B_TRUE; 2456 2457 ASSERT(!vd->vdev_ishole); 2458 2459 mutex_enter(&msp->ms_lock); 2460 2461 /* 2462 * If this metaslab is just becoming available, initialize its 2463 * range trees and add its capacity to the vdev.
2464 */ 2465 if (msp->ms_freed == NULL) { 2466 for (int t = 0; t < TXG_SIZE; t++) { 2467 ASSERT(msp->ms_allocating[t] == NULL); 2468 2469 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2470 } 2471 2472 ASSERT3P(msp->ms_freeing, ==, NULL); 2473 msp->ms_freeing = range_tree_create(NULL, NULL); 2474 2475 ASSERT3P(msp->ms_freed, ==, NULL); 2476 msp->ms_freed = range_tree_create(NULL, NULL); 2477 2478 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2479 ASSERT(msp->ms_defer[t] == NULL); 2480 2481 msp->ms_defer[t] = range_tree_create(NULL, NULL); 2482 } 2483 2484 ASSERT3P(msp->ms_checkpointing, ==, NULL); 2485 msp->ms_checkpointing = range_tree_create(NULL, NULL); 2486 2487 vdev_space_update(vd, 0, 0, msp->ms_size); 2488 } 2489 ASSERT0(range_tree_space(msp->ms_freeing)); 2490 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2491 2492 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 2493 /* * Don't defer frees when the pool is down to its slop space or this * vdev is being removed: make freed space available immediately instead. */ 2494 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 2495 metaslab_class_get_alloc(spa_normal_class(spa)); 2496 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 2497 defer_allowed = B_FALSE; 2498 } 2499 2500 defer_delta = 0; 2501 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2502 if (defer_allowed) { 2503 defer_delta = range_tree_space(msp->ms_freed) - 2504 range_tree_space(*defer_tree); 2505 } else { 2506 defer_delta -= range_tree_space(*defer_tree); 2507 } 2508 2509 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2510 2511 /* 2512 * If there's a metaslab_load() in progress, wait for it to complete 2513 * so that we have a consistent view of the in-core space map. 2514 */ 2515 metaslab_load_wait(msp); 2516 2517 /* 2518 * Move the frees from the defer_tree back to the free 2519 * range tree (if it's loaded). Swap the freed_tree and 2520 * the defer_tree -- this is safe to do because we've 2521 * just emptied out the defer_tree. 2522 */ 2523 range_tree_vacate(*defer_tree, 2524 msp->ms_loaded ?
range_tree_add : NULL, msp->ms_allocatable); 2525 if (defer_allowed) { 2526 range_tree_swap(&msp->ms_freed, defer_tree); 2527 } else { 2528 range_tree_vacate(msp->ms_freed, 2529 msp->ms_loaded ? range_tree_add : NULL, 2530 msp->ms_allocatable); 2531 } 2532 space_map_update(msp->ms_sm); 2533 2534 msp->ms_deferspace += defer_delta; 2535 ASSERT3S(msp->ms_deferspace, >=, 0); 2536 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2537 if (msp->ms_deferspace != 0) { 2538 /* 2539 * Keep syncing this metaslab until all deferred frees 2540 * are back in circulation. 2541 */ 2542 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2543 } 2544 2545 /* 2546 * Calculate the new weights before unloading any metaslabs. 2547 * This will give us the most accurate weighting. 2548 */ 2549 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2550 2551 /* 2552 * If the metaslab is loaded and we've not tried to load or allocate 2553 * from it in 'metaslab_unload_delay' txgs, then unload it. 2554 */ 2555 if (msp->ms_loaded && 2556 msp->ms_selected_txg + metaslab_unload_delay < txg) { 2557 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2558 VERIFY0(range_tree_space( 2559 msp->ms_allocating[(txg + t) & TXG_MASK])); 2560 } 2561 2562 if (!metaslab_debug_unload) 2563 metaslab_unload(msp); 2564 } 2565 2566 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2567 ASSERT0(range_tree_space(msp->ms_freeing)); 2568 ASSERT0(range_tree_space(msp->ms_freed)); 2569 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2570 2571 mutex_exit(&msp->ms_lock); 2572 } 2573 2574 /* * Re-evaluate a metaslab group after a sync: refresh its allocatability * and fragmentation metrics and kick off metaslab preloads. */ void 2575 metaslab_sync_reassess(metaslab_group_t *mg) 2576 { 2577 spa_t *spa = mg->mg_class->mc_spa; 2578 2579 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2580 metaslab_group_alloc_update(mg); 2581 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2582 2583 /* 2584 * Preload the next potential metaslabs but only on active 2585 * metaslab groups.
We can get into a state where the metaslab 2586 * is no longer active since we dirty metaslabs as we remove 2587 * a device, thus potentially making the metaslab group eligible 2588 * for preloading. 2589 */ 2590 if (mg->mg_activation_count > 0) { 2591 metaslab_group_preload(mg); 2592 } 2593 spa_config_exit(spa, SCL_ALLOC, FTAG); 2594 } 2595 2596 /* * Distance, in bytes, between the given DVA and the start of this * metaslab, used when spreading ditto copies across a vdev; a DVA on a * different vdev is treated as maximally distant (1ULL << 63). */ static uint64_t 2597 metaslab_distance(metaslab_t *msp, dva_t *dva) 2598 { 2599 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2600 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2601 uint64_t start = msp->ms_id; 2602 2603 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2604 return (1ULL << 63); 2605 2606 if (offset < start) 2607 return ((start - offset) << ms_shift); 2608 if (offset > start) 2609 return ((offset - start) << ms_shift); 2610 return (0); 2611 } 2612 2613 /* 2614 * ========================================================================== 2615 * Metaslab allocation tracing facility 2616 * ========================================================================== 2617 */ 2618 kstat_t *metaslab_trace_ksp; 2619 kstat_named_t metaslab_trace_over_limit; 2620 /* * Create the trace-entry kmem cache and the kstat used to count trace * list overflows. Called once at module/SPA init. */ 2621 void 2622 metaslab_alloc_trace_init(void) 2623 { 2624 ASSERT(metaslab_alloc_trace_cache == NULL); 2625 metaslab_alloc_trace_cache = kmem_cache_create( 2626 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 2627 0, NULL, NULL, NULL, NULL, NULL, 0); 2628 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", 2629 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); 2630 if (metaslab_trace_ksp != NULL) { 2631 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; 2632 kstat_named_init(&metaslab_trace_over_limit, 2633 "metaslab_trace_over_limit", KSTAT_DATA_UINT64); 2634 kstat_install(metaslab_trace_ksp); 2635 } 2636 } 2637 /* Tear down the tracing kstat and kmem cache. */ 2638 void 2639 metaslab_alloc_trace_fini(void) 2640 { 2641 if (metaslab_trace_ksp != NULL) { 2642 kstat_delete(metaslab_trace_ksp); 2643 metaslab_trace_ksp = NULL; 2644 } 2645
kmem_cache_destroy(metaslab_alloc_trace_cache); 2646 metaslab_alloc_trace_cache = NULL; 2647 } 2648 2649 /* 2650 * Add an allocation trace element to the allocation tracing list. * 'offset' is either the allocated device offset or a TRACE_* step code * describing why the attempt failed (see the call sites below). 2651 */ 2652 static void 2653 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, 2654 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset) 2655 { 2656 if (!metaslab_trace_enabled) 2657 return; 2658 2659 /* 2660 * When the tracing list reaches its maximum we remove 2661 * the second element in the list before adding a new one. 2662 * By removing the second element we preserve the original 2663 * entry as a clue to what allocations steps have already been 2664 * performed. 2665 */ 2666 if (zal->zal_size == metaslab_trace_max_entries) { 2667 metaslab_alloc_trace_t *mat_next; 2668 #ifdef DEBUG 2669 panic("too many entries in allocation list"); 2670 #endif 2671 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 2672 zal->zal_size--; 2673 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 2674 list_remove(&zal->zal_list, mat_next); 2675 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 2676 } 2677 2678 metaslab_alloc_trace_t *mat = 2679 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 2680 list_link_init(&mat->mat_list_node); 2681 mat->mat_mg = mg; 2682 mat->mat_msp = msp; 2683 mat->mat_size = psize; 2684 mat->mat_dva_id = dva_id; 2685 mat->mat_offset = offset; 2686 mat->mat_weight = 0; 2687 2688 if (msp != NULL) 2689 mat->mat_weight = msp->ms_weight; 2690 2691 /* 2692 * The list is part of the zio so locking is not required. Only 2693 * a single thread will perform allocations for a given zio.
2694 */ 2695 list_insert_tail(&zal->zal_list, mat); 2696 zal->zal_size++; 2697 2698 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 2699 } 2700 /* Initialize an empty per-zio allocation trace list. */ 2701 void 2702 metaslab_trace_init(zio_alloc_list_t *zal) 2703 { 2704 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 2705 offsetof(metaslab_alloc_trace_t, mat_list_node)); 2706 zal->zal_size = 0; 2707 } 2708 /* Free all trace entries and tear down the per-zio trace list. */ 2709 void 2710 metaslab_trace_fini(zio_alloc_list_t *zal) 2711 { 2712 metaslab_alloc_trace_t *mat; 2713 2714 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 2715 kmem_cache_free(metaslab_alloc_trace_cache, mat); 2716 list_destroy(&zal->zal_list); 2717 zal->zal_size = 0; 2718 } 2719 2720 /* 2721 * ========================================================================== 2722 * Metaslab block operations 2723 * ========================================================================== 2724 */ 2725 /* * Charge an async allocation against the group's allocation queue depth * (allocation throttle); no-op for sync or unthrottled allocations. */ 2726 static void 2727 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) 2728 { 2729 if (!(flags & METASLAB_ASYNC_ALLOC) || 2730 flags & METASLAB_DONT_THROTTLE) 2731 return; 2732 2733 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2734 if (!mg->mg_class->mc_alloc_throttle_enabled) 2735 return; 2736 2737 (void) refcount_add(&mg->mg_alloc_queue_depth, tag); 2738 } 2739 /* Release the throttle reference taken by metaslab_group_alloc_increment(). */ 2740 void 2741 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) 2742 { 2743 if (!(flags & METASLAB_ASYNC_ALLOC) || 2744 flags & METASLAB_DONT_THROTTLE) 2745 return; 2746 2747 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2748 if (!mg->mg_class->mc_alloc_throttle_enabled) 2749 return; 2750 2751 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); 2752 } 2753 /* * Debug check: verify that no throttle references for this tag remain on * the metaslab group of any of the block pointer's DVAs. */ 2754 void 2755 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) 2756 { 2757 #ifdef ZFS_DEBUG 2758 const dva_t *dva = bp->blk_dva; 2759 int ndvas = BP_GET_NDVAS(bp); 2760 2761 for (int d = 0; d < ndvas; d++) { 2762 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2763
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2764 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); 2765 } 2766 #endif 2767 } 2768 /* * Carve 'size' bytes out of the metaslab's allocatable range tree using * the class's allocator ops. On success returns the starting offset, * moves the range into this txg's allocating tree and dirties the * metaslab; returns -1ULL when no suitable segment exists. Caller holds * ms_lock and the metaslab must not be condensing. */ 2769 static uint64_t 2770 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 2771 { 2772 uint64_t start; 2773 range_tree_t *rt = msp->ms_allocatable; 2774 metaslab_class_t *mc = msp->ms_group->mg_class; 2775 2776 VERIFY(!msp->ms_condensing); 2777 2778 start = mc->mc_ops->msop_alloc(msp, size); 2779 if (start != -1ULL) { 2780 metaslab_group_t *mg = msp->ms_group; 2781 vdev_t *vd = mg->mg_vd; 2782 2783 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 2784 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2785 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 2786 range_tree_remove(rt, start, size); 2787 2788 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 2789 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2790 2791 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); 2792 2793 /* Track the last successful allocation */ 2794 msp->ms_alloc_txg = txg; 2795 metaslab_verify_space(msp, txg); 2796 } 2797 2798 /* 2799 * Now that we've attempted the allocation we need to update the 2800 * metaslab's maximum block size since it may have changed.
2801 */ 2802 msp->ms_max_size = metaslab_block_maxsize(msp); 2803 return (start); 2804 } 2805 /* * Select a metaslab within the group and attempt to allocate asize bytes * from it, activating and passivating metaslabs as needed. If this vdev * already holds an earlier DVA of the same block (ditto), use secondary * activation and honor min_distance. Returns the allocated offset, or * -1ULL if no metaslab in the group could satisfy the request. */ 2806 static uint64_t 2807 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, 2808 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2809 { 2810 metaslab_t *msp = NULL; 2811 uint64_t offset = -1ULL; 2812 uint64_t activation_weight; 2813 uint64_t target_distance; 2814 int i; 2815 2816 activation_weight = METASLAB_WEIGHT_PRIMARY; 2817 for (i = 0; i < d; i++) { 2818 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2819 activation_weight = METASLAB_WEIGHT_SECONDARY; 2820 break; 2821 } 2822 } 2823 /* Dummy AVL search key: start just above the highest possible weight. */ 2824 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); 2825 search->ms_weight = UINT64_MAX; 2826 search->ms_start = 0; 2827 for (;;) { 2828 boolean_t was_active; 2829 avl_tree_t *t = &mg->mg_metaslab_tree; 2830 avl_index_t idx; 2831 2832 mutex_enter(&mg->mg_lock); 2833 2834 /* 2835 * Find the metaslab with the highest weight that is less 2836 * than what we've already tried. In the common case, this 2837 * means that we will examine each metaslab at most once. 2838 * Note that concurrent callers could reorder metaslabs 2839 * by activation/passivation once we have dropped the mg_lock. 2840 * If a metaslab is activated by another thread, and we fail 2841 * to allocate from the metaslab we have selected, we may 2842 * not try the newly-activated metaslab, and instead activate 2843 * another metaslab. This is not optimal, but generally 2844 * does not cause any problems (a possible exception being 2845 * if every metaslab is completely full except for 2846 * the newly-activated metaslab which we fail to examine).
2847 */ 2848 msp = avl_find(t, search, &idx); 2849 if (msp == NULL) 2850 msp = avl_nearest(t, idx, AVL_AFTER); 2851 for (; msp != NULL; msp = AVL_NEXT(t, msp)) { 2852 2853 if (!metaslab_should_allocate(msp, asize)) { 2854 metaslab_trace_add(zal, mg, msp, asize, d, 2855 TRACE_TOO_SMALL); 2856 continue; 2857 } 2858 2859 /* 2860 * If the selected metaslab is condensing, skip it. 2861 */ 2862 if (msp->ms_condensing) 2863 continue; 2864 2865 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2866 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2867 break; 2868 /* Require extra distance from never-written metaslabs. */ 2869 target_distance = min_distance + 2870 (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2871 min_distance >> 1); 2872 2873 for (i = 0; i < d; i++) { 2874 if (metaslab_distance(msp, &dva[i]) < 2875 target_distance) 2876 break; 2877 } 2878 if (i == d) 2879 break; 2880 } 2881 mutex_exit(&mg->mg_lock); 2882 if (msp == NULL) { 2883 kmem_free(search, sizeof (*search)); 2884 return (-1ULL); 2885 } 2886 search->ms_weight = msp->ms_weight; 2887 search->ms_start = msp->ms_start + 1; 2888 2889 mutex_enter(&msp->ms_lock); 2890 2891 /* 2892 * Ensure that the metaslab we have selected is still 2893 * capable of handling our request. It's possible that 2894 * another thread may have changed the weight while we 2895 * were blocked on the metaslab lock. We check the 2896 * active status first to see if we need to reselect 2897 * a new metaslab.
2898 */ 2899 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { 2900 mutex_exit(&msp->ms_lock); 2901 continue; 2902 } 2903 2904 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2905 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2906 metaslab_passivate(msp, 2907 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2908 mutex_exit(&msp->ms_lock); 2909 continue; 2910 } 2911 2912 if (metaslab_activate(msp, activation_weight) != 0) { 2913 mutex_exit(&msp->ms_lock); 2914 continue; 2915 } 2916 msp->ms_selected_txg = txg; 2917 2918 /* 2919 * Now that we have the lock, recheck to see if we should 2920 * continue to use this metaslab for this allocation. The 2921 * metaslab is now loaded so metaslab_should_allocate() can 2922 * accurately determine if the allocation attempt should 2923 * proceed. 2924 */ 2925 if (!metaslab_should_allocate(msp, asize)) { 2926 /* Passivate this metaslab and select a new one. */ 2927 metaslab_trace_add(zal, mg, msp, asize, d, 2928 TRACE_TOO_SMALL); 2929 goto next; 2930 } 2931 2932 /* 2933 * If this metaslab is currently condensing then pick again as 2934 * we can't manipulate this metaslab until it's committed 2935 * to disk. 2936 */ 2937 if (msp->ms_condensing) { 2938 metaslab_trace_add(zal, mg, msp, asize, d, 2939 TRACE_CONDENSING); 2940 mutex_exit(&msp->ms_lock); 2941 continue; 2942 } 2943 2944 offset = metaslab_block_alloc(msp, asize, txg); 2945 metaslab_trace_add(zal, mg, msp, asize, d, offset); 2946 2947 if (offset != -1ULL) { 2948 /* Proactively passivate the metaslab, if needed */ 2949 metaslab_segment_may_passivate(msp); 2950 break; 2951 } 2952 next: 2953 ASSERT(msp->ms_loaded); 2954 2955 /* 2956 * We were unable to allocate from this metaslab so determine 2957 * a new weight for this metaslab. Now that we have loaded 2958 * the metaslab we can provide a better hint to the metaslab 2959 * selector. 2960 * 2961 * For space-based metaslabs, we use the maximum block size.
This information 2962 * is only available when the metaslab 2963 * is loaded and is more accurate than the generic free 2964 * space weight that was calculated by metaslab_weight(). 2965 * This information allows us to quickly compare the maximum 2966 * available allocation in the metaslab to the allocation 2967 * size being requested. 2968 * 2969 * For segment-based metaslabs, determine the new weight 2970 * based on the highest bucket in the range tree. We 2971 * explicitly use the loaded segment weight (i.e. the range 2972 * tree histogram) since it contains the space that is 2973 * currently available for allocation and is accurate 2974 * even within a sync pass. 2975 */ 2976 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 2977 uint64_t weight = metaslab_block_maxsize(msp); 2978 WEIGHT_SET_SPACEBASED(weight); 2979 metaslab_passivate(msp, weight); 2980 } else { 2981 metaslab_passivate(msp, 2982 metaslab_weight_from_range_tree(msp)); 2983 } 2984 2985 /* 2986 * We have just failed an allocation attempt, check 2987 * that metaslab_should_allocate() agrees. Otherwise, 2988 * we may end up in an infinite loop retrying the same 2989 * metaslab.
2990 */ 2991 ASSERT(!metaslab_should_allocate(msp, asize)); 2992 mutex_exit(&msp->ms_lock); 2993 } 2994 mutex_exit(&msp->ms_lock); 2995 kmem_free(search, sizeof (*search)); 2996 return (offset); 2997 } 2998 /* * Wrapper around metaslab_group_alloc_normal() that records allocation * statistics for the group and marks it as out of space when even the * minimum gang block size cannot be allocated. */ 2999 static uint64_t 3000 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3001 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 3002 { 3003 uint64_t offset; 3004 ASSERT(mg->mg_initialized); 3005 3006 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, 3007 min_distance, dva, d); 3008 3009 mutex_enter(&mg->mg_lock); 3010 if (offset == -1ULL) { 3011 mg->mg_failed_allocations++; 3012 metaslab_trace_add(zal, mg, NULL, asize, d, 3013 TRACE_GROUP_FAILURE); 3014 if (asize == SPA_GANGBLOCKSIZE) { 3015 /* 3016 * This metaslab group was unable to allocate 3017 * the minimum gang block size so it must be out of 3018 * space. We must notify the allocation throttle 3019 * to start skipping allocation attempts to this 3020 * metaslab group until more space becomes available. 3021 * Note: this failure cannot be caused by the 3022 * allocation throttle since the allocation throttle 3023 * is only responsible for skipping devices and 3024 * not failing block allocations. 3025 */ 3026 mg->mg_no_free_space = B_TRUE; 3027 } 3028 } 3029 mg->mg_allocations++; 3030 mutex_exit(&mg->mg_lock); 3031 return (offset); 3032 } 3033 3034 /* 3035 * If we have to write a ditto block (i.e. more than one DVA for a given BP) 3036 * on the same vdev as an existing DVA of this BP, then try to allocate it 3037 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the 3038 * existing DVAs. 3039 */ 3040 int ditto_same_vdev_distance_shift = 3; 3041 3042 /* 3043 * Allocate a block for the specified i/o.
 */
int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
    zio_alloc_list_t *zal)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	boolean_t try_hard = B_FALSE;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
		return (SET_ERROR(ENOSPC));
	}

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists or its mg has been closed (e.g. by
		 * device removal).  Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL && vd->vdev_mg != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		/* Subsequent DVAs: start after the previous DVA's vdev. */
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	do {
		boolean_t allocatable;

		ASSERT(mg->mg_activation_count == 1);
		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (try_hard) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}

		/*
		 * Determine if the selected metaslab group is eligible
		 * for allocations.  If we're ganging then don't allow
		 * this metaslab group to skip allocations since that would
		 * inadvertently return ENOSPC and suspend the pool
		 * even though space is still available.
		 */
		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
			allocatable = metaslab_group_allocatable(mg, rotor,
			    psize);
		}

		if (!allocatable) {
			metaslab_trace_add(zal, mg, NULL, psize, d,
			    TRACE_NOT_ALLOCATABLE);
			goto next;
		}

		ASSERT(mg->mg_initialized);

		/*
		 * Avoid writing single-copy data to a failing,
		 * non-redundant vdev, unless we've already tried all
		 * other vdevs.
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && !try_hard && vd->vdev_children == 0) {
			metaslab_trace_add(zal, mg, NULL, psize, d,
			    TRACE_VDEV_ERROR);
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		/*
		 * If we don't need to try hard, then require that the
		 * block be 1/8th of the device away from any other DVAs
		 * in this BP.  If we are trying hard, allow any offset
		 * to be used (distance=0).
		 */
		uint64_t distance = 0;
		if (!try_hard) {
			distance = vd->vdev_asize >>
			    ditto_same_vdev_distance_shift;
			if (distance <= (1ULL << vd->vdev_ms_shift))
				distance = 0;
		}

		uint64_t asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
		    distance, dva, d);

		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			} else if (!metaslab_bias_enabled) {
				mg->mg_bias = 0;
			}

			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/*
	 * If we haven't tried hard, do so now.
	 */
	if (!try_hard) {
		try_hard = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
	return (SET_ERROR(ENOSPC));
}

/*
 * Free the given range on a concrete vdev by adding it to the metaslab's
 * ms_freeing (or ms_checkpointing) range tree; the actual on-disk free
 * happens when the metaslab syncs.
 */
void
metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
    boolean_t checkpoint)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;

	ASSERT(vdev_is_concrete(vd));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));

	metaslab_check_free_impl(vd, offset, asize);

	mutex_enter(&msp->ms_lock);
	/*
	 * Only dirty the metaslab on the first free of the txg; later
	 * frees find the trees already non-empty.
	 */
	if (range_tree_is_empty(msp->ms_freeing) &&
	    range_tree_is_empty(msp->ms_checkpointing)) {
		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
	}

	if (checkpoint) {
		ASSERT(spa_has_checkpoint(spa));
		range_tree_add(msp->ms_checkpointing, offset, asize);
	} else {
		range_tree_add(msp->ms_freeing, offset, asize);
	}
	mutex_exit(&msp->ms_lock);
}

/*
 * vdev_op_remap() callback: recursively free each concrete segment that an
 * indirect mapping resolves to, marking intermediate indirect spans obsolete.
 */
/* ARGSUSED */
void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	boolean_t *checkpoint = arg;

	ASSERT3P(checkpoint, !=, NULL);

	if (vd->vdev_ops->vdev_op_remap != NULL)
		vdev_indirect_mark_obsolete(vd, offset, size);
	else
		metaslab_free_impl(vd, offset, size, *checkpoint);
}

static void
metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
    boolean_t checkpoint)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(spa_config_held(spa, SCL_ALL,
	    RW_READER), !=, 0);

	/* Frozen pools (zfs_freeze, test-only) never actually free. */
	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
		return;

	if (spa->spa_vdev_removal != NULL &&
	    spa->spa_vdev_removal->svr_vdev == vd &&
	    vdev_is_concrete(vd)) {
		/*
		 * Note: we check if the vdev is concrete because when
		 * we complete the removal, we first change the vdev to be
		 * an indirect vdev (in open context), and then (in syncing
		 * context) clear spa_vdev_removal.
		 */
		free_from_removing_vdev(vd, offset, size);
	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
		/* Indirect vdev: mark obsolete, then free what it maps to. */
		vdev_indirect_mark_obsolete(vd, offset, size);
		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_free_impl_cb, &checkpoint);
	} else {
		metaslab_free_concrete(vd, offset, size, checkpoint);
	}
}

/*
 * State carried through the chain of remap_blkptr_cb() invocations while
 * spa_remap_blkptr() walks an indirect mapping.
 */
typedef struct remap_blkptr_cb_arg {
	blkptr_t *rbca_bp;		/* BP whose dva[0] is being remapped */
	spa_remap_cb_t rbca_cb;		/* optional callback, invoked per hop */
	vdev_t *rbca_remap_vd;		/* vdev of the previous (indirect) hop */
	uint64_t rbca_remap_offset;	/* offset of the previous hop */
	void *rbca_cb_arg;		/* opaque argument passed to rbca_cb */
} remap_blkptr_cb_arg_t;

void
remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	remap_blkptr_cb_arg_t *rbca = arg;
	blkptr_t *bp = rbca->rbca_bp;

	/* We can not remap split blocks. */
	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
		return;
	ASSERT0(inner_offset);

	if (rbca->rbca_cb != NULL) {
		/*
		 * At this point we know that we are not handling split
		 * blocks and we invoke the callback on the previous
		 * vdev which must be indirect.
		 */
		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);

		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);

		/* set up remap_blkptr_cb_arg for the next call */
		rbca->rbca_remap_vd = vd;
		rbca->rbca_remap_offset = offset;
	}

	/*
	 * The phys birth time is that of dva[0].
	 * This ensures that we know
	 * when each dva was written, so that resilver can determine which
	 * blocks need to be scrubbed (i.e. those written during the time
	 * the vdev was offline).  It also ensures that the key used in
	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
	 * we didn't change the phys_birth, a lookup in the ARC for a
	 * remapped BP could find the data that was previously stored at
	 * this vdev + offset.
	 */
	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
	    DVA_GET_VDEV(&bp->blk_dva[0]));
	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
}

/*
 * If the block pointer contains any indirect DVAs, modify them to refer to
 * concrete DVAs.  Note that this will sometimes not be possible, leaving
 * the indirect DVA in place.  This happens if the indirect DVA spans multiple
 * segments in the mapping (i.e. it is a "split block").
 *
 * If the BP was remapped, calls the callback on the original dva (note the
 * callback can be called multiple times if the original indirect DVA refers
 * to another indirect DVA, etc).
 *
 * Returns TRUE if the BP was remapped.
 */
boolean_t
spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
{
	remap_blkptr_cb_arg_t rbca;

	if (!zfs_remap_blkptr_enable)
		return (B_FALSE);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
		return (B_FALSE);

	/*
	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
	 */
	if (BP_GET_DEDUP(bp))
		return (B_FALSE);

	/*
	 * Gang blocks can not be remapped, because
	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
	 * the BP used to read the gang block header (GBH) being the same
	 * as the DVA[0] that we allocated for the GBH.
	 */
	if (BP_IS_GANG(bp))
		return (B_FALSE);

	/*
	 * Embedded BP's have no DVA to remap.
	 */
	if (BP_GET_NDVAS(bp) < 1)
		return (B_FALSE);

	/*
	 * Note: we only remap dva[0].  If we remapped other dvas, we
	 * would no longer know what their phys birth txg is.
	 */
	dva_t *dva = &bp->blk_dva[0];

	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));

	/* Not an indirect vdev: nothing to remap. */
	if (vd->vdev_ops->vdev_op_remap == NULL)
		return (B_FALSE);

	rbca.rbca_bp = bp;
	rbca.rbca_cb = callback;
	rbca.rbca_remap_vd = vd;
	rbca.rbca_remap_offset = offset;
	rbca.rbca_cb_arg = arg;

	/*
	 * remap_blkptr_cb() will be called in order for each level of
	 * indirection, until a concrete vdev is reached or a split block is
	 * encountered.  old_vd and old_offset are updated within the callback
	 * as we go from the one indirect vdev to the next one (either concrete
	 * or indirect again) in that order.
	 */
	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);

	/* Check if the DVA wasn't remapped because it is a split block */
	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * Undo the allocation of a DVA which happened in the given transaction group.
 */
void
metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
{
	metaslab_t *msp;
	vdev_t *vd;
	uint64_t vdev = DVA_GET_VDEV(dva);
	uint64_t offset = DVA_GET_OFFSET(dva);
	uint64_t size = DVA_GET_ASIZE(dva);

	ASSERT(DVA_IS_VALID(dva));
	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

	/* Frozen pools (zfs_freeze, test-only) retain every allocation. */
	if (txg > spa_freeze_txg(spa))
		return;

	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
		    (u_longlong_t)vdev, (u_longlong_t)offset);
		ASSERT(0);
		return;
	}

	ASSERT(!vd->vdev_removing);
	ASSERT(vdev_is_concrete(vd));
	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	/* A gang-header DVA's allocated size is the minimum gang size. */
	if (DVA_GET_GANG(dva))
		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);

	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);
	/*
	 * Remove the range from this txg's pending allocations and return
	 * it directly to the allocatable tree -- the allocation never hit
	 * the on-disk space map.
	 */
	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
	    offset, size);

	VERIFY(!msp->ms_condensing);
	VERIFY3U(offset, >=, msp->ms_start);
	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
	    msp->ms_size);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	range_tree_add(msp->ms_allocatable, offset, size);
	mutex_exit(&msp->ms_lock);
}

/*
 * Free the block represented by the given DVA.
3527 */ 3528 void 3529 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 3530 { 3531 uint64_t vdev = DVA_GET_VDEV(dva); 3532 uint64_t offset = DVA_GET_OFFSET(dva); 3533 uint64_t size = DVA_GET_ASIZE(dva); 3534 vdev_t *vd = vdev_lookup_top(spa, vdev); 3535 3536 ASSERT(DVA_IS_VALID(dva)); 3537 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3538 3539 if (DVA_GET_GANG(dva)) { 3540 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3541 } 3542 3543 metaslab_free_impl(vd, offset, size, checkpoint); 3544 } 3545 3546 /* 3547 * Reserve some allocation slots. The reservation system must be called 3548 * before we call into the allocator. If there aren't any available slots 3549 * then the I/O will be throttled until an I/O completes and its slots are 3550 * freed up. The function returns true if it was successful in placing 3551 * the reservation. 3552 */ 3553 boolean_t 3554 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, 3555 int flags) 3556 { 3557 uint64_t available_slots = 0; 3558 boolean_t slot_reserved = B_FALSE; 3559 3560 ASSERT(mc->mc_alloc_throttle_enabled); 3561 mutex_enter(&mc->mc_lock); 3562 3563 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); 3564 if (reserved_slots < mc->mc_alloc_max_slots) 3565 available_slots = mc->mc_alloc_max_slots - reserved_slots; 3566 3567 if (slots <= available_slots || GANG_ALLOCATION(flags)) { 3568 /* 3569 * We reserve the slots individually so that we can unreserve 3570 * them individually when an I/O completes. 
		 */
		for (int d = 0; d < slots; d++) {
			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
		}
		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
		slot_reserved = B_TRUE;
	}

	mutex_exit(&mc->mc_lock);
	return (slot_reserved);
}

/*
 * Release allocation slots previously taken by
 * metaslab_class_throttle_reserve() for this zio.
 */
void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
{
	ASSERT(mc->mc_alloc_throttle_enabled);
	mutex_enter(&mc->mc_lock);
	for (int d = 0; d < slots; d++) {
		(void) refcount_remove(&mc->mc_alloc_slots, zio);
	}
	mutex_exit(&mc->mc_lock);
}

/*
 * Claim the given range on a concrete vdev: verify it is still free and
 * move it from the allocatable tree into this txg's allocating tree.
 * A txg of 0 indicates a dry run (verify only, no state change).
 */
static int
metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	metaslab_t *msp;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
		return (ENXIO);

	ASSERT3P(vd->vdev_ms, !=, NULL);
	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	mutex_enter(&msp->ms_lock);

	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

	if (error == 0 &&
	    !range_tree_contains(msp->ms_allocatable, offset, size))
		error = SET_ERROR(ENOENT);

	if (error || txg == 0) {	/* txg == 0 indicates dry run */
		mutex_exit(&msp->ms_lock);
		return (error);
	}

	VERIFY(!msp->ms_condensing);
	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
	    msp->ms_size);
	range_tree_remove(msp->ms_allocatable, offset, size);

	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
			vdev_dirty(vd, VDD_METASLAB, msp, txg);
		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
		    offset, size);
	}

	mutex_exit(&msp->ms_lock);

	return
	    (0);
}

/* Accumulates the first error seen while claiming remapped segments. */
typedef struct metaslab_claim_cb_arg_t {
	uint64_t mcca_txg;	/* txg in which the claim occurs */
	int mcca_error;		/* first claim error, 0 if none */
} metaslab_claim_cb_arg_t;

/*
 * vdev_op_remap() callback: claim each concrete segment that an indirect
 * mapping resolves to, stopping at the first error.
 */
/* ARGSUSED */
static void
metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	metaslab_claim_cb_arg_t *mcca_arg = arg;

	if (mcca_arg->mcca_error == 0) {
		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
		    size, mcca_arg->mcca_txg);
	}
}

int
metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
{
	if (vd->vdev_ops->vdev_op_remap != NULL) {
		metaslab_claim_cb_arg_t arg;

		/*
		 * Only zdb(1M) can claim on indirect vdevs.  This is used
		 * to detect leaks of mapped space (that are not accounted
		 * for in the obsolete counts, spacemap, or bpobj).
		 */
		ASSERT(!spa_writeable(vd->vdev_spa));
		arg.mcca_error = 0;
		arg.mcca_txg = txg;

		vd->vdev_ops->vdev_op_remap(vd, offset, size,
		    metaslab_claim_impl_cb, &arg);

		if (arg.mcca_error == 0) {
			arg.mcca_error = metaslab_claim_concrete(vd,
			    offset, size, txg);
		}
		return (arg.mcca_error);
	} else {
		return (metaslab_claim_concrete(vd, offset, size, txg));
	}
}

/*
 * Intent log support: upon opening the pool after a crash, notify the SPA
 * of blocks that the intent log has allocated for immediate write, but
 * which are still considered free by the SPA because the last transaction
 * group didn't commit yet.
3692 */ 3693 static int 3694 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 3695 { 3696 uint64_t vdev = DVA_GET_VDEV(dva); 3697 uint64_t offset = DVA_GET_OFFSET(dva); 3698 uint64_t size = DVA_GET_ASIZE(dva); 3699 vdev_t *vd; 3700 3701 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 3702 return (SET_ERROR(ENXIO)); 3703 } 3704 3705 ASSERT(DVA_IS_VALID(dva)); 3706 3707 if (DVA_GET_GANG(dva)) 3708 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3709 3710 return (metaslab_claim_impl(vd, offset, size, txg)); 3711 } 3712 3713 int 3714 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 3715 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 3716 zio_alloc_list_t *zal, zio_t *zio) 3717 { 3718 dva_t *dva = bp->blk_dva; 3719 dva_t *hintdva = hintbp->blk_dva; 3720 int error = 0; 3721 3722 ASSERT(bp->blk_birth == 0); 3723 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 3724 3725 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3726 3727 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 3728 spa_config_exit(spa, SCL_ALLOC, FTAG); 3729 return (SET_ERROR(ENOSPC)); 3730 } 3731 3732 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 3733 ASSERT(BP_GET_NDVAS(bp) == 0); 3734 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 3735 ASSERT3P(zal, !=, NULL); 3736 3737 for (int d = 0; d < ndvas; d++) { 3738 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 3739 txg, flags, zal); 3740 if (error != 0) { 3741 for (d--; d >= 0; d--) { 3742 metaslab_unalloc_dva(spa, &dva[d], txg); 3743 metaslab_group_alloc_decrement(spa, 3744 DVA_GET_VDEV(&dva[d]), zio, flags); 3745 bzero(&dva[d], sizeof (dva_t)); 3746 } 3747 spa_config_exit(spa, SCL_ALLOC, FTAG); 3748 return (error); 3749 } else { 3750 /* 3751 * Update the metaslab group's queue depth 3752 * based on the newly allocated dva. 
3753 */ 3754 metaslab_group_alloc_increment(spa, 3755 DVA_GET_VDEV(&dva[d]), zio, flags); 3756 } 3757 3758 } 3759 ASSERT(error == 0); 3760 ASSERT(BP_GET_NDVAS(bp) == ndvas); 3761 3762 spa_config_exit(spa, SCL_ALLOC, FTAG); 3763 3764 BP_SET_BIRTH(bp, txg, txg); 3765 3766 return (0); 3767 } 3768 3769 void 3770 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 3771 { 3772 const dva_t *dva = bp->blk_dva; 3773 int ndvas = BP_GET_NDVAS(bp); 3774 3775 ASSERT(!BP_IS_HOLE(bp)); 3776 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 3777 3778 /* 3779 * If we have a checkpoint for the pool we need to make sure that 3780 * the blocks that we free that are part of the checkpoint won't be 3781 * reused until the checkpoint is discarded or we revert to it. 3782 * 3783 * The checkpoint flag is passed down the metaslab_free code path 3784 * and is set whenever we want to add a block to the checkpoint's 3785 * accounting. That is, we "checkpoint" blocks that existed at the 3786 * time the checkpoint was created and are therefore referenced by 3787 * the checkpointed uberblock. 3788 * 3789 * Note that, we don't checkpoint any blocks if the current 3790 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 3791 * normally as they will be referenced by the checkpointed uberblock. 3792 */ 3793 boolean_t checkpoint = B_FALSE; 3794 if (bp->blk_birth <= spa->spa_checkpoint_txg && 3795 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 3796 /* 3797 * At this point, if the block is part of the checkpoint 3798 * there is no way it was created in the current txg. 
3799 */ 3800 ASSERT(!now); 3801 ASSERT3U(spa_syncing_txg(spa), ==, txg); 3802 checkpoint = B_TRUE; 3803 } 3804 3805 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 3806 3807 for (int d = 0; d < ndvas; d++) { 3808 if (now) { 3809 metaslab_unalloc_dva(spa, &dva[d], txg); 3810 } else { 3811 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 3812 metaslab_free_dva(spa, &dva[d], checkpoint); 3813 } 3814 } 3815 3816 spa_config_exit(spa, SCL_FREE, FTAG); 3817 } 3818 3819 int 3820 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 3821 { 3822 const dva_t *dva = bp->blk_dva; 3823 int ndvas = BP_GET_NDVAS(bp); 3824 int error = 0; 3825 3826 ASSERT(!BP_IS_HOLE(bp)); 3827 3828 if (txg != 0) { 3829 /* 3830 * First do a dry run to make sure all DVAs are claimable, 3831 * so we don't have to unwind from partial failures below. 3832 */ 3833 if ((error = metaslab_claim(spa, bp, 0)) != 0) 3834 return (error); 3835 } 3836 3837 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3838 3839 for (int d = 0; d < ndvas; d++) 3840 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 3841 break; 3842 3843 spa_config_exit(spa, SCL_ALLOC, FTAG); 3844 3845 ASSERT(error == 0 || txg == 0); 3846 3847 return (error); 3848 } 3849 3850 /* ARGSUSED */ 3851 static void 3852 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 3853 uint64_t size, void *arg) 3854 { 3855 if (vd->vdev_ops == &vdev_indirect_ops) 3856 return; 3857 3858 metaslab_check_free_impl(vd, offset, size); 3859 } 3860 3861 static void 3862 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 3863 { 3864 metaslab_t *msp; 3865 spa_t *spa = vd->vdev_spa; 3866 3867 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 3868 return; 3869 3870 if (vd->vdev_ops->vdev_op_remap != NULL) { 3871 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3872 metaslab_check_free_impl_cb, NULL); 3873 return; 3874 } 3875 3876 ASSERT(vdev_is_concrete(vd)); 3877 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3878 
ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3879 3880 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3881 3882 mutex_enter(&msp->ms_lock); 3883 if (msp->ms_loaded) 3884 range_tree_verify(msp->ms_allocatable, offset, size); 3885 3886 range_tree_verify(msp->ms_freeing, offset, size); 3887 range_tree_verify(msp->ms_checkpointing, offset, size); 3888 range_tree_verify(msp->ms_freed, offset, size); 3889 for (int j = 0; j < TXG_DEFER_SIZE; j++) 3890 range_tree_verify(msp->ms_defer[j], offset, size); 3891 mutex_exit(&msp->ms_lock); 3892 } 3893 3894 void 3895 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 3896 { 3897 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 3898 return; 3899 3900 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3901 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 3902 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 3903 vdev_t *vd = vdev_lookup_top(spa, vdev); 3904 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 3905 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 3906 3907 if (DVA_GET_GANG(&bp->blk_dva[i])) 3908 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3909 3910 ASSERT3P(vd, !=, NULL); 3911 3912 metaslab_check_free_impl(vd, offset, size); 3913 } 3914 spa_config_exit(spa, SCL_VDEV, FTAG); 3915 } 3916