1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_tx.h> 31 #include <sys/space_map.h> 32 #include <sys/metaslab_impl.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio.h> 35 #include <sys/spa_impl.h> 36 #include <sys/zfeature.h> 37 #include <sys/vdev_indirect_mapping.h> 38 #include <sys/zap.h> 39 40 #define GANG_ALLOCATION(flags) \ 41 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) 42 43 uint64_t metaslab_aliquot = 512ULL << 10; 44 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 45 46 /* 47 * Since we can touch multiple metaslabs (and their respective space maps) 48 * with each transaction group, we benefit from having a smaller space map 49 * block size since it allows us to issue more I/O operations scattered 50 * around the disk. 51 */ 52 int zfs_metaslab_sm_blksz = (1 << 12); 53 54 /* 55 * The in-core space map representation is more compact than its on-disk form. 56 * The zfs_condense_pct determines how much more compact the in-core 57 * space map representation must be before we compact it on-disk. 58 * Values should be greater than or equal to 100. 59 */ 60 int zfs_condense_pct = 200; 61 62 /* 63 * Condensing a metaslab is not guaranteed to actually reduce the amount of 64 * space used on disk. In particular, a space map uses data in increments of 65 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 66 * same number of blocks after condensing. Since the goal of condensing is to 67 * reduce the number of IOPs required to read the space map, we only want to 68 * condense when we can be sure we will reduce the number of blocks used by the 69 * space map. Unfortunately, we cannot precisely compute whether or not this is 70 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 71 * we apply the following heuristic: do not condense a spacemap unless the 72 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 73 * blocks. 74 */ 75 int zfs_metaslab_condense_block_threshold = 4; 76 77 /* 78 * The zfs_mg_noalloc_threshold defines which metaslab groups should 79 * be eligible for allocation. The value is defined as a percentage of 80 * free space. Metaslab groups that have more free space than 81 * zfs_mg_noalloc_threshold are always eligible for allocations. 
 * Once a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
180 */ 181 boolean_t zfs_remap_blkptr_enable = B_TRUE; 182 183 /* 184 * Enable/disable segment-based metaslab selection. 185 */ 186 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; 187 188 /* 189 * When using segment-based metaslab selection, we will continue 190 * allocating from the active metaslab until we have exhausted 191 * zfs_metaslab_switch_threshold of its buckets. 192 */ 193 int zfs_metaslab_switch_threshold = 2; 194 195 /* 196 * Internal switch to enable/disable the metaslab allocation tracing 197 * facility. 198 */ 199 boolean_t metaslab_trace_enabled = B_TRUE; 200 201 /* 202 * Maximum entries that the metaslab allocation tracing facility will keep 203 * in a given list when running in non-debug mode. We limit the number 204 * of entries in non-debug mode to prevent us from using up too much memory. 205 * The limit should be sufficiently large that we don't expect any allocation 206 * to every exceed this value. In debug mode, the system will panic if this 207 * limit is ever reached allowing for further investigation. 208 */ 209 uint64_t metaslab_trace_max_entries = 5000; 210 211 static uint64_t metaslab_weight(metaslab_t *); 212 static void metaslab_set_fragmentation(metaslab_t *); 213 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); 214 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); 215 216 kmem_cache_t *metaslab_alloc_trace_cache; 217 218 /* 219 * ========================================================================== 220 * Metaslab classes 221 * ========================================================================== 222 */ 223 metaslab_class_t * 224 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 225 { 226 metaslab_class_t *mc; 227 228 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 229 230 mc->mc_spa = spa; 231 mc->mc_rotor = NULL; 232 mc->mc_ops = ops; 233 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); 234 refcount_create_tracked(&mc->mc_alloc_slots); 235 236 return (mc); 237 } 238 239 void 240 metaslab_class_destroy(metaslab_class_t *mc) 241 { 242 ASSERT(mc->mc_rotor == NULL); 243 ASSERT(mc->mc_alloc == 0); 244 ASSERT(mc->mc_deferred == 0); 245 ASSERT(mc->mc_space == 0); 246 ASSERT(mc->mc_dspace == 0); 247 248 refcount_destroy(&mc->mc_alloc_slots); 249 mutex_destroy(&mc->mc_lock); 250 kmem_free(mc, sizeof (metaslab_class_t)); 251 } 252 253 int 254 metaslab_class_validate(metaslab_class_t *mc) 255 { 256 metaslab_group_t *mg; 257 vdev_t *vd; 258 259 /* 260 * Must hold one of the spa_config locks. 
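	 * (As an illustration only: callers typically satisfy this by
	 * bracketing the call with spa_config_enter(spa, SCL_ALL, FTAG,
	 * RW_READER) and a matching spa_config_exit().)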
261 */ 262 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 263 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 264 265 if ((mg = mc->mc_rotor) == NULL) 266 return (0); 267 268 do { 269 vd = mg->mg_vd; 270 ASSERT(vd->vdev_mg != NULL); 271 ASSERT3P(vd->vdev_top, ==, vd); 272 ASSERT3P(mg->mg_class, ==, mc); 273 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 274 } while ((mg = mg->mg_next) != mc->mc_rotor); 275 276 return (0); 277 } 278 279 void 280 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 281 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 282 { 283 atomic_add_64(&mc->mc_alloc, alloc_delta); 284 atomic_add_64(&mc->mc_deferred, defer_delta); 285 atomic_add_64(&mc->mc_space, space_delta); 286 atomic_add_64(&mc->mc_dspace, dspace_delta); 287 } 288 289 uint64_t 290 metaslab_class_get_alloc(metaslab_class_t *mc) 291 { 292 return (mc->mc_alloc); 293 } 294 295 uint64_t 296 metaslab_class_get_deferred(metaslab_class_t *mc) 297 { 298 return (mc->mc_deferred); 299 } 300 301 uint64_t 302 metaslab_class_get_space(metaslab_class_t *mc) 303 { 304 return (mc->mc_space); 305 } 306 307 uint64_t 308 metaslab_class_get_dspace(metaslab_class_t *mc) 309 { 310 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 311 } 312 313 void 314 metaslab_class_histogram_verify(metaslab_class_t *mc) 315 { 316 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 317 uint64_t *mc_hist; 318 int i; 319 320 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 321 return; 322 323 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 324 KM_SLEEP); 325 326 for (int c = 0; c < rvd->vdev_children; c++) { 327 vdev_t *tvd = rvd->vdev_child[c]; 328 metaslab_group_t *mg = tvd->vdev_mg; 329 330 /* 331 * Skip any holes, uninitialized top-levels, or 332 * vdevs that are not in this metalab class. 333 */ 334 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 335 mg->mg_class != mc) { 336 continue; 337 } 338 339 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 340 mc_hist[i] += mg->mg_histogram[i]; 341 } 342 343 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 344 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 345 346 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 347 } 348 349 /* 350 * Calculate the metaslab class's fragmentation metric. The metric 351 * is weighted based on the space contribution of each metaslab group. 352 * The return value will be a number between 0 and 100 (inclusive), or 353 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 354 * zfs_frag_table for more information about the metric. 355 */ 356 uint64_t 357 metaslab_class_fragmentation(metaslab_class_t *mc) 358 { 359 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 360 uint64_t fragmentation = 0; 361 362 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 363 364 for (int c = 0; c < rvd->vdev_children; c++) { 365 vdev_t *tvd = rvd->vdev_child[c]; 366 metaslab_group_t *mg = tvd->vdev_mg; 367 368 /* 369 * Skip any holes, uninitialized top-levels, 370 * or vdevs that are not in this metalab class. 371 */ 372 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 373 mg->mg_class != mc) { 374 continue; 375 } 376 377 /* 378 * If a metaslab group does not contain a fragmentation 379 * metric then just bail out. 
380 */ 381 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 382 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 383 return (ZFS_FRAG_INVALID); 384 } 385 386 /* 387 * Determine how much this metaslab_group is contributing 388 * to the overall pool fragmentation metric. 389 */ 390 fragmentation += mg->mg_fragmentation * 391 metaslab_group_get_space(mg); 392 } 393 fragmentation /= metaslab_class_get_space(mc); 394 395 ASSERT3U(fragmentation, <=, 100); 396 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 397 return (fragmentation); 398 } 399 400 /* 401 * Calculate the amount of expandable space that is available in 402 * this metaslab class. If a device is expanded then its expandable 403 * space will be the amount of allocatable space that is currently not 404 * part of this metaslab class. 405 */ 406 uint64_t 407 metaslab_class_expandable_space(metaslab_class_t *mc) 408 { 409 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 410 uint64_t space = 0; 411 412 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 413 for (int c = 0; c < rvd->vdev_children; c++) { 414 uint64_t tspace; 415 vdev_t *tvd = rvd->vdev_child[c]; 416 metaslab_group_t *mg = tvd->vdev_mg; 417 418 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || 419 mg->mg_class != mc) { 420 continue; 421 } 422 423 /* 424 * Calculate if we have enough space to add additional 425 * metaslabs. We report the expandable space in terms 426 * of the metaslab size since that's the unit of expansion. 427 * Adjust by efi system partition size. 428 */ 429 tspace = tvd->vdev_max_asize - tvd->vdev_asize; 430 if (tspace > mc->mc_spa->spa_bootsize) { 431 tspace -= mc->mc_spa->spa_bootsize; 432 } 433 space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift); 434 } 435 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 436 return (space); 437 } 438 439 static int 440 metaslab_compare(const void *x1, const void *x2) 441 { 442 const metaslab_t *m1 = x1; 443 const metaslab_t *m2 = x2; 444 445 if (m1->ms_weight < m2->ms_weight) 446 return (1); 447 if (m1->ms_weight > m2->ms_weight) 448 return (-1); 449 450 /* 451 * If the weights are identical, use the offset to force uniqueness. 452 */ 453 if (m1->ms_start < m2->ms_start) 454 return (-1); 455 if (m1->ms_start > m2->ms_start) 456 return (1); 457 458 ASSERT3P(m1, ==, m2); 459 460 return (0); 461 } 462 463 /* 464 * Verify that the space accounting on disk matches the in-core range_trees. 465 */ 466 void 467 metaslab_verify_space(metaslab_t *msp, uint64_t txg) 468 { 469 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 470 uint64_t allocated = 0; 471 uint64_t sm_free_space, msp_free_space; 472 473 ASSERT(MUTEX_HELD(&msp->ms_lock)); 474 475 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) 476 return; 477 478 /* 479 * We can only verify the metaslab space when we're called 480 * from syncing context with a loaded metaslab that has an allocated 481 * space map. Calling this in non-syncing context does not 482 * provide a consistent view of the metaslab since we're performing 483 * allocations in the future. 484 */ 485 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || 486 !msp->ms_loaded) 487 return; 488 489 sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - 490 space_map_alloc_delta(msp->ms_sm); 491 492 /* 493 * Account for future allocations since we would have already 494 * deducted that space from the ms_freetree. 
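	 * Informally, the VERIFY3U() at the end of this function checks
	 * that:
	 *   sm_free_space == allocatable space + allocations pending in
	 *   the next TXG_CONCURRENT_STATES txgs + deferred frees +
	 *   space freed this txg.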
495 */ 496 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { 497 allocated += 498 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); 499 } 500 501 msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + 502 msp->ms_deferspace + range_tree_space(msp->ms_freed); 503 504 VERIFY3U(sm_free_space, ==, msp_free_space); 505 } 506 507 /* 508 * ========================================================================== 509 * Metaslab groups 510 * ========================================================================== 511 */ 512 /* 513 * Update the allocatable flag and the metaslab group's capacity. 514 * The allocatable flag is set to true if the capacity is below 515 * the zfs_mg_noalloc_threshold or has a fragmentation value that is 516 * greater than zfs_mg_fragmentation_threshold. If a metaslab group 517 * transitions from allocatable to non-allocatable or vice versa then the 518 * metaslab group's class is updated to reflect the transition. 519 */ 520 static void 521 metaslab_group_alloc_update(metaslab_group_t *mg) 522 { 523 vdev_t *vd = mg->mg_vd; 524 metaslab_class_t *mc = mg->mg_class; 525 vdev_stat_t *vs = &vd->vdev_stat; 526 boolean_t was_allocatable; 527 boolean_t was_initialized; 528 529 ASSERT(vd == vd->vdev_top); 530 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, 531 SCL_ALLOC); 532 533 mutex_enter(&mg->mg_lock); 534 was_allocatable = mg->mg_allocatable; 535 was_initialized = mg->mg_initialized; 536 537 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 538 (vs->vs_space + 1); 539 540 mutex_enter(&mc->mc_lock); 541 542 /* 543 * If the metaslab group was just added then it won't 544 * have any space until we finish syncing out this txg. 545 * At that point we will consider it initialized and available 546 * for allocations. We also don't consider non-activated 547 * metaslab groups (e.g. vdevs that are in the middle of being removed) 548 * to be initialized, because they can't be used for allocation. 549 */ 550 mg->mg_initialized = metaslab_group_initialized(mg); 551 if (!was_initialized && mg->mg_initialized) { 552 mc->mc_groups++; 553 } else if (was_initialized && !mg->mg_initialized) { 554 ASSERT3U(mc->mc_groups, >, 0); 555 mc->mc_groups--; 556 } 557 if (mg->mg_initialized) 558 mg->mg_no_free_space = B_FALSE; 559 560 /* 561 * A metaslab group is considered allocatable if it has plenty 562 * of free space or is not heavily fragmented. We only take 563 * fragmentation into account if the metaslab group has a valid 564 * fragmentation metric (i.e. a value between 0 and 100). 565 */ 566 mg->mg_allocatable = (mg->mg_activation_count > 0 && 567 mg->mg_free_capacity > zfs_mg_noalloc_threshold && 568 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 569 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 570 571 /* 572 * The mc_alloc_groups maintains a count of the number of 573 * groups in this metaslab class that are still above the 574 * zfs_mg_noalloc_threshold. This is used by the allocating 575 * threads to determine if they should avoid allocations to 576 * a given group. The allocator will avoid allocations to a group 577 * if that group has reached or is below the zfs_mg_noalloc_threshold 578 * and there are still other groups that are above the threshold. 579 * When a group transitions from allocatable to non-allocatable or 580 * vice versa we update the metaslab class to reflect that change. 
581 * When the mc_alloc_groups value drops to 0 that means that all 582 * groups have reached the zfs_mg_noalloc_threshold making all groups 583 * eligible for allocations. This effectively means that all devices 584 * are balanced again. 585 */ 586 if (was_allocatable && !mg->mg_allocatable) 587 mc->mc_alloc_groups--; 588 else if (!was_allocatable && mg->mg_allocatable) 589 mc->mc_alloc_groups++; 590 mutex_exit(&mc->mc_lock); 591 592 mutex_exit(&mg->mg_lock); 593 } 594 595 metaslab_group_t * 596 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 597 { 598 metaslab_group_t *mg; 599 600 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 601 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 602 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 603 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 604 mg->mg_vd = vd; 605 mg->mg_class = mc; 606 mg->mg_activation_count = 0; 607 mg->mg_initialized = B_FALSE; 608 mg->mg_no_free_space = B_TRUE; 609 refcount_create_tracked(&mg->mg_alloc_queue_depth); 610 611 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 612 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 613 614 return (mg); 615 } 616 617 void 618 metaslab_group_destroy(metaslab_group_t *mg) 619 { 620 ASSERT(mg->mg_prev == NULL); 621 ASSERT(mg->mg_next == NULL); 622 /* 623 * We may have gone below zero with the activation count 624 * either because we never activated in the first place or 625 * because we're done, and possibly removing the vdev. 626 */ 627 ASSERT(mg->mg_activation_count <= 0); 628 629 taskq_destroy(mg->mg_taskq); 630 avl_destroy(&mg->mg_metaslab_tree); 631 mutex_destroy(&mg->mg_lock); 632 refcount_destroy(&mg->mg_alloc_queue_depth); 633 kmem_free(mg, sizeof (metaslab_group_t)); 634 } 635 636 void 637 metaslab_group_activate(metaslab_group_t *mg) 638 { 639 metaslab_class_t *mc = mg->mg_class; 640 metaslab_group_t *mgprev, *mgnext; 641 642 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); 643 644 ASSERT(mc->mc_rotor != mg); 645 ASSERT(mg->mg_prev == NULL); 646 ASSERT(mg->mg_next == NULL); 647 ASSERT(mg->mg_activation_count <= 0); 648 649 if (++mg->mg_activation_count <= 0) 650 return; 651 652 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 653 metaslab_group_alloc_update(mg); 654 655 if ((mgprev = mc->mc_rotor) == NULL) { 656 mg->mg_prev = mg; 657 mg->mg_next = mg; 658 } else { 659 mgnext = mgprev->mg_next; 660 mg->mg_prev = mgprev; 661 mg->mg_next = mgnext; 662 mgprev->mg_next = mg; 663 mgnext->mg_prev = mg; 664 } 665 mc->mc_rotor = mg; 666 } 667 668 /* 669 * Passivate a metaslab group and remove it from the allocation rotor. 670 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating 671 * a metaslab group. This function will momentarily drop spa_config_locks 672 * that are lower than the SCL_ALLOC lock (see comment below). 
673 */ 674 void 675 metaslab_group_passivate(metaslab_group_t *mg) 676 { 677 metaslab_class_t *mc = mg->mg_class; 678 spa_t *spa = mc->mc_spa; 679 metaslab_group_t *mgprev, *mgnext; 680 int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); 681 682 ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, 683 (SCL_ALLOC | SCL_ZIO)); 684 685 if (--mg->mg_activation_count != 0) { 686 ASSERT(mc->mc_rotor != mg); 687 ASSERT(mg->mg_prev == NULL); 688 ASSERT(mg->mg_next == NULL); 689 ASSERT(mg->mg_activation_count < 0); 690 return; 691 } 692 693 /* 694 * The spa_config_lock is an array of rwlocks, ordered as 695 * follows (from highest to lowest): 696 * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > 697 * SCL_ZIO > SCL_FREE > SCL_VDEV 698 * (For more information about the spa_config_lock see spa_misc.c) 699 * The higher the lock, the broader its coverage. When we passivate 700 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO 701 * config locks. However, the metaslab group's taskq might be trying 702 * to preload metaslabs so we must drop the SCL_ZIO lock and any 703 * lower locks to allow the I/O to complete. At a minimum, 704 * we continue to hold the SCL_ALLOC lock, which prevents any future 705 * allocations from taking place and any changes to the vdev tree. 706 */ 707 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); 708 taskq_wait(mg->mg_taskq); 709 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); 710 metaslab_group_alloc_update(mg); 711 712 mgprev = mg->mg_prev; 713 mgnext = mg->mg_next; 714 715 if (mg == mgnext) { 716 mc->mc_rotor = NULL; 717 } else { 718 mc->mc_rotor = mgnext; 719 mgprev->mg_next = mgnext; 720 mgnext->mg_prev = mgprev; 721 } 722 723 mg->mg_prev = NULL; 724 mg->mg_next = NULL; 725 } 726 727 boolean_t 728 metaslab_group_initialized(metaslab_group_t *mg) 729 { 730 vdev_t *vd = mg->mg_vd; 731 vdev_stat_t *vs = &vd->vdev_stat; 732 733 return (vs->vs_space != 0 && mg->mg_activation_count > 0); 734 } 735 736 uint64_t 737 metaslab_group_get_space(metaslab_group_t *mg) 738 { 739 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 740 } 741 742 void 743 metaslab_group_histogram_verify(metaslab_group_t *mg) 744 { 745 uint64_t *mg_hist; 746 vdev_t *vd = mg->mg_vd; 747 uint64_t ashift = vd->vdev_ashift; 748 int i; 749 750 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 751 return; 752 753 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 754 KM_SLEEP); 755 756 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 757 SPACE_MAP_HISTOGRAM_SIZE + ashift); 758 759 for (int m = 0; m < vd->vdev_ms_count; m++) { 760 metaslab_t *msp = vd->vdev_ms[m]; 761 762 if (msp->ms_sm == NULL) 763 continue; 764 765 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 766 mg_hist[i + ashift] += 767 msp->ms_sm->sm_phys->smp_histogram[i]; 768 } 769 770 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 771 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 772 773 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 774 } 775 776 static void 777 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 778 { 779 metaslab_class_t *mc = mg->mg_class; 780 uint64_t ashift = mg->mg_vd->vdev_ashift; 781 782 ASSERT(MUTEX_HELD(&msp->ms_lock)); 783 if (msp->ms_sm == NULL) 784 return; 785 786 mutex_enter(&mg->mg_lock); 787 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 788 mg->mg_histogram[i + ashift] += 789 msp->ms_sm->sm_phys->smp_histogram[i]; 790 mc->mc_histogram[i + ashift] += 791 msp->ms_sm->sm_phys->smp_histogram[i]; 
792 } 793 mutex_exit(&mg->mg_lock); 794 } 795 796 void 797 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 798 { 799 metaslab_class_t *mc = mg->mg_class; 800 uint64_t ashift = mg->mg_vd->vdev_ashift; 801 802 ASSERT(MUTEX_HELD(&msp->ms_lock)); 803 if (msp->ms_sm == NULL) 804 return; 805 806 mutex_enter(&mg->mg_lock); 807 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 808 ASSERT3U(mg->mg_histogram[i + ashift], >=, 809 msp->ms_sm->sm_phys->smp_histogram[i]); 810 ASSERT3U(mc->mc_histogram[i + ashift], >=, 811 msp->ms_sm->sm_phys->smp_histogram[i]); 812 813 mg->mg_histogram[i + ashift] -= 814 msp->ms_sm->sm_phys->smp_histogram[i]; 815 mc->mc_histogram[i + ashift] -= 816 msp->ms_sm->sm_phys->smp_histogram[i]; 817 } 818 mutex_exit(&mg->mg_lock); 819 } 820 821 static void 822 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 823 { 824 ASSERT(msp->ms_group == NULL); 825 mutex_enter(&mg->mg_lock); 826 msp->ms_group = mg; 827 msp->ms_weight = 0; 828 avl_add(&mg->mg_metaslab_tree, msp); 829 mutex_exit(&mg->mg_lock); 830 831 mutex_enter(&msp->ms_lock); 832 metaslab_group_histogram_add(mg, msp); 833 mutex_exit(&msp->ms_lock); 834 } 835 836 static void 837 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 838 { 839 mutex_enter(&msp->ms_lock); 840 metaslab_group_histogram_remove(mg, msp); 841 mutex_exit(&msp->ms_lock); 842 843 mutex_enter(&mg->mg_lock); 844 ASSERT(msp->ms_group == mg); 845 avl_remove(&mg->mg_metaslab_tree, msp); 846 msp->ms_group = NULL; 847 mutex_exit(&mg->mg_lock); 848 } 849 850 static void 851 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 852 { 853 /* 854 * Although in principle the weight can be any value, in 855 * practice we do not use values in the range [1, 511]. 856 */ 857 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 858 ASSERT(MUTEX_HELD(&msp->ms_lock)); 859 860 mutex_enter(&mg->mg_lock); 861 ASSERT(msp->ms_group == mg); 862 avl_remove(&mg->mg_metaslab_tree, msp); 863 msp->ms_weight = weight; 864 avl_add(&mg->mg_metaslab_tree, msp); 865 mutex_exit(&mg->mg_lock); 866 } 867 868 /* 869 * Calculate the fragmentation for a given metaslab group. We can use 870 * a simple average here since all metaslabs within the group must have 871 * the same size. The return value will be a value between 0 and 100 872 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 873 * group have a fragmentation metric. 874 */ 875 uint64_t 876 metaslab_group_fragmentation(metaslab_group_t *mg) 877 { 878 vdev_t *vd = mg->mg_vd; 879 uint64_t fragmentation = 0; 880 uint64_t valid_ms = 0; 881 882 for (int m = 0; m < vd->vdev_ms_count; m++) { 883 metaslab_t *msp = vd->vdev_ms[m]; 884 885 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 886 continue; 887 888 valid_ms++; 889 fragmentation += msp->ms_fragmentation; 890 } 891 892 if (valid_ms <= vd->vdev_ms_count / 2) 893 return (ZFS_FRAG_INVALID); 894 895 fragmentation /= valid_ms; 896 ASSERT3U(fragmentation, <=, 100); 897 return (fragmentation); 898 } 899 900 /* 901 * Determine if a given metaslab group should skip allocations. A metaslab 902 * group should avoid allocations if its free capacity is less than the 903 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 904 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 905 * that can still handle allocations. 
If the allocation throttle is enabled 906 * then we skip allocations to devices that have reached their maximum 907 * allocation queue depth unless the selected metaslab group is the only 908 * eligible group remaining. 909 */ 910 static boolean_t 911 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, 912 uint64_t psize) 913 { 914 spa_t *spa = mg->mg_vd->vdev_spa; 915 metaslab_class_t *mc = mg->mg_class; 916 917 /* 918 * We can only consider skipping this metaslab group if it's 919 * in the normal metaslab class and there are other metaslab 920 * groups to select from. Otherwise, we always consider it eligible 921 * for allocations. 922 */ 923 if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) 924 return (B_TRUE); 925 926 /* 927 * If the metaslab group's mg_allocatable flag is set (see comments 928 * in metaslab_group_alloc_update() for more information) and 929 * the allocation throttle is disabled then allow allocations to this 930 * device. However, if the allocation throttle is enabled then 931 * check if we have reached our allocation limit (mg_alloc_queue_depth) 932 * to determine if we should allow allocations to this metaslab group. 933 * If all metaslab groups are no longer considered allocatable 934 * (mc_alloc_groups == 0) or we're trying to allocate the smallest 935 * gang block size then we allow allocations on this metaslab group 936 * regardless of the mg_allocatable or throttle settings. 937 */ 938 if (mg->mg_allocatable) { 939 metaslab_group_t *mgp; 940 int64_t qdepth; 941 uint64_t qmax = mg->mg_max_alloc_queue_depth; 942 943 if (!mc->mc_alloc_throttle_enabled) 944 return (B_TRUE); 945 946 /* 947 * If this metaslab group does not have any free space, then 948 * there is no point in looking further. 949 */ 950 if (mg->mg_no_free_space) 951 return (B_FALSE); 952 953 qdepth = refcount_count(&mg->mg_alloc_queue_depth); 954 955 /* 956 * If this metaslab group is below its qmax or it's 957 * the only allocatable metasable group, then attempt 958 * to allocate from it. 959 */ 960 if (qdepth < qmax || mc->mc_alloc_groups == 1) 961 return (B_TRUE); 962 ASSERT3U(mc->mc_alloc_groups, >, 1); 963 964 /* 965 * Since this metaslab group is at or over its qmax, we 966 * need to determine if there are metaslab groups after this 967 * one that might be able to handle this allocation. This is 968 * racy since we can't hold the locks for all metaslab 969 * groups at the same time when we make this check. 970 */ 971 for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { 972 qmax = mgp->mg_max_alloc_queue_depth; 973 974 qdepth = refcount_count(&mgp->mg_alloc_queue_depth); 975 976 /* 977 * If there is another metaslab group that 978 * might be able to handle the allocation, then 979 * we return false so that we skip this group. 980 */ 981 if (qdepth < qmax && !mgp->mg_no_free_space) 982 return (B_FALSE); 983 } 984 985 /* 986 * We didn't find another group to handle the allocation 987 * so we can't skip this metaslab group even though 988 * we are at or over our qmax. 989 */ 990 return (B_TRUE); 991 992 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { 993 return (B_TRUE); 994 } 995 return (B_FALSE); 996 } 997 998 /* 999 * ========================================================================== 1000 * Range tree callbacks 1001 * ========================================================================== 1002 */ 1003 1004 /* 1005 * Comparison function for the private size-ordered tree. 
Tree is sorted 1006 * by size, larger sizes at the end of the tree. 1007 */ 1008 static int 1009 metaslab_rangesize_compare(const void *x1, const void *x2) 1010 { 1011 const range_seg_t *r1 = x1; 1012 const range_seg_t *r2 = x2; 1013 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 1014 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 1015 1016 if (rs_size1 < rs_size2) 1017 return (-1); 1018 if (rs_size1 > rs_size2) 1019 return (1); 1020 1021 if (r1->rs_start < r2->rs_start) 1022 return (-1); 1023 1024 if (r1->rs_start > r2->rs_start) 1025 return (1); 1026 1027 return (0); 1028 } 1029 1030 /* 1031 * Create any block allocator specific components. The current allocators 1032 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 1033 */ 1034 static void 1035 metaslab_rt_create(range_tree_t *rt, void *arg) 1036 { 1037 metaslab_t *msp = arg; 1038 1039 ASSERT3P(rt->rt_arg, ==, msp); 1040 ASSERT(msp->ms_allocatable == NULL); 1041 1042 avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, 1043 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 1044 } 1045 1046 /* 1047 * Destroy the block allocator specific components. 1048 */ 1049 static void 1050 metaslab_rt_destroy(range_tree_t *rt, void *arg) 1051 { 1052 metaslab_t *msp = arg; 1053 1054 ASSERT3P(rt->rt_arg, ==, msp); 1055 ASSERT3P(msp->ms_allocatable, ==, rt); 1056 ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size)); 1057 1058 avl_destroy(&msp->ms_allocatable_by_size); 1059 } 1060 1061 static void 1062 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 1063 { 1064 metaslab_t *msp = arg; 1065 1066 ASSERT3P(rt->rt_arg, ==, msp); 1067 ASSERT3P(msp->ms_allocatable, ==, rt); 1068 VERIFY(!msp->ms_condensing); 1069 avl_add(&msp->ms_allocatable_by_size, rs); 1070 } 1071 1072 static void 1073 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 1074 { 1075 metaslab_t *msp = arg; 1076 1077 ASSERT3P(rt->rt_arg, ==, msp); 1078 ASSERT3P(msp->ms_allocatable, ==, rt); 1079 VERIFY(!msp->ms_condensing); 1080 avl_remove(&msp->ms_allocatable_by_size, rs); 1081 } 1082 1083 static void 1084 metaslab_rt_vacate(range_tree_t *rt, void *arg) 1085 { 1086 metaslab_t *msp = arg; 1087 1088 ASSERT3P(rt->rt_arg, ==, msp); 1089 ASSERT3P(msp->ms_allocatable, ==, rt); 1090 1091 /* 1092 * Normally one would walk the tree freeing nodes along the way. 1093 * Since the nodes are shared with the range trees we can avoid 1094 * walking all nodes and just reinitialize the avl tree. The nodes 1095 * will be freed by the range tree, so we don't want to free them here. 1096 */ 1097 avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, 1098 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 1099 } 1100 1101 static range_tree_ops_t metaslab_rt_ops = { 1102 metaslab_rt_create, 1103 metaslab_rt_destroy, 1104 metaslab_rt_add, 1105 metaslab_rt_remove, 1106 metaslab_rt_vacate 1107 }; 1108 1109 /* 1110 * ========================================================================== 1111 * Common allocator routines 1112 * ========================================================================== 1113 */ 1114 1115 /* 1116 * Return the maximum contiguous segment within the metaslab. 
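 * This is simply the size of the last (largest) entry in the size-ordered
 * ms_allocatable_by_size AVL tree, or 0 if the tree is empty or absent.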
1117 */ 1118 uint64_t 1119 metaslab_block_maxsize(metaslab_t *msp) 1120 { 1121 avl_tree_t *t = &msp->ms_allocatable_by_size; 1122 range_seg_t *rs; 1123 1124 if (t == NULL || (rs = avl_last(t)) == NULL) 1125 return (0ULL); 1126 1127 return (rs->rs_end - rs->rs_start); 1128 } 1129 1130 static range_seg_t * 1131 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) 1132 { 1133 range_seg_t *rs, rsearch; 1134 avl_index_t where; 1135 1136 rsearch.rs_start = start; 1137 rsearch.rs_end = start + size; 1138 1139 rs = avl_find(t, &rsearch, &where); 1140 if (rs == NULL) { 1141 rs = avl_nearest(t, where, AVL_AFTER); 1142 } 1143 1144 return (rs); 1145 } 1146 1147 /* 1148 * This is a helper function that can be used by the allocator to find 1149 * a suitable block to allocate. This will search the specified AVL 1150 * tree looking for a block that matches the specified criteria. 1151 */ 1152 static uint64_t 1153 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1154 uint64_t align) 1155 { 1156 range_seg_t *rs = metaslab_block_find(t, *cursor, size); 1157 1158 while (rs != NULL) { 1159 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1160 1161 if (offset + size <= rs->rs_end) { 1162 *cursor = offset + size; 1163 return (offset); 1164 } 1165 rs = AVL_NEXT(t, rs); 1166 } 1167 1168 /* 1169 * If we know we've searched the whole map (*cursor == 0), give up. 1170 * Otherwise, reset the cursor to the beginning and try again. 1171 */ 1172 if (*cursor == 0) 1173 return (-1ULL); 1174 1175 *cursor = 0; 1176 return (metaslab_block_picker(t, cursor, size, align)); 1177 } 1178 1179 /* 1180 * ========================================================================== 1181 * The first-fit block allocator 1182 * ========================================================================== 1183 */ 1184 static uint64_t 1185 metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1186 { 1187 /* 1188 * Find the largest power of 2 block size that evenly divides the 1189 * requested size. This is used to try to allocate blocks with similar 1190 * alignment from the same area of the metaslab (i.e. same cursor 1191 * bucket) but it does not guarantee that other allocations sizes 1192 * may exist in the same region. 1193 */ 1194 uint64_t align = size & -size; 1195 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1196 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1197 1198 return (metaslab_block_picker(t, cursor, size, align)); 1199 } 1200 1201 static metaslab_ops_t metaslab_ff_ops = { 1202 metaslab_ff_alloc 1203 }; 1204 1205 /* 1206 * ========================================================================== 1207 * Dynamic block allocator - 1208 * Uses the first fit allocation scheme until space get low and then 1209 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1210 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1211 * ========================================================================== 1212 */ 1213 static uint64_t 1214 metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1215 { 1216 /* 1217 * Find the largest power of 2 block size that evenly divides the 1218 * requested size. This is used to try to allocate blocks with similar 1219 * alignment from the same area of the metaslab (i.e. same cursor 1220 * bucket) but it does not guarantee that other allocations sizes 1221 * may exist in the same region. 
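	 * As an illustrative sketch (hypothetical request size): a 24K
	 * request (0x6000) yields align = 0x6000 & -0x6000 = 0x2000,
	 * i.e. an 8K-aligned cursor bucket.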
1222 */ 1223 uint64_t align = size & -size; 1224 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1225 range_tree_t *rt = msp->ms_allocatable; 1226 avl_tree_t *t = &rt->rt_root; 1227 uint64_t max_size = metaslab_block_maxsize(msp); 1228 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1229 1230 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1231 ASSERT3U(avl_numnodes(t), ==, 1232 avl_numnodes(&msp->ms_allocatable_by_size)); 1233 1234 if (max_size < size) 1235 return (-1ULL); 1236 1237 /* 1238 * If we're running low on space switch to using the size 1239 * sorted AVL tree (best-fit). 1240 */ 1241 if (max_size < metaslab_df_alloc_threshold || 1242 free_pct < metaslab_df_free_pct) { 1243 t = &msp->ms_allocatable_by_size; 1244 *cursor = 0; 1245 } 1246 1247 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1248 } 1249 1250 static metaslab_ops_t metaslab_df_ops = { 1251 metaslab_df_alloc 1252 }; 1253 1254 /* 1255 * ========================================================================== 1256 * Cursor fit block allocator - 1257 * Select the largest region in the metaslab, set the cursor to the beginning 1258 * of the range and the cursor_end to the end of the range. As allocations 1259 * are made advance the cursor. Continue allocating from the cursor until 1260 * the range is exhausted and then find a new range. 1261 * ========================================================================== 1262 */ 1263 static uint64_t 1264 metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1265 { 1266 range_tree_t *rt = msp->ms_allocatable; 1267 avl_tree_t *t = &msp->ms_allocatable_by_size; 1268 uint64_t *cursor = &msp->ms_lbas[0]; 1269 uint64_t *cursor_end = &msp->ms_lbas[1]; 1270 uint64_t offset = 0; 1271 1272 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1273 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1274 1275 ASSERT3U(*cursor_end, >=, *cursor); 1276 1277 if ((*cursor + size) > *cursor_end) { 1278 range_seg_t *rs; 1279 1280 rs = avl_last(&msp->ms_allocatable_by_size); 1281 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1282 return (-1ULL); 1283 1284 *cursor = rs->rs_start; 1285 *cursor_end = rs->rs_end; 1286 } 1287 1288 offset = *cursor; 1289 *cursor += size; 1290 1291 return (offset); 1292 } 1293 1294 static metaslab_ops_t metaslab_cf_ops = { 1295 metaslab_cf_alloc 1296 }; 1297 1298 /* 1299 * ========================================================================== 1300 * New dynamic fit allocator - 1301 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1302 * contiguous blocks. If no region is found then just use the largest segment 1303 * that remains. 1304 * ========================================================================== 1305 */ 1306 1307 /* 1308 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1309 * to request from the allocator. 
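 * With the default value of 4 below, that is a clump of roughly
 * 2^4 = 16 contiguous blocks.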
1310 */ 1311 uint64_t metaslab_ndf_clump_shift = 4; 1312 1313 static uint64_t 1314 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1315 { 1316 avl_tree_t *t = &msp->ms_allocatable->rt_root; 1317 avl_index_t where; 1318 range_seg_t *rs, rsearch; 1319 uint64_t hbit = highbit64(size); 1320 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1321 uint64_t max_size = metaslab_block_maxsize(msp); 1322 1323 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1324 ASSERT3U(avl_numnodes(t), ==, 1325 avl_numnodes(&msp->ms_allocatable_by_size)); 1326 1327 if (max_size < size) 1328 return (-1ULL); 1329 1330 rsearch.rs_start = *cursor; 1331 rsearch.rs_end = *cursor + size; 1332 1333 rs = avl_find(t, &rsearch, &where); 1334 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1335 t = &msp->ms_allocatable_by_size; 1336 1337 rsearch.rs_start = 0; 1338 rsearch.rs_end = MIN(max_size, 1339 1ULL << (hbit + metaslab_ndf_clump_shift)); 1340 rs = avl_find(t, &rsearch, &where); 1341 if (rs == NULL) 1342 rs = avl_nearest(t, where, AVL_AFTER); 1343 ASSERT(rs != NULL); 1344 } 1345 1346 if ((rs->rs_end - rs->rs_start) >= size) { 1347 *cursor = rs->rs_start + size; 1348 return (rs->rs_start); 1349 } 1350 return (-1ULL); 1351 } 1352 1353 static metaslab_ops_t metaslab_ndf_ops = { 1354 metaslab_ndf_alloc 1355 }; 1356 1357 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1358 1359 /* 1360 * ========================================================================== 1361 * Metaslabs 1362 * ========================================================================== 1363 */ 1364 1365 /* 1366 * Wait for any in-progress metaslab loads to complete. 1367 */ 1368 void 1369 metaslab_load_wait(metaslab_t *msp) 1370 { 1371 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1372 1373 while (msp->ms_loading) { 1374 ASSERT(!msp->ms_loaded); 1375 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1376 } 1377 } 1378 1379 int 1380 metaslab_load(metaslab_t *msp) 1381 { 1382 int error = 0; 1383 boolean_t success = B_FALSE; 1384 1385 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1386 ASSERT(!msp->ms_loaded); 1387 ASSERT(!msp->ms_loading); 1388 1389 msp->ms_loading = B_TRUE; 1390 /* 1391 * Nobody else can manipulate a loading metaslab, so it's now safe 1392 * to drop the lock. This way we don't have to hold the lock while 1393 * reading the spacemap from disk. 1394 */ 1395 mutex_exit(&msp->ms_lock); 1396 1397 /* 1398 * If the space map has not been allocated yet, then treat 1399 * all the space in the metaslab as free and add it to ms_allocatable. 1400 */ 1401 if (msp->ms_sm != NULL) { 1402 error = space_map_load(msp->ms_sm, msp->ms_allocatable, 1403 SM_FREE); 1404 } else { 1405 range_tree_add(msp->ms_allocatable, 1406 msp->ms_start, msp->ms_size); 1407 } 1408 1409 success = (error == 0); 1410 1411 mutex_enter(&msp->ms_lock); 1412 msp->ms_loading = B_FALSE; 1413 1414 if (success) { 1415 ASSERT3P(msp->ms_group, !=, NULL); 1416 msp->ms_loaded = B_TRUE; 1417 1418 /* 1419 * If the metaslab already has a spacemap, then we need to 1420 * remove all segments from the defer tree; otherwise, the 1421 * metaslab is completely empty and we can skip this. 
	 */
		if (msp->ms_sm != NULL) {
			for (int t = 0; t < TXG_DEFER_SIZE; t++) {
				range_tree_walk(msp->ms_defer[t],
				    range_tree_remove, msp->ms_allocatable);
			}
		}
		msp->ms_max_size = metaslab_block_maxsize(msp);
	}
	cv_broadcast(&msp->ms_load_cv);
	return (error);
}

void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
	msp->ms_max_size = 0;
}

int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
    metaslab_t **msp)
{
	vdev_t *vd = mg->mg_vd;
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	metaslab_t *ms;
	int error;

	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
	ms->ms_id = id;
	ms->ms_start = id << vd->vdev_ms_shift;
	ms->ms_size = 1ULL << vd->vdev_ms_shift;

	/*
	 * We only open space map objects that already exist. All others
	 * will be opened when we finally allocate an object for it.
	 */
	if (object != 0) {
		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
		    ms->ms_size, vd->vdev_ashift);

		if (error != 0) {
			kmem_free(ms, sizeof (metaslab_t));
			return (error);
		}

		ASSERT(ms->ms_sm != NULL);
	}

	/*
	 * We create the main range tree here, but we don't create the
	 * other range trees until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
	metaslab_group_add(mg, ms);

	metaslab_set_fragmentation(ms);

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * The metaslab's weight will also be initialized when we sync
	 * out this txg. This ensures that we don't attempt to allocate
	 * from it before we have initialized it completely.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(ms, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
	 * that has an allocated space map object then load its space
	 * map so that we can verify frees.
	 */
	if (metaslab_debug_load && ms->ms_sm != NULL) {
		mutex_enter(&ms->ms_lock);
		VERIFY0(metaslab_load(ms));
		mutex_exit(&ms->ms_lock);
	}

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, ms, txg);
	}

	*msp = ms;

	return (0);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);
	VERIFY(msp->ms_group == NULL);
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	    0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	metaslab_unload(msp);
	range_tree_destroy(msp->ms_allocatable);
	range_tree_destroy(msp->ms_freeing);
	range_tree_destroy(msp->ms_freed);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_allocating[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defer[t]);
	}
	ASSERT0(msp->ms_deferspace);

	range_tree_destroy(msp->ms_checkpointing);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);
	mutex_destroy(&msp->ms_sync_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0	/* 16M	*/
};

/*
 * Calculate the metaslab's fragmentation metric and record it in
 * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
 * has not been upgraded and does not support this metric. Otherwise,
 * the value should be in the range [0, 100].
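 *
 * As a purely illustrative sketch (hypothetical histogram, not real data):
 * a metaslab whose free space were split evenly between 8K segments
 * (table value 90) and 1M segments (table value 20) would score
 * (90 + 20) / 2 = 55.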
1605 */ 1606 static void 1607 metaslab_set_fragmentation(metaslab_t *msp) 1608 { 1609 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1610 uint64_t fragmentation = 0; 1611 uint64_t total = 0; 1612 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1613 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1614 1615 if (!feature_enabled) { 1616 msp->ms_fragmentation = ZFS_FRAG_INVALID; 1617 return; 1618 } 1619 1620 /* 1621 * A null space map means that the entire metaslab is free 1622 * and thus is not fragmented. 1623 */ 1624 if (msp->ms_sm == NULL) { 1625 msp->ms_fragmentation = 0; 1626 return; 1627 } 1628 1629 /* 1630 * If this metaslab's space map has not been upgraded, flag it 1631 * so that we upgrade next time we encounter it. 1632 */ 1633 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1634 uint64_t txg = spa_syncing_txg(spa); 1635 vdev_t *vd = msp->ms_group->mg_vd; 1636 1637 /* 1638 * If we've reached the final dirty txg, then we must 1639 * be shutting down the pool. We don't want to dirty 1640 * any data past this point so skip setting the condense 1641 * flag. We can retry this action the next time the pool 1642 * is imported. 1643 */ 1644 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { 1645 msp->ms_condense_wanted = B_TRUE; 1646 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1647 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1648 "ms_id %llu, vdev_id %llu", txg, msp->ms_id, 1649 vd->vdev_id); 1650 } 1651 msp->ms_fragmentation = ZFS_FRAG_INVALID; 1652 return; 1653 } 1654 1655 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1656 uint64_t space = 0; 1657 uint8_t shift = msp->ms_sm->sm_shift; 1658 1659 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1660 FRAGMENTATION_TABLE_SIZE - 1); 1661 1662 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1663 continue; 1664 1665 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1666 total += space; 1667 1668 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1669 fragmentation += space * zfs_frag_table[idx]; 1670 } 1671 1672 if (total > 0) 1673 fragmentation /= total; 1674 ASSERT3U(fragmentation, <=, 100); 1675 1676 msp->ms_fragmentation = fragmentation; 1677 } 1678 1679 /* 1680 * Compute a weight -- a selection preference value -- for the given metaslab. 1681 * This is based on the amount of free space, the level of fragmentation, 1682 * the LBA range, and whether the metaslab is loaded. 1683 */ 1684 static uint64_t 1685 metaslab_space_weight(metaslab_t *msp) 1686 { 1687 metaslab_group_t *mg = msp->ms_group; 1688 vdev_t *vd = mg->mg_vd; 1689 uint64_t weight, space; 1690 1691 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1692 ASSERT(!vd->vdev_removing); 1693 1694 /* 1695 * The baseline weight is the metaslab's free space. 1696 */ 1697 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1698 1699 if (metaslab_fragmentation_factor_enabled && 1700 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1701 /* 1702 * Use the fragmentation information to inversely scale 1703 * down the baseline weight. We need to ensure that we 1704 * don't exclude this metaslab completely when it's 100% 1705 * fragmented. To avoid this we reduce the fragmented value 1706 * by 1. 1707 */ 1708 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1709 1710 /* 1711 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1712 * this metaslab again. The fragmentation metric may have 1713 * decreased the space to something smaller than 1714 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1715 * so that we can consume any remaining space. 
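		 * (SPA_MINBLOCKSIZE is 512 bytes, i.e.
		 * 1 << SPA_MINBLOCKSHIFT.)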
1716 */ 1717 if (space > 0 && space < SPA_MINBLOCKSIZE) 1718 space = SPA_MINBLOCKSIZE; 1719 } 1720 weight = space; 1721 1722 /* 1723 * Modern disks have uniform bit density and constant angular velocity. 1724 * Therefore, the outer recording zones are faster (higher bandwidth) 1725 * than the inner zones by the ratio of outer to inner track diameter, 1726 * which is typically around 2:1. We account for this by assigning 1727 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1728 * In effect, this means that we'll select the metaslab with the most 1729 * free bandwidth rather than simply the one with the most free space. 1730 */ 1731 if (metaslab_lba_weighting_enabled) { 1732 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1733 ASSERT(weight >= space && weight <= 2 * space); 1734 } 1735 1736 /* 1737 * If this metaslab is one we're actively using, adjust its 1738 * weight to make it preferable to any inactive metaslab so 1739 * we'll polish it off. If the fragmentation on this metaslab 1740 * has exceed our threshold, then don't mark it active. 1741 */ 1742 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1743 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1744 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1745 } 1746 1747 WEIGHT_SET_SPACEBASED(weight); 1748 return (weight); 1749 } 1750 1751 /* 1752 * Return the weight of the specified metaslab, according to the segment-based 1753 * weighting algorithm. The metaslab must be loaded. This function can 1754 * be called within a sync pass since it relies only on the metaslab's 1755 * range tree which is always accurate when the metaslab is loaded. 1756 */ 1757 static uint64_t 1758 metaslab_weight_from_range_tree(metaslab_t *msp) 1759 { 1760 uint64_t weight = 0; 1761 uint32_t segments = 0; 1762 1763 ASSERT(msp->ms_loaded); 1764 1765 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; 1766 i--) { 1767 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; 1768 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1769 1770 segments <<= 1; 1771 segments += msp->ms_allocatable->rt_histogram[i]; 1772 1773 /* 1774 * The range tree provides more precision than the space map 1775 * and must be downgraded so that all values fit within the 1776 * space map's histogram. This allows us to compare loaded 1777 * vs. unloaded metaslabs to determine which metaslab is 1778 * considered "best". 1779 */ 1780 if (i > max_idx) 1781 continue; 1782 1783 if (segments != 0) { 1784 WEIGHT_SET_COUNT(weight, segments); 1785 WEIGHT_SET_INDEX(weight, i); 1786 WEIGHT_SET_ACTIVE(weight, 0); 1787 break; 1788 } 1789 } 1790 return (weight); 1791 } 1792 1793 /* 1794 * Calculate the weight based on the on-disk histogram. This should only 1795 * be called after a sync pass has completely finished since the on-disk 1796 * information is updated in metaslab_sync(). 1797 */ 1798 static uint64_t 1799 metaslab_weight_from_spacemap(metaslab_t *msp) 1800 { 1801 uint64_t weight = 0; 1802 1803 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { 1804 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { 1805 WEIGHT_SET_COUNT(weight, 1806 msp->ms_sm->sm_phys->smp_histogram[i]); 1807 WEIGHT_SET_INDEX(weight, i + 1808 msp->ms_sm->sm_shift); 1809 WEIGHT_SET_ACTIVE(weight, 0); 1810 break; 1811 } 1812 } 1813 return (weight); 1814 } 1815 1816 /* 1817 * Compute a segment-based weight for the specified metaslab. The weight 1818 * is determined by highest bucket in the histogram. 
The information 1819 * for the highest bucket is encoded into the weight value. 1820 */ 1821 static uint64_t 1822 metaslab_segment_weight(metaslab_t *msp) 1823 { 1824 metaslab_group_t *mg = msp->ms_group; 1825 uint64_t weight = 0; 1826 uint8_t shift = mg->mg_vd->vdev_ashift; 1827 1828 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1829 1830 /* 1831 * The metaslab is completely free. 1832 */ 1833 if (space_map_allocated(msp->ms_sm) == 0) { 1834 int idx = highbit64(msp->ms_size) - 1; 1835 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; 1836 1837 if (idx < max_idx) { 1838 WEIGHT_SET_COUNT(weight, 1ULL); 1839 WEIGHT_SET_INDEX(weight, idx); 1840 } else { 1841 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); 1842 WEIGHT_SET_INDEX(weight, max_idx); 1843 } 1844 WEIGHT_SET_ACTIVE(weight, 0); 1845 ASSERT(!WEIGHT_IS_SPACEBASED(weight)); 1846 1847 return (weight); 1848 } 1849 1850 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); 1851 1852 /* 1853 * If the metaslab is fully allocated then just make the weight 0. 1854 */ 1855 if (space_map_allocated(msp->ms_sm) == msp->ms_size) 1856 return (0); 1857 /* 1858 * If the metaslab is already loaded, then use the range tree to 1859 * determine the weight. Otherwise, we rely on the space map information 1860 * to generate the weight. 1861 */ 1862 if (msp->ms_loaded) { 1863 weight = metaslab_weight_from_range_tree(msp); 1864 } else { 1865 weight = metaslab_weight_from_spacemap(msp); 1866 } 1867 1868 /* 1869 * If the metaslab was active the last time we calculated its weight 1870 * then keep it active. We want to consume the entire region that 1871 * is associated with this weight. 1872 */ 1873 if (msp->ms_activation_weight != 0 && weight != 0) 1874 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); 1875 return (weight); 1876 } 1877 1878 /* 1879 * Determine if we should attempt to allocate from this metaslab. If the 1880 * metaslab has a maximum size then we can quickly determine if the desired 1881 * allocation size can be satisfied. Otherwise, if we're using segment-based 1882 * weighting then we can determine the maximum allocation that this metaslab 1883 * can accommodate based on the index encoded in the weight. If we're using 1884 * space-based weights then rely on the entire weight (excluding the weight 1885 * type bit). 1886 */ 1887 boolean_t 1888 metaslab_should_allocate(metaslab_t *msp, uint64_t asize) 1889 { 1890 boolean_t should_allocate; 1891 1892 if (msp->ms_max_size != 0) 1893 return (msp->ms_max_size >= asize); 1894 1895 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 1896 /* 1897 * The metaslab segment weight indicates segments in the 1898 * range [2^i, 2^(i+1)), where i is the index in the weight. 1899 * Since the asize might be in the middle of the range, we 1900 * should attempt the allocation if asize < 2^(i+1). 1901 */ 1902 should_allocate = (asize < 1903 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); 1904 } else { 1905 should_allocate = (asize <= 1906 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); 1907 } 1908 return (should_allocate); 1909 } 1910 1911 static uint64_t 1912 metaslab_weight(metaslab_t *msp) 1913 { 1914 vdev_t *vd = msp->ms_group->mg_vd; 1915 spa_t *spa = vd->vdev_spa; 1916 uint64_t weight; 1917 1918 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1919 1920 /* 1921 * If this vdev is in the process of being removed, there is nothing 1922 * for us to do here. 
1923 */
1924 if (vd->vdev_removing)
1925 return (0);
1926
1927 metaslab_set_fragmentation(msp);
1928
1929 /*
1930 * Update the maximum size if the metaslab is loaded. This will
1931 * ensure that we get an accurate maximum size if newly freed space
1932 * has been added back into the free tree.
1933 */
1934 if (msp->ms_loaded)
1935 msp->ms_max_size = metaslab_block_maxsize(msp);
1936
1937 /*
1938 * Segment-based weighting requires space map histogram support.
1939 */
1940 if (zfs_metaslab_segment_weight_enabled &&
1941 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
1942 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
1943 sizeof (space_map_phys_t))) {
1944 weight = metaslab_segment_weight(msp);
1945 } else {
1946 weight = metaslab_space_weight(msp);
1947 }
1948 return (weight);
1949 }
1950
1951 static int
1952 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1953 {
1954 ASSERT(MUTEX_HELD(&msp->ms_lock));
1955
1956 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1957 metaslab_load_wait(msp);
1958 if (!msp->ms_loaded) {
1959 int error = metaslab_load(msp);
1960 if (error) {
1961 metaslab_group_sort(msp->ms_group, msp, 0);
1962 return (error);
1963 }
1964 }
1965
1966 msp->ms_activation_weight = msp->ms_weight;
1967 metaslab_group_sort(msp->ms_group, msp,
1968 msp->ms_weight | activation_weight);
1969 }
1970 ASSERT(msp->ms_loaded);
1971 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1972
1973 return (0);
1974 }
1975
1976 static void
1977 metaslab_passivate(metaslab_t *msp, uint64_t weight)
1978 {
1979 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
1980
1981 /*
1982 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1983 * this metaslab again. In that case, it had better be empty,
1984 * or we would be leaving space on the table.
1985 */
1986 ASSERT(size >= SPA_MINBLOCKSIZE ||
1987 range_tree_is_empty(msp->ms_allocatable));
1988 ASSERT0(weight & METASLAB_ACTIVE_MASK);
1989
1990 msp->ms_activation_weight = 0;
1991 metaslab_group_sort(msp->ms_group, msp, weight);
1992 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1993 }
1994
1995 /*
1996 * Segment-based metaslabs are activated once and remain active until
1997 * we either fail an allocation attempt (similar to space-based metaslabs)
1998 * or have exhausted the free space in zfs_metaslab_switch_threshold
1999 * buckets since the metaslab was activated. This function checks to see
2000 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
2001 * metaslab and passivates it proactively. This allows us to select a
2002 * metaslab with a larger contiguous region, if any remains within this
2003 * metaslab group. If we're in sync pass > 1, then we continue using this
2004 * metaslab so that we don't dirty more blocks and cause more sync passes.
2005 */
2006 void
2007 metaslab_segment_may_passivate(metaslab_t *msp)
2008 {
2009 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2010
2011 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
2012 return;
2013
2014 /*
2015 * Since we are in the middle of a sync pass, the most accurate
2016 * information that is accessible to us is the in-core range tree
2017 * histogram; calculate the new weight based on that information.
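 *
 * For example, if the metaslab was activated with its largest free
 * segments in the 2^17 bucket and zfs_metaslab_switch_threshold is 2
 * (illustrative value), we passivate it here once the index recomputed
 * from the range tree drops to 15 or below, rather than waiting for an
 * allocation failure.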
2018 */ 2019 uint64_t weight = metaslab_weight_from_range_tree(msp); 2020 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); 2021 int current_idx = WEIGHT_GET_INDEX(weight); 2022 2023 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) 2024 metaslab_passivate(msp, weight); 2025 } 2026 2027 static void 2028 metaslab_preload(void *arg) 2029 { 2030 metaslab_t *msp = arg; 2031 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 2032 2033 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 2034 2035 mutex_enter(&msp->ms_lock); 2036 metaslab_load_wait(msp); 2037 if (!msp->ms_loaded) 2038 (void) metaslab_load(msp); 2039 msp->ms_selected_txg = spa_syncing_txg(spa); 2040 mutex_exit(&msp->ms_lock); 2041 } 2042 2043 static void 2044 metaslab_group_preload(metaslab_group_t *mg) 2045 { 2046 spa_t *spa = mg->mg_vd->vdev_spa; 2047 metaslab_t *msp; 2048 avl_tree_t *t = &mg->mg_metaslab_tree; 2049 int m = 0; 2050 2051 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 2052 taskq_wait(mg->mg_taskq); 2053 return; 2054 } 2055 2056 mutex_enter(&mg->mg_lock); 2057 2058 /* 2059 * Load the next potential metaslabs 2060 */ 2061 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { 2062 ASSERT3P(msp->ms_group, ==, mg); 2063 2064 /* 2065 * We preload only the maximum number of metaslabs specified 2066 * by metaslab_preload_limit. If a metaslab is being forced 2067 * to condense then we preload it too. This will ensure 2068 * that force condensing happens in the next txg. 2069 */ 2070 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 2071 continue; 2072 } 2073 2074 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 2075 msp, TQ_SLEEP) != NULL); 2076 } 2077 mutex_exit(&mg->mg_lock); 2078 } 2079 2080 /* 2081 * Determine if the space map's on-disk footprint is past our tolerance 2082 * for inefficiency. We would like to use the following criteria to make 2083 * our decision: 2084 * 2085 * 1. The size of the space map object should not dramatically increase as a 2086 * result of writing out the free space range tree. 2087 * 2088 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 2089 * times the size than the free space range tree representation 2090 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 2091 * 2092 * 3. The on-disk size of the space map should actually decrease. 2093 * 2094 * Checking the first condition is tricky since we don't want to walk 2095 * the entire AVL tree calculating the estimated on-disk size. Instead we 2096 * use the size-ordered range tree in the metaslab and calculate the 2097 * size required to write out the largest segment in our free tree. If the 2098 * size required to represent that segment on disk is larger than the space 2099 * map object then we avoid condensing this map. 2100 * 2101 * To determine the second criterion we use a best-case estimate and assume 2102 * each segment can be represented on-disk as a single 64-bit entry. We refer 2103 * to this best-case estimate as the space map's minimal form. 2104 * 2105 * Unfortunately, we cannot compute the on-disk size of the space map in this 2106 * context because we cannot accurately compute the effects of compression, etc. 2107 * Instead, we apply the heuristic described in the block comment for 2108 * zfs_metaslab_condense_block_threshold - we only condense if the space used 2109 * is greater than a threshold number of blocks. 
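 *
 * For example, with the default zfs_condense_pct of 200, a space map
 * whose on-disk length is 1MB is a condense candidate only if its
 * minimal form (one 64-bit entry per free segment) would be 512KB or
 * less, and only if that 1MB also exceeds
 * zfs_metaslab_condense_block_threshold space map blocks.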
2110 */
2111 static boolean_t
2112 metaslab_should_condense(metaslab_t *msp)
2113 {
2114 space_map_t *sm = msp->ms_sm;
2115 range_seg_t *rs;
2116 uint64_t size, entries, segsz, object_size, optimal_size, record_size;
2117 dmu_object_info_t doi;
2118 vdev_t *vd = msp->ms_group->mg_vd;
2119 uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
2120 uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
2121
2122 ASSERT(MUTEX_HELD(&msp->ms_lock));
2123 ASSERT(msp->ms_loaded);
2124
2125 /*
2126 * Allocations and frees in early passes are generally more space
2127 * efficient (in terms of blocks described in space map entries)
2128 * than the ones in later passes (e.g. we don't compress after
2129 * sync pass 5) and condensing a metaslab multiple times in a txg
2130 * could degrade performance.
2131 *
2132 * Thus we prefer condensing each metaslab at most once every txg at
2133 * the earliest sync pass possible. If a metaslab is eligible for
2134 * condensing again after being considered for condensing within the
2135 * same txg, it will hopefully be dirty in the next txg where it will
2136 * be condensed at an earlier pass.
2137 */
2138 if (msp->ms_condense_checked_txg == current_txg)
2139 return (B_FALSE);
2140 msp->ms_condense_checked_txg = current_txg;
2141
2142 /*
2143 * Use the ms_allocatable_by_size range tree, which is ordered by
2144 * size, to obtain the largest segment in the free tree. We always
2145 * condense metaslabs that are empty and metaslabs for which a
2146 * condense request has been made.
2147 */
2148 rs = avl_last(&msp->ms_allocatable_by_size);
2149 if (rs == NULL || msp->ms_condense_wanted)
2150 return (B_TRUE);
2151
2152 /*
2153 * Calculate the number of 64-bit entries this segment would
2154 * require when written to disk. If this single segment would be
2155 * larger on-disk than the entire current on-disk structure, then
2156 * clearly condensing will increase the on-disk structure size.
2157 */
2158 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
2159 entries = size / (MIN(size, SM_RUN_MAX));
2160 segsz = entries * sizeof (uint64_t);
2161
2162 optimal_size =
2163 sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
2164 object_size = space_map_length(msp->ms_sm);
2165
2166 dmu_object_info_from_db(sm->sm_dbuf, &doi);
2167 record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
2168
2169 return (segsz <= object_size &&
2170 object_size >= (optimal_size * zfs_condense_pct / 100) &&
2171 object_size > zfs_metaslab_condense_block_threshold * record_size);
2172 }
2173
2174 /*
2175 * Condense the on-disk space map representation to its minimized form.
2176 * The minimized form consists of a small number of allocations followed by
2177 * the entries of the free range tree.
2178 */
2179 static void
2180 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
2181 {
2182 range_tree_t *condense_tree;
2183 space_map_t *sm = msp->ms_sm;
2184
2185 ASSERT(MUTEX_HELD(&msp->ms_lock));
2186 ASSERT(msp->ms_loaded);
2187
2188 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
2189 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
2190 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
2191 msp->ms_group->mg_vd->vdev_spa->spa_name,
2192 space_map_length(msp->ms_sm),
2193 avl_numnodes(&msp->ms_allocatable->rt_root),
2194 msp->ms_condense_wanted ? "TRUE" : "FALSE");
2195
2196 msp->ms_condense_wanted = B_FALSE;
2197
2198 /*
2199 * Create a range tree that is 100% allocated.
We remove segments
2200 * that have been freed in this txg, any deferred frees that exist,
2201 * and any allocations in the future. Removing segments should be
2202 * a relatively inexpensive operation since we expect these trees to
2203 * have a small number of nodes.
2204 */
2205 condense_tree = range_tree_create(NULL, NULL);
2206 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
2207
2208 range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
2209 range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
2210
2211 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2212 range_tree_walk(msp->ms_defer[t],
2213 range_tree_remove, condense_tree);
2214 }
2215
2216 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2217 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
2218 range_tree_remove, condense_tree);
2219 }
2220
2221 /*
2222 * We're about to drop the metaslab's lock, thus allowing
2223 * other consumers to change its content. Set the
2224 * metaslab's ms_condensing flag to ensure that
2225 * allocations on this metaslab do not occur while we're
2226 * in the middle of committing it to disk. This is only critical
2227 * for ms_allocatable as all other range trees use per txg
2228 * views of their content.
2229 */
2230 msp->ms_condensing = B_TRUE;
2231
2232 mutex_exit(&msp->ms_lock);
2233 space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
2234
2235 /*
2236 * While we would ideally like to create a space map representation
2237 * that consists only of allocation records, doing so can be
2238 * prohibitively expensive because the in-core free tree can be
2239 * large, and therefore computationally expensive to subtract
2240 * from the condense_tree. Instead we sync out two trees, a cheap
2241 * allocation only tree followed by the in-core free tree. While not
2242 * optimal, this is typically close to optimal, and much cheaper to
2243 * compute.
2244 */
2245 space_map_write(sm, condense_tree, SM_ALLOC, tx);
2246 range_tree_vacate(condense_tree, NULL, NULL);
2247 range_tree_destroy(condense_tree);
2248
2249 space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
2250 mutex_enter(&msp->ms_lock);
2251 msp->ms_condensing = B_FALSE;
2252 }
2253
2254 /*
2255 * Write a metaslab to disk in the context of the specified transaction group.
2256 */
2257 void
2258 metaslab_sync(metaslab_t *msp, uint64_t txg)
2259 {
2260 metaslab_group_t *mg = msp->ms_group;
2261 vdev_t *vd = mg->mg_vd;
2262 spa_t *spa = vd->vdev_spa;
2263 objset_t *mos = spa_meta_objset(spa);
2264 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
2265 dmu_tx_t *tx;
2266 uint64_t object = space_map_object(msp->ms_sm);
2267
2268 ASSERT(!vd->vdev_ishole);
2269
2270 /*
2271 * This metaslab has just been added so there's no work to do now.
2272 */
2273 if (msp->ms_freeing == NULL) {
2274 ASSERT3P(alloctree, ==, NULL);
2275 return;
2276 }
2277
2278 ASSERT3P(alloctree, !=, NULL);
2279 ASSERT3P(msp->ms_freeing, !=, NULL);
2280 ASSERT3P(msp->ms_freed, !=, NULL);
2281 ASSERT3P(msp->ms_checkpointing, !=, NULL);
2282
2283 /*
2284 * Normally, we don't want to process a metaslab if there are no
2285 * allocations or frees to perform. However, if the metaslab is being
2286 * forced to condense and it's loaded, we need to let it through.
2287 */ 2288 if (range_tree_is_empty(alloctree) && 2289 range_tree_is_empty(msp->ms_freeing) && 2290 range_tree_is_empty(msp->ms_checkpointing) && 2291 !(msp->ms_loaded && msp->ms_condense_wanted)) 2292 return; 2293 2294 2295 VERIFY(txg <= spa_final_dirty_txg(spa)); 2296 2297 /* 2298 * The only state that can actually be changing concurrently with 2299 * metaslab_sync() is the metaslab's ms_allocatable. No other 2300 * thread can be modifying this txg's alloc, freeing, 2301 * freed, or space_map_phys_t. We drop ms_lock whenever we 2302 * could call into the DMU, because the DMU can call down to us 2303 * (e.g. via zio_free()) at any time. 2304 * 2305 * The spa_vdev_remove_thread() can be reading metaslab state 2306 * concurrently, and it is locked out by the ms_sync_lock. Note 2307 * that the ms_lock is insufficient for this, because it is dropped 2308 * by space_map_write(). 2309 */ 2310 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2311 2312 if (msp->ms_sm == NULL) { 2313 uint64_t new_object; 2314 2315 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx); 2316 VERIFY3U(new_object, !=, 0); 2317 2318 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 2319 msp->ms_start, msp->ms_size, vd->vdev_ashift)); 2320 ASSERT(msp->ms_sm != NULL); 2321 } 2322 2323 if (!range_tree_is_empty(msp->ms_checkpointing) && 2324 vd->vdev_checkpoint_sm == NULL) { 2325 ASSERT(spa_has_checkpoint(spa)); 2326 2327 uint64_t new_object = space_map_alloc(mos, 2328 vdev_standard_sm_blksz, tx); 2329 VERIFY3U(new_object, !=, 0); 2330 2331 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, 2332 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); 2333 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2334 2335 /* 2336 * We save the space map object as an entry in vdev_top_zap 2337 * so it can be retrieved when the pool is reopened after an 2338 * export or through zdb. 2339 */ 2340 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, 2341 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 2342 sizeof (new_object), 1, &new_object, tx)); 2343 } 2344 2345 mutex_enter(&msp->ms_sync_lock); 2346 mutex_enter(&msp->ms_lock); 2347 2348 /* 2349 * Note: metaslab_condense() clears the space map's histogram. 2350 * Therefore we must verify and remove this histogram before 2351 * condensing. 2352 */ 2353 metaslab_group_histogram_verify(mg); 2354 metaslab_class_histogram_verify(mg->mg_class); 2355 metaslab_group_histogram_remove(mg, msp); 2356 2357 if (msp->ms_loaded && metaslab_should_condense(msp)) { 2358 metaslab_condense(msp, txg, tx); 2359 } else { 2360 mutex_exit(&msp->ms_lock); 2361 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 2362 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx); 2363 mutex_enter(&msp->ms_lock); 2364 } 2365 2366 if (!range_tree_is_empty(msp->ms_checkpointing)) { 2367 ASSERT(spa_has_checkpoint(spa)); 2368 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2369 2370 /* 2371 * Since we are doing writes to disk and the ms_checkpointing 2372 * tree won't be changing during that time, we drop the 2373 * ms_lock while writing to the checkpoint space map. 
2374 */ 2375 mutex_exit(&msp->ms_lock); 2376 space_map_write(vd->vdev_checkpoint_sm, 2377 msp->ms_checkpointing, SM_FREE, tx); 2378 mutex_enter(&msp->ms_lock); 2379 space_map_update(vd->vdev_checkpoint_sm); 2380 2381 spa->spa_checkpoint_info.sci_dspace += 2382 range_tree_space(msp->ms_checkpointing); 2383 vd->vdev_stat.vs_checkpoint_space += 2384 range_tree_space(msp->ms_checkpointing); 2385 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, 2386 -vd->vdev_checkpoint_sm->sm_alloc); 2387 2388 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); 2389 } 2390 2391 if (msp->ms_loaded) { 2392 /* 2393 * When the space map is loaded, we have an accurate 2394 * histogram in the range tree. This gives us an opportunity 2395 * to bring the space map's histogram up-to-date so we clear 2396 * it first before updating it. 2397 */ 2398 space_map_histogram_clear(msp->ms_sm); 2399 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); 2400 2401 /* 2402 * Since we've cleared the histogram we need to add back 2403 * any free space that has already been processed, plus 2404 * any deferred space. This allows the on-disk histogram 2405 * to accurately reflect all free space even if some space 2406 * is not yet available for allocation (i.e. deferred). 2407 */ 2408 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); 2409 2410 /* 2411 * Add back any deferred free space that has not been 2412 * added back into the in-core free tree yet. This will 2413 * ensure that we don't end up with a space map histogram 2414 * that is completely empty unless the metaslab is fully 2415 * allocated. 2416 */ 2417 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2418 space_map_histogram_add(msp->ms_sm, 2419 msp->ms_defer[t], tx); 2420 } 2421 } 2422 2423 /* 2424 * Always add the free space from this sync pass to the space 2425 * map histogram. We want to make sure that the on-disk histogram 2426 * accounts for all free space. If the space map is not loaded, 2427 * then we will lose some accuracy but will correct it the next 2428 * time we load the space map. 2429 */ 2430 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); 2431 2432 metaslab_group_histogram_add(mg, msp); 2433 metaslab_group_histogram_verify(mg); 2434 metaslab_class_histogram_verify(mg->mg_class); 2435 2436 /* 2437 * For sync pass 1, we avoid traversing this txg's free range tree 2438 * and instead will just swap the pointers for freeing and 2439 * freed. We can safely do this since the freed_tree is 2440 * guaranteed to be empty on the initial pass. 2441 */ 2442 if (spa_sync_pass(spa) == 1) { 2443 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); 2444 } else { 2445 range_tree_vacate(msp->ms_freeing, 2446 range_tree_add, msp->ms_freed); 2447 } 2448 range_tree_vacate(alloctree, NULL, NULL); 2449 2450 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); 2451 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) 2452 & TXG_MASK])); 2453 ASSERT0(range_tree_space(msp->ms_freeing)); 2454 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2455 2456 mutex_exit(&msp->ms_lock); 2457 2458 if (object != space_map_object(msp->ms_sm)) { 2459 object = space_map_object(msp->ms_sm); 2460 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2461 msp->ms_id, sizeof (uint64_t), &object, tx); 2462 } 2463 mutex_exit(&msp->ms_sync_lock); 2464 dmu_tx_commit(tx); 2465 } 2466 2467 /* 2468 * Called after a transaction group has completely synced to mark 2469 * all of the metaslab's free space as usable. 
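 *
 * Note that blocks freed in a given txg do not become allocatable
 * immediately: metaslab_sync_done() moves them from ms_freed into one
 * of the ms_defer trees and only returns them to ms_allocatable
 * TXG_DEFER_SIZE txgs later. Keeping recently freed space out of
 * circulation for a few txgs is what helps make it safe to rewind the
 * pool to a recent txg (the usual motivation for deferred frees).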
2470 */ 2471 void 2472 metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2473 { 2474 metaslab_group_t *mg = msp->ms_group; 2475 vdev_t *vd = mg->mg_vd; 2476 spa_t *spa = vd->vdev_spa; 2477 range_tree_t **defer_tree; 2478 int64_t alloc_delta, defer_delta; 2479 boolean_t defer_allowed = B_TRUE; 2480 2481 ASSERT(!vd->vdev_ishole); 2482 2483 mutex_enter(&msp->ms_lock); 2484 2485 /* 2486 * If this metaslab is just becoming available, initialize its 2487 * range trees and add its capacity to the vdev. 2488 */ 2489 if (msp->ms_freed == NULL) { 2490 for (int t = 0; t < TXG_SIZE; t++) { 2491 ASSERT(msp->ms_allocating[t] == NULL); 2492 2493 msp->ms_allocating[t] = range_tree_create(NULL, NULL); 2494 } 2495 2496 ASSERT3P(msp->ms_freeing, ==, NULL); 2497 msp->ms_freeing = range_tree_create(NULL, NULL); 2498 2499 ASSERT3P(msp->ms_freed, ==, NULL); 2500 msp->ms_freed = range_tree_create(NULL, NULL); 2501 2502 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2503 ASSERT(msp->ms_defer[t] == NULL); 2504 2505 msp->ms_defer[t] = range_tree_create(NULL, NULL); 2506 } 2507 2508 ASSERT3P(msp->ms_checkpointing, ==, NULL); 2509 msp->ms_checkpointing = range_tree_create(NULL, NULL); 2510 2511 vdev_space_update(vd, 0, 0, msp->ms_size); 2512 } 2513 ASSERT0(range_tree_space(msp->ms_freeing)); 2514 ASSERT0(range_tree_space(msp->ms_checkpointing)); 2515 2516 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; 2517 2518 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - 2519 metaslab_class_get_alloc(spa_normal_class(spa)); 2520 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { 2521 defer_allowed = B_FALSE; 2522 } 2523 2524 defer_delta = 0; 2525 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2526 if (defer_allowed) { 2527 defer_delta = range_tree_space(msp->ms_freed) - 2528 range_tree_space(*defer_tree); 2529 } else { 2530 defer_delta -= range_tree_space(*defer_tree); 2531 } 2532 2533 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2534 2535 /* 2536 * If there's a metaslab_load() in progress, wait for it to complete 2537 * so that we have a consistent view of the in-core space map. 2538 */ 2539 metaslab_load_wait(msp); 2540 2541 /* 2542 * Move the frees from the defer_tree back to the free 2543 * range tree (if it's loaded). Swap the freed_tree and 2544 * the defer_tree -- this is safe to do because we've 2545 * just emptied out the defer_tree. 2546 */ 2547 range_tree_vacate(*defer_tree, 2548 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); 2549 if (defer_allowed) { 2550 range_tree_swap(&msp->ms_freed, defer_tree); 2551 } else { 2552 range_tree_vacate(msp->ms_freed, 2553 msp->ms_loaded ? range_tree_add : NULL, 2554 msp->ms_allocatable); 2555 } 2556 space_map_update(msp->ms_sm); 2557 2558 msp->ms_deferspace += defer_delta; 2559 ASSERT3S(msp->ms_deferspace, >=, 0); 2560 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2561 if (msp->ms_deferspace != 0) { 2562 /* 2563 * Keep syncing this metaslab until all deferred frees 2564 * are back in circulation. 2565 */ 2566 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2567 } 2568 2569 /* 2570 * Calculate the new weights before unloading any metaslabs. 2571 * This will give us the most accurate weighting. 2572 */ 2573 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2574 2575 /* 2576 * If the metaslab is loaded and we've not tried to load or allocate 2577 * from it in 'metaslab_unload_delay' txgs, then unload it. 
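 *
 * For example, with the default metaslab_unload_delay of TXG_SIZE * 2
 * (8 txgs, assuming TXG_SIZE is 4), a metaslab last selected in txg 100
 * becomes eligible for unloading in txg 109 and later, unless
 * metaslab_debug_unload keeps it resident.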
2578 */
2579 if (msp->ms_loaded &&
2580 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2581 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2582 VERIFY0(range_tree_space(
2583 msp->ms_allocating[(txg + t) & TXG_MASK]));
2584 }
2585
2586 if (!metaslab_debug_unload)
2587 metaslab_unload(msp);
2588 }
2589
2590 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2591 ASSERT0(range_tree_space(msp->ms_freeing));
2592 ASSERT0(range_tree_space(msp->ms_freed));
2593 ASSERT0(range_tree_space(msp->ms_checkpointing));
2594
2595 mutex_exit(&msp->ms_lock);
2596 }
2597
2598 void
2599 metaslab_sync_reassess(metaslab_group_t *mg)
2600 {
2601 spa_t *spa = mg->mg_class->mc_spa;
2602
2603 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2604 metaslab_group_alloc_update(mg);
2605 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2606
2607 /*
2608 * Preload the next potential metaslabs but only on active
2609 * metaslab groups. We can get into a state where the metaslab
2610 * is no longer active since we dirty metaslabs as we remove
2611 * a device, thus potentially making the metaslab group eligible
2612 * for preloading.
2613 */
2614 if (mg->mg_activation_count > 0) {
2615 metaslab_group_preload(mg);
2616 }
2617 spa_config_exit(spa, SCL_ALLOC, FTAG);
2618 }
2619
2620 static uint64_t
2621 metaslab_distance(metaslab_t *msp, dva_t *dva)
2622 {
2623 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2624 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2625 uint64_t start = msp->ms_id;
2626
2627 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2628 return (1ULL << 63);
2629
2630 if (offset < start)
2631 return ((start - offset) << ms_shift);
2632 if (offset > start)
2633 return ((offset - start) << ms_shift);
2634 return (0);
2635 }
2636
2637 /*
2638 * ==========================================================================
2639 * Metaslab allocation tracing facility
2640 * ==========================================================================
2641 */
2642 kstat_t *metaslab_trace_ksp;
2643 kstat_named_t metaslab_trace_over_limit;
2644
2645 void
2646 metaslab_alloc_trace_init(void)
2647 {
2648 ASSERT(metaslab_alloc_trace_cache == NULL);
2649 metaslab_alloc_trace_cache = kmem_cache_create(
2650 "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
2651 0, NULL, NULL, NULL, NULL, NULL, 0);
2652 metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
2653 "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
2654 if (metaslab_trace_ksp != NULL) {
2655 metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
2656 kstat_named_init(&metaslab_trace_over_limit,
2657 "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
2658 kstat_install(metaslab_trace_ksp);
2659 }
2660 }
2661
2662 void
2663 metaslab_alloc_trace_fini(void)
2664 {
2665 if (metaslab_trace_ksp != NULL) {
2666 kstat_delete(metaslab_trace_ksp);
2667 metaslab_trace_ksp = NULL;
2668 }
2669 kmem_cache_destroy(metaslab_alloc_trace_cache);
2670 metaslab_alloc_trace_cache = NULL;
2671 }
2672
2673 /*
2674 * Add an allocation trace element to the allocation tracing list.
2675 */
2676 static void
2677 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
2678 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
2679 {
2680 if (!metaslab_trace_enabled)
2681 return;
2682
2683 /*
2684 * When the tracing list reaches its maximum we remove
2685 * the second element in the list before adding a new one.
2686 * By removing the second element we preserve the original 2687 * entry as a clue to what allocations steps have already been 2688 * performed. 2689 */ 2690 if (zal->zal_size == metaslab_trace_max_entries) { 2691 metaslab_alloc_trace_t *mat_next; 2692 #ifdef DEBUG 2693 panic("too many entries in allocation list"); 2694 #endif 2695 atomic_inc_64(&metaslab_trace_over_limit.value.ui64); 2696 zal->zal_size--; 2697 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); 2698 list_remove(&zal->zal_list, mat_next); 2699 kmem_cache_free(metaslab_alloc_trace_cache, mat_next); 2700 } 2701 2702 metaslab_alloc_trace_t *mat = 2703 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); 2704 list_link_init(&mat->mat_list_node); 2705 mat->mat_mg = mg; 2706 mat->mat_msp = msp; 2707 mat->mat_size = psize; 2708 mat->mat_dva_id = dva_id; 2709 mat->mat_offset = offset; 2710 mat->mat_weight = 0; 2711 2712 if (msp != NULL) 2713 mat->mat_weight = msp->ms_weight; 2714 2715 /* 2716 * The list is part of the zio so locking is not required. Only 2717 * a single thread will perform allocations for a given zio. 2718 */ 2719 list_insert_tail(&zal->zal_list, mat); 2720 zal->zal_size++; 2721 2722 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); 2723 } 2724 2725 void 2726 metaslab_trace_init(zio_alloc_list_t *zal) 2727 { 2728 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), 2729 offsetof(metaslab_alloc_trace_t, mat_list_node)); 2730 zal->zal_size = 0; 2731 } 2732 2733 void 2734 metaslab_trace_fini(zio_alloc_list_t *zal) 2735 { 2736 metaslab_alloc_trace_t *mat; 2737 2738 while ((mat = list_remove_head(&zal->zal_list)) != NULL) 2739 kmem_cache_free(metaslab_alloc_trace_cache, mat); 2740 list_destroy(&zal->zal_list); 2741 zal->zal_size = 0; 2742 } 2743 2744 /* 2745 * ========================================================================== 2746 * Metaslab block operations 2747 * ========================================================================== 2748 */ 2749 2750 static void 2751 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) 2752 { 2753 if (!(flags & METASLAB_ASYNC_ALLOC) || 2754 flags & METASLAB_DONT_THROTTLE) 2755 return; 2756 2757 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2758 if (!mg->mg_class->mc_alloc_throttle_enabled) 2759 return; 2760 2761 (void) refcount_add(&mg->mg_alloc_queue_depth, tag); 2762 } 2763 2764 void 2765 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) 2766 { 2767 if (!(flags & METASLAB_ASYNC_ALLOC) || 2768 flags & METASLAB_DONT_THROTTLE) 2769 return; 2770 2771 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2772 if (!mg->mg_class->mc_alloc_throttle_enabled) 2773 return; 2774 2775 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); 2776 } 2777 2778 void 2779 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) 2780 { 2781 #ifdef ZFS_DEBUG 2782 const dva_t *dva = bp->blk_dva; 2783 int ndvas = BP_GET_NDVAS(bp); 2784 2785 for (int d = 0; d < ndvas; d++) { 2786 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2787 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2788 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); 2789 } 2790 #endif 2791 } 2792 2793 static uint64_t 2794 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) 2795 { 2796 uint64_t start; 2797 range_tree_t *rt = msp->ms_allocatable; 2798 metaslab_class_t *mc = msp->ms_group->mg_class; 2799 2800 VERIFY(!msp->ms_condensing); 2801 2802 start = 
mc->mc_ops->msop_alloc(msp, size);
2803 if (start != -1ULL) {
2804 metaslab_group_t *mg = msp->ms_group;
2805 vdev_t *vd = mg->mg_vd;
2806
2807 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
2808 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2809 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
2810 range_tree_remove(rt, start, size);
2811
2812 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
2813 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
2814
2815 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
2816
2817 /* Track the last successful allocation */
2818 msp->ms_alloc_txg = txg;
2819 metaslab_verify_space(msp, txg);
2820 }
2821
2822 /*
2823 * Now that we've attempted the allocation we need to update the
2824 * metaslab's maximum block size since it may have changed.
2825 */
2826 msp->ms_max_size = metaslab_block_maxsize(msp);
2827 return (start);
2828 }
2829
2830 static uint64_t
2831 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
2832 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
2833 {
2834 metaslab_t *msp = NULL;
2835 uint64_t offset = -1ULL;
2836 uint64_t activation_weight;
2837 uint64_t target_distance;
2838 int i;
2839
2840 activation_weight = METASLAB_WEIGHT_PRIMARY;
2841 for (i = 0; i < d; i++) {
2842 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
2843 activation_weight = METASLAB_WEIGHT_SECONDARY;
2844 break;
2845 }
2846 }
2847
2848 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
2849 search->ms_weight = UINT64_MAX;
2850 search->ms_start = 0;
2851 for (;;) {
2852 boolean_t was_active;
2853 avl_tree_t *t = &mg->mg_metaslab_tree;
2854 avl_index_t idx;
2855
2856 mutex_enter(&mg->mg_lock);
2857
2858 /*
2859 * Find the metaslab with the highest weight that is less
2860 * than what we've already tried. In the common case, this
2861 * means that we will examine each metaslab at most once.
2862 * Note that concurrent callers could reorder metaslabs
2863 * by activation/passivation once we have dropped the mg_lock.
2864 * If a metaslab is activated by another thread, and we fail
2865 * to allocate from the metaslab we have selected, we may
2866 * not try the newly-activated metaslab, and instead activate
2867 * another metaslab. This is not optimal, but generally
2868 * does not cause any problems (a possible exception being
2869 * if every metaslab is completely full except for
2870 * the newly-activated metaslab which we fail to examine).
2871 */
2872 msp = avl_find(t, search, &idx);
2873 if (msp == NULL)
2874 msp = avl_nearest(t, idx, AVL_AFTER);
2875 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
2876
2877 if (!metaslab_should_allocate(msp, asize)) {
2878 metaslab_trace_add(zal, mg, msp, asize, d,
2879 TRACE_TOO_SMALL);
2880 continue;
2881 }
2882
2883 /*
2884 * If the selected metaslab is condensing, skip it.
2885 */
2886 if (msp->ms_condensing)
2887 continue;
2888
2889 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2890 if (activation_weight == METASLAB_WEIGHT_PRIMARY)
2891 break;
2892
2893 target_distance = min_distance +
2894 (space_map_allocated(msp->ms_sm) != 0 ?
0 :
2895 min_distance >> 1);
2896
2897 for (i = 0; i < d; i++) {
2898 if (metaslab_distance(msp, &dva[i]) <
2899 target_distance)
2900 break;
2901 }
2902 if (i == d)
2903 break;
2904 }
2905 mutex_exit(&mg->mg_lock);
2906 if (msp == NULL) {
2907 kmem_free(search, sizeof (*search));
2908 return (-1ULL);
2909 }
2910 search->ms_weight = msp->ms_weight;
2911 search->ms_start = msp->ms_start + 1;
2912
2913 mutex_enter(&msp->ms_lock);
2914
2915 /*
2916 * Ensure that the metaslab we have selected is still
2917 * capable of handling our request. It's possible that
2918 * another thread may have changed the weight while we
2919 * were blocked on the metaslab lock. We check the
2920 * active status first to see if we need to reselect
2921 * a new metaslab.
2922 */
2923 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
2924 mutex_exit(&msp->ms_lock);
2925 continue;
2926 }
2927
2928 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
2929 activation_weight == METASLAB_WEIGHT_PRIMARY) {
2930 metaslab_passivate(msp,
2931 msp->ms_weight & ~METASLAB_ACTIVE_MASK);
2932 mutex_exit(&msp->ms_lock);
2933 continue;
2934 }
2935
2936 if (metaslab_activate(msp, activation_weight) != 0) {
2937 mutex_exit(&msp->ms_lock);
2938 continue;
2939 }
2940 msp->ms_selected_txg = txg;
2941
2942 /*
2943 * Now that we have the lock, recheck to see if we should
2944 * continue to use this metaslab for this allocation.
2945 * The metaslab is now loaded so metaslab_should_allocate() can
2946 * accurately determine if the allocation attempt should
2947 * proceed.
2948 */
2949 if (!metaslab_should_allocate(msp, asize)) {
2950 /* Passivate this metaslab and select a new one. */
2951 metaslab_trace_add(zal, mg, msp, asize, d,
2952 TRACE_TOO_SMALL);
2953 goto next;
2954 }
2955
2956 /*
2957 * If this metaslab is currently condensing then pick again as
2958 * we can't manipulate this metaslab until it's committed
2959 * to disk.
2960 */
2961 if (msp->ms_condensing) {
2962 metaslab_trace_add(zal, mg, msp, asize, d,
2963 TRACE_CONDENSING);
2964 mutex_exit(&msp->ms_lock);
2965 continue;
2966 }
2967
2968 offset = metaslab_block_alloc(msp, asize, txg);
2969 metaslab_trace_add(zal, mg, msp, asize, d, offset);
2970
2971 if (offset != -1ULL) {
2972 /* Proactively passivate the metaslab, if needed */
2973 metaslab_segment_may_passivate(msp);
2974 break;
2975 }
2976 next:
2977 ASSERT(msp->ms_loaded);
2978
2979 /*
2980 * We were unable to allocate from this metaslab so determine
2981 * a new weight for this metaslab. Now that we have loaded
2982 * the metaslab we can provide a better hint to the metaslab
2983 * selector.
2984 *
2985 * For space-based metaslabs, we use the maximum block size.
2986 * This information is only available when the metaslab
2987 * is loaded and is more accurate than the generic free
2988 * space weight that was calculated by metaslab_weight().
2989 * This information allows us to quickly compare the maximum
2990 * available allocation in the metaslab to the allocation
2991 * size being requested.
2992 *
2993 * For segment-based metaslabs, determine the new weight
2994 * based on the highest bucket in the range tree. We
2995 * explicitly use the loaded segment weight (i.e. the range
2996 * tree histogram) since it contains the space that is
2997 * currently available for allocation and is accurate
2998 * even within a sync pass.
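 *
 * For example, if the largest remaining free segment in a space-based
 * metaslab is 96K, it is passivated below with a weight of 96K (type
 * bit preserved), so metaslab_should_allocate() will reject it for any
 * request larger than that.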
2999 */ 3000 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { 3001 uint64_t weight = metaslab_block_maxsize(msp); 3002 WEIGHT_SET_SPACEBASED(weight); 3003 metaslab_passivate(msp, weight); 3004 } else { 3005 metaslab_passivate(msp, 3006 metaslab_weight_from_range_tree(msp)); 3007 } 3008 3009 /* 3010 * We have just failed an allocation attempt, check 3011 * that metaslab_should_allocate() agrees. Otherwise, 3012 * we may end up in an infinite loop retrying the same 3013 * metaslab. 3014 */ 3015 ASSERT(!metaslab_should_allocate(msp, asize)); 3016 mutex_exit(&msp->ms_lock); 3017 } 3018 mutex_exit(&msp->ms_lock); 3019 kmem_free(search, sizeof (*search)); 3020 return (offset); 3021 } 3022 3023 static uint64_t 3024 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, 3025 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 3026 { 3027 uint64_t offset; 3028 ASSERT(mg->mg_initialized); 3029 3030 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, 3031 min_distance, dva, d); 3032 3033 mutex_enter(&mg->mg_lock); 3034 if (offset == -1ULL) { 3035 mg->mg_failed_allocations++; 3036 metaslab_trace_add(zal, mg, NULL, asize, d, 3037 TRACE_GROUP_FAILURE); 3038 if (asize == SPA_GANGBLOCKSIZE) { 3039 /* 3040 * This metaslab group was unable to allocate 3041 * the minimum gang block size so it must be out of 3042 * space. We must notify the allocation throttle 3043 * to start skipping allocation attempts to this 3044 * metaslab group until more space becomes available. 3045 * Note: this failure cannot be caused by the 3046 * allocation throttle since the allocation throttle 3047 * is only responsible for skipping devices and 3048 * not failing block allocations. 3049 */ 3050 mg->mg_no_free_space = B_TRUE; 3051 } 3052 } 3053 mg->mg_allocations++; 3054 mutex_exit(&mg->mg_lock); 3055 return (offset); 3056 } 3057 3058 /* 3059 * If we have to write a ditto block (i.e. more than one DVA for a given BP) 3060 * on the same vdev as an existing DVA of this BP, then try to allocate it 3061 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the 3062 * existing DVAs. 3063 */ 3064 int ditto_same_vdev_distance_shift = 3; 3065 3066 /* 3067 * Allocate a block for the specified i/o. 3068 */ 3069 int 3070 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 3071 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, 3072 zio_alloc_list_t *zal) 3073 { 3074 metaslab_group_t *mg, *rotor; 3075 vdev_t *vd; 3076 boolean_t try_hard = B_FALSE; 3077 3078 ASSERT(!DVA_IS_VALID(&dva[d])); 3079 3080 /* 3081 * For testing, make some blocks above a certain size be gang blocks. 3082 */ 3083 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { 3084 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); 3085 return (SET_ERROR(ENOSPC)); 3086 } 3087 3088 /* 3089 * Start at the rotor and loop through all mgs until we find something. 3090 * Note that there's no locking on mc_rotor or mc_aliquot because 3091 * nothing actually breaks if we miss a few updates -- we just won't 3092 * allocate quite as evenly. It all balances out over time. 3093 * 3094 * If we are doing ditto or log blocks, try to spread them across 3095 * consecutive vdevs. If we're forced to reuse a vdev before we've 3096 * allocated all of our ditto blocks, then try and spread them out on 3097 * that vdev as much as possible. If it turns out to not be possible, 3098 * gradually lower our standards until anything becomes acceptable. 
3099 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 3100 * gives us hope of containing our fault domains to something we're 3101 * able to reason about. Otherwise, any two top-level vdev failures 3102 * will guarantee the loss of data. With consecutive allocation, 3103 * only two adjacent top-level vdev failures will result in data loss. 3104 * 3105 * If we are doing gang blocks (hintdva is non-NULL), try to keep 3106 * ourselves on the same vdev as our gang block header. That 3107 * way, we can hope for locality in vdev_cache, plus it makes our 3108 * fault domains something tractable. 3109 */ 3110 if (hintdva) { 3111 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 3112 3113 /* 3114 * It's possible the vdev we're using as the hint no 3115 * longer exists or its mg has been closed (e.g. by 3116 * device removal). Consult the rotor when 3117 * all else fails. 3118 */ 3119 if (vd != NULL && vd->vdev_mg != NULL) { 3120 mg = vd->vdev_mg; 3121 3122 if (flags & METASLAB_HINTBP_AVOID && 3123 mg->mg_next != NULL) 3124 mg = mg->mg_next; 3125 } else { 3126 mg = mc->mc_rotor; 3127 } 3128 } else if (d != 0) { 3129 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 3130 mg = vd->vdev_mg->mg_next; 3131 } else { 3132 mg = mc->mc_rotor; 3133 } 3134 3135 /* 3136 * If the hint put us into the wrong metaslab class, or into a 3137 * metaslab group that has been passivated, just follow the rotor. 3138 */ 3139 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 3140 mg = mc->mc_rotor; 3141 3142 rotor = mg; 3143 top: 3144 do { 3145 boolean_t allocatable; 3146 3147 ASSERT(mg->mg_activation_count == 1); 3148 vd = mg->mg_vd; 3149 3150 /* 3151 * Don't allocate from faulted devices. 3152 */ 3153 if (try_hard) { 3154 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 3155 allocatable = vdev_allocatable(vd); 3156 spa_config_exit(spa, SCL_ZIO, FTAG); 3157 } else { 3158 allocatable = vdev_allocatable(vd); 3159 } 3160 3161 /* 3162 * Determine if the selected metaslab group is eligible 3163 * for allocations. If we're ganging then don't allow 3164 * this metaslab group to skip allocations since that would 3165 * inadvertently return ENOSPC and suspend the pool 3166 * even though space is still available. 3167 */ 3168 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { 3169 allocatable = metaslab_group_allocatable(mg, rotor, 3170 psize); 3171 } 3172 3173 if (!allocatable) { 3174 metaslab_trace_add(zal, mg, NULL, psize, d, 3175 TRACE_NOT_ALLOCATABLE); 3176 goto next; 3177 } 3178 3179 ASSERT(mg->mg_initialized); 3180 3181 /* 3182 * Avoid writing single-copy data to a failing, 3183 * non-redundant vdev, unless we've already tried all 3184 * other vdevs. 3185 */ 3186 if ((vd->vdev_stat.vs_write_errors > 0 || 3187 vd->vdev_state < VDEV_STATE_HEALTHY) && 3188 d == 0 && !try_hard && vd->vdev_children == 0) { 3189 metaslab_trace_add(zal, mg, NULL, psize, d, 3190 TRACE_VDEV_ERROR); 3191 goto next; 3192 } 3193 3194 ASSERT(mg->mg_class == mc); 3195 3196 /* 3197 * If we don't need to try hard, then require that the 3198 * block be 1/8th of the device away from any other DVAs 3199 * in this BP. If we are trying hard, allow any offset 3200 * to be used (distance=0). 
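 *
 * For example, on a 1TB top-level vdev the required distance is 128GB
 * (vdev_asize >> ditto_same_vdev_distance_shift); if that distance
 * would not exceed a single metaslab (1 << vdev_ms_shift), we don't
 * bother and use a distance of 0.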
3201 */ 3202 uint64_t distance = 0; 3203 if (!try_hard) { 3204 distance = vd->vdev_asize >> 3205 ditto_same_vdev_distance_shift; 3206 if (distance <= (1ULL << vd->vdev_ms_shift)) 3207 distance = 0; 3208 } 3209 3210 uint64_t asize = vdev_psize_to_asize(vd, psize); 3211 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 3212 3213 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, 3214 distance, dva, d); 3215 3216 if (offset != -1ULL) { 3217 /* 3218 * If we've just selected this metaslab group, 3219 * figure out whether the corresponding vdev is 3220 * over- or under-used relative to the pool, 3221 * and set an allocation bias to even it out. 3222 */ 3223 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 3224 vdev_stat_t *vs = &vd->vdev_stat; 3225 int64_t vu, cu; 3226 3227 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 3228 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 3229 3230 /* 3231 * Calculate how much more or less we should 3232 * try to allocate from this device during 3233 * this iteration around the rotor. 3234 * For example, if a device is 80% full 3235 * and the pool is 20% full then we should 3236 * reduce allocations by 60% on this device. 3237 * 3238 * mg_bias = (20 - 80) * 512K / 100 = -307K 3239 * 3240 * This reduces allocations by 307K for this 3241 * iteration. 3242 */ 3243 mg->mg_bias = ((cu - vu) * 3244 (int64_t)mg->mg_aliquot) / 100; 3245 } else if (!metaslab_bias_enabled) { 3246 mg->mg_bias = 0; 3247 } 3248 3249 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 3250 mg->mg_aliquot + mg->mg_bias) { 3251 mc->mc_rotor = mg->mg_next; 3252 mc->mc_aliquot = 0; 3253 } 3254 3255 DVA_SET_VDEV(&dva[d], vd->vdev_id); 3256 DVA_SET_OFFSET(&dva[d], offset); 3257 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 3258 DVA_SET_ASIZE(&dva[d], asize); 3259 3260 return (0); 3261 } 3262 next: 3263 mc->mc_rotor = mg->mg_next; 3264 mc->mc_aliquot = 0; 3265 } while ((mg = mg->mg_next) != rotor); 3266 3267 /* 3268 * If we haven't tried hard, do so now. 
3269 */ 3270 if (!try_hard) { 3271 try_hard = B_TRUE; 3272 goto top; 3273 } 3274 3275 bzero(&dva[d], sizeof (dva_t)); 3276 3277 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); 3278 return (SET_ERROR(ENOSPC)); 3279 } 3280 3281 void 3282 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, 3283 boolean_t checkpoint) 3284 { 3285 metaslab_t *msp; 3286 spa_t *spa = vd->vdev_spa; 3287 3288 ASSERT(vdev_is_concrete(vd)); 3289 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3290 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3291 3292 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3293 3294 VERIFY(!msp->ms_condensing); 3295 VERIFY3U(offset, >=, msp->ms_start); 3296 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); 3297 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3298 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); 3299 3300 metaslab_check_free_impl(vd, offset, asize); 3301 3302 mutex_enter(&msp->ms_lock); 3303 if (range_tree_is_empty(msp->ms_freeing) && 3304 range_tree_is_empty(msp->ms_checkpointing)) { 3305 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); 3306 } 3307 3308 if (checkpoint) { 3309 ASSERT(spa_has_checkpoint(spa)); 3310 range_tree_add(msp->ms_checkpointing, offset, asize); 3311 } else { 3312 range_tree_add(msp->ms_freeing, offset, asize); 3313 } 3314 mutex_exit(&msp->ms_lock); 3315 } 3316 3317 /* ARGSUSED */ 3318 void 3319 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3320 uint64_t size, void *arg) 3321 { 3322 boolean_t *checkpoint = arg; 3323 3324 ASSERT3P(checkpoint, !=, NULL); 3325 3326 if (vd->vdev_ops->vdev_op_remap != NULL) 3327 vdev_indirect_mark_obsolete(vd, offset, size); 3328 else 3329 metaslab_free_impl(vd, offset, size, *checkpoint); 3330 } 3331 3332 static void 3333 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, 3334 boolean_t checkpoint) 3335 { 3336 spa_t *spa = vd->vdev_spa; 3337 3338 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3339 3340 if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) 3341 return; 3342 3343 if (spa->spa_vdev_removal != NULL && 3344 spa->spa_vdev_removal->svr_vdev == vd && 3345 vdev_is_concrete(vd)) { 3346 /* 3347 * Note: we check if the vdev is concrete because when 3348 * we complete the removal, we first change the vdev to be 3349 * an indirect vdev (in open context), and then (in syncing 3350 * context) clear spa_vdev_removal. 3351 */ 3352 free_from_removing_vdev(vd, offset, size); 3353 } else if (vd->vdev_ops->vdev_op_remap != NULL) { 3354 vdev_indirect_mark_obsolete(vd, offset, size); 3355 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3356 metaslab_free_impl_cb, &checkpoint); 3357 } else { 3358 metaslab_free_concrete(vd, offset, size, checkpoint); 3359 } 3360 } 3361 3362 typedef struct remap_blkptr_cb_arg { 3363 blkptr_t *rbca_bp; 3364 spa_remap_cb_t rbca_cb; 3365 vdev_t *rbca_remap_vd; 3366 uint64_t rbca_remap_offset; 3367 void *rbca_cb_arg; 3368 } remap_blkptr_cb_arg_t; 3369 3370 void 3371 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3372 uint64_t size, void *arg) 3373 { 3374 remap_blkptr_cb_arg_t *rbca = arg; 3375 blkptr_t *bp = rbca->rbca_bp; 3376 3377 /* We can not remap split blocks. */ 3378 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) 3379 return; 3380 ASSERT0(inner_offset); 3381 3382 if (rbca->rbca_cb != NULL) { 3383 /* 3384 * At this point we know that we are not handling split 3385 * blocks and we invoke the callback on the previous 3386 * vdev which must be indirect. 
3387 */ 3388 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); 3389 3390 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, 3391 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); 3392 3393 /* set up remap_blkptr_cb_arg for the next call */ 3394 rbca->rbca_remap_vd = vd; 3395 rbca->rbca_remap_offset = offset; 3396 } 3397 3398 /* 3399 * The phys birth time is that of dva[0]. This ensures that we know 3400 * when each dva was written, so that resilver can determine which 3401 * blocks need to be scrubbed (i.e. those written during the time 3402 * the vdev was offline). It also ensures that the key used in 3403 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If 3404 * we didn't change the phys_birth, a lookup in the ARC for a 3405 * remapped BP could find the data that was previously stored at 3406 * this vdev + offset. 3407 */ 3408 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, 3409 DVA_GET_VDEV(&bp->blk_dva[0])); 3410 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; 3411 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, 3412 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); 3413 3414 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); 3415 DVA_SET_OFFSET(&bp->blk_dva[0], offset); 3416 } 3417 3418 /* 3419 * If the block pointer contains any indirect DVAs, modify them to refer to 3420 * concrete DVAs. Note that this will sometimes not be possible, leaving 3421 * the indirect DVA in place. This happens if the indirect DVA spans multiple 3422 * segments in the mapping (i.e. it is a "split block"). 3423 * 3424 * If the BP was remapped, calls the callback on the original dva (note the 3425 * callback can be called multiple times if the original indirect DVA refers 3426 * to another indirect DVA, etc). 3427 * 3428 * Returns TRUE if the BP was remapped. 3429 */ 3430 boolean_t 3431 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) 3432 { 3433 remap_blkptr_cb_arg_t rbca; 3434 3435 if (!zfs_remap_blkptr_enable) 3436 return (B_FALSE); 3437 3438 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) 3439 return (B_FALSE); 3440 3441 /* 3442 * Dedup BP's can not be remapped, because ddt_phys_select() depends 3443 * on DVA[0] being the same in the BP as in the DDT (dedup table). 3444 */ 3445 if (BP_GET_DEDUP(bp)) 3446 return (B_FALSE); 3447 3448 /* 3449 * Gang blocks can not be remapped, because 3450 * zio_checksum_gang_verifier() depends on the DVA[0] that's in 3451 * the BP used to read the gang block header (GBH) being the same 3452 * as the DVA[0] that we allocated for the GBH. 3453 */ 3454 if (BP_IS_GANG(bp)) 3455 return (B_FALSE); 3456 3457 /* 3458 * Embedded BP's have no DVA to remap. 3459 */ 3460 if (BP_GET_NDVAS(bp) < 1) 3461 return (B_FALSE); 3462 3463 /* 3464 * Note: we only remap dva[0]. If we remapped other dvas, we 3465 * would no longer know what their phys birth txg is. 3466 */ 3467 dva_t *dva = &bp->blk_dva[0]; 3468 3469 uint64_t offset = DVA_GET_OFFSET(dva); 3470 uint64_t size = DVA_GET_ASIZE(dva); 3471 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 3472 3473 if (vd->vdev_ops->vdev_op_remap == NULL) 3474 return (B_FALSE); 3475 3476 rbca.rbca_bp = bp; 3477 rbca.rbca_cb = callback; 3478 rbca.rbca_remap_vd = vd; 3479 rbca.rbca_remap_offset = offset; 3480 rbca.rbca_cb_arg = arg; 3481 3482 /* 3483 * remap_blkptr_cb() will be called in order for each level of 3484 * indirection, until a concrete vdev is reached or a split block is 3485 * encountered. 
old_vd and old_offset are updated within the callback 3486 * as we go from the one indirect vdev to the next one (either concrete 3487 * or indirect again) in that order. 3488 */ 3489 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); 3490 3491 /* Check if the DVA wasn't remapped because it is a split block */ 3492 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) 3493 return (B_FALSE); 3494 3495 return (B_TRUE); 3496 } 3497 3498 /* 3499 * Undo the allocation of a DVA which happened in the given transaction group. 3500 */ 3501 void 3502 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 3503 { 3504 metaslab_t *msp; 3505 vdev_t *vd; 3506 uint64_t vdev = DVA_GET_VDEV(dva); 3507 uint64_t offset = DVA_GET_OFFSET(dva); 3508 uint64_t size = DVA_GET_ASIZE(dva); 3509 3510 ASSERT(DVA_IS_VALID(dva)); 3511 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3512 3513 if (txg > spa_freeze_txg(spa)) 3514 return; 3515 3516 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 3517 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 3518 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 3519 (u_longlong_t)vdev, (u_longlong_t)offset); 3520 ASSERT(0); 3521 return; 3522 } 3523 3524 ASSERT(!vd->vdev_removing); 3525 ASSERT(vdev_is_concrete(vd)); 3526 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 3527 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 3528 3529 if (DVA_GET_GANG(dva)) 3530 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3531 3532 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3533 3534 mutex_enter(&msp->ms_lock); 3535 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], 3536 offset, size); 3537 3538 VERIFY(!msp->ms_condensing); 3539 VERIFY3U(offset, >=, msp->ms_start); 3540 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 3541 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, 3542 msp->ms_size); 3543 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3544 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3545 range_tree_add(msp->ms_allocatable, offset, size); 3546 mutex_exit(&msp->ms_lock); 3547 } 3548 3549 /* 3550 * Free the block represented by the given DVA. 3551 */ 3552 void 3553 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) 3554 { 3555 uint64_t vdev = DVA_GET_VDEV(dva); 3556 uint64_t offset = DVA_GET_OFFSET(dva); 3557 uint64_t size = DVA_GET_ASIZE(dva); 3558 vdev_t *vd = vdev_lookup_top(spa, vdev); 3559 3560 ASSERT(DVA_IS_VALID(dva)); 3561 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3562 3563 if (DVA_GET_GANG(dva)) { 3564 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3565 } 3566 3567 metaslab_free_impl(vd, offset, size, checkpoint); 3568 } 3569 3570 /* 3571 * Reserve some allocation slots. The reservation system must be called 3572 * before we call into the allocator. If there aren't any available slots 3573 * then the I/O will be throttled until an I/O completes and its slots are 3574 * freed up. The function returns true if it was successful in placing 3575 * the reservation. 
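 *
 * For example, a zio allocating a block with three DVAs would typically
 * reserve three slots here (slots are reserved individually so they can
 * also be released individually) and drop them via
 * metaslab_class_throttle_unreserve() when the I/O completes. Gang
 * allocations are always granted a reservation, even when no slots are
 * available.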
3576 */ 3577 boolean_t 3578 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, 3579 int flags) 3580 { 3581 uint64_t available_slots = 0; 3582 boolean_t slot_reserved = B_FALSE; 3583 3584 ASSERT(mc->mc_alloc_throttle_enabled); 3585 mutex_enter(&mc->mc_lock); 3586 3587 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); 3588 if (reserved_slots < mc->mc_alloc_max_slots) 3589 available_slots = mc->mc_alloc_max_slots - reserved_slots; 3590 3591 if (slots <= available_slots || GANG_ALLOCATION(flags)) { 3592 /* 3593 * We reserve the slots individually so that we can unreserve 3594 * them individually when an I/O completes. 3595 */ 3596 for (int d = 0; d < slots; d++) { 3597 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); 3598 } 3599 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 3600 slot_reserved = B_TRUE; 3601 } 3602 3603 mutex_exit(&mc->mc_lock); 3604 return (slot_reserved); 3605 } 3606 3607 void 3608 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) 3609 { 3610 ASSERT(mc->mc_alloc_throttle_enabled); 3611 mutex_enter(&mc->mc_lock); 3612 for (int d = 0; d < slots; d++) { 3613 (void) refcount_remove(&mc->mc_alloc_slots, zio); 3614 } 3615 mutex_exit(&mc->mc_lock); 3616 } 3617 3618 static int 3619 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, 3620 uint64_t txg) 3621 { 3622 metaslab_t *msp; 3623 spa_t *spa = vd->vdev_spa; 3624 int error = 0; 3625 3626 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) 3627 return (ENXIO); 3628 3629 ASSERT3P(vd->vdev_ms, !=, NULL); 3630 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3631 3632 mutex_enter(&msp->ms_lock); 3633 3634 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 3635 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 3636 3637 if (error == 0 && 3638 !range_tree_contains(msp->ms_allocatable, offset, size)) 3639 error = SET_ERROR(ENOENT); 3640 3641 if (error || txg == 0) { /* txg == 0 indicates dry run */ 3642 mutex_exit(&msp->ms_lock); 3643 return (error); 3644 } 3645 3646 VERIFY(!msp->ms_condensing); 3647 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 3648 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 3649 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, 3650 msp->ms_size); 3651 range_tree_remove(msp->ms_allocatable, offset, size); 3652 3653 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 3654 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) 3655 vdev_dirty(vd, VDD_METASLAB, msp, txg); 3656 range_tree_add(msp->ms_allocating[txg & TXG_MASK], 3657 offset, size); 3658 } 3659 3660 mutex_exit(&msp->ms_lock); 3661 3662 return (0); 3663 } 3664 3665 typedef struct metaslab_claim_cb_arg_t { 3666 uint64_t mcca_txg; 3667 int mcca_error; 3668 } metaslab_claim_cb_arg_t; 3669 3670 /* ARGSUSED */ 3671 static void 3672 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3673 uint64_t size, void *arg) 3674 { 3675 metaslab_claim_cb_arg_t *mcca_arg = arg; 3676 3677 if (mcca_arg->mcca_error == 0) { 3678 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, 3679 size, mcca_arg->mcca_txg); 3680 } 3681 } 3682 3683 int 3684 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) 3685 { 3686 if (vd->vdev_ops->vdev_op_remap != NULL) { 3687 metaslab_claim_cb_arg_t arg; 3688 3689 /* 3690 * Only zdb(1M) can claim on indirect vdevs. This is used 3691 * to detect leaks of mapped space (that are not accounted 3692 * for in the obsolete counts, spacemap, or bpobj). 
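 *
 * The remap callback (metaslab_claim_impl_cb) claims each mapped segment
 * on its concrete vdev and latches the first failure in mcca_error; once
 * an error has been recorded, later segments are skipped rather than
 * overwriting it. If the walk succeeds, the claim is then repeated on
 * this vdev itself below.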
3693 */ 3694 ASSERT(!spa_writeable(vd->vdev_spa)); 3695 arg.mcca_error = 0; 3696 arg.mcca_txg = txg; 3697 3698 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3699 metaslab_claim_impl_cb, &arg); 3700 3701 if (arg.mcca_error == 0) { 3702 arg.mcca_error = metaslab_claim_concrete(vd, 3703 offset, size, txg); 3704 } 3705 return (arg.mcca_error); 3706 } else { 3707 return (metaslab_claim_concrete(vd, offset, size, txg)); 3708 } 3709 } 3710 3711 /* 3712 * Intent log support: upon opening the pool after a crash, notify the SPA 3713 * of blocks that the intent log has allocated for immediate write, but 3714 * which are still considered free by the SPA because the last transaction 3715 * group didn't commit yet. 3716 */ 3717 static int 3718 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 3719 { 3720 uint64_t vdev = DVA_GET_VDEV(dva); 3721 uint64_t offset = DVA_GET_OFFSET(dva); 3722 uint64_t size = DVA_GET_ASIZE(dva); 3723 vdev_t *vd; 3724 3725 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { 3726 return (SET_ERROR(ENXIO)); 3727 } 3728 3729 ASSERT(DVA_IS_VALID(dva)); 3730 3731 if (DVA_GET_GANG(dva)) 3732 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3733 3734 return (metaslab_claim_impl(vd, offset, size, txg)); 3735 } 3736 3737 int 3738 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 3739 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, 3740 zio_alloc_list_t *zal, zio_t *zio) 3741 { 3742 dva_t *dva = bp->blk_dva; 3743 dva_t *hintdva = hintbp->blk_dva; 3744 int error = 0; 3745 3746 ASSERT(bp->blk_birth == 0); 3747 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 3748 3749 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3750 3751 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 3752 spa_config_exit(spa, SCL_ALLOC, FTAG); 3753 return (SET_ERROR(ENOSPC)); 3754 } 3755 3756 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 3757 ASSERT(BP_GET_NDVAS(bp) == 0); 3758 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 3759 ASSERT3P(zal, !=, NULL); 3760 3761 for (int d = 0; d < ndvas; d++) { 3762 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 3763 txg, flags, zal); 3764 if (error != 0) { 3765 for (d--; d >= 0; d--) { 3766 metaslab_unalloc_dva(spa, &dva[d], txg); 3767 metaslab_group_alloc_decrement(spa, 3768 DVA_GET_VDEV(&dva[d]), zio, flags); 3769 bzero(&dva[d], sizeof (dva_t)); 3770 } 3771 spa_config_exit(spa, SCL_ALLOC, FTAG); 3772 return (error); 3773 } else { 3774 /* 3775 * Update the metaslab group's queue depth 3776 * based on the newly allocated dva. 3777 */ 3778 metaslab_group_alloc_increment(spa, 3779 DVA_GET_VDEV(&dva[d]), zio, flags); 3780 } 3781 3782 } 3783 ASSERT(error == 0); 3784 ASSERT(BP_GET_NDVAS(bp) == ndvas); 3785 3786 spa_config_exit(spa, SCL_ALLOC, FTAG); 3787 3788 BP_SET_BIRTH(bp, txg, txg); 3789 3790 return (0); 3791 } 3792 3793 void 3794 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 3795 { 3796 const dva_t *dva = bp->blk_dva; 3797 int ndvas = BP_GET_NDVAS(bp); 3798 3799 ASSERT(!BP_IS_HOLE(bp)); 3800 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 3801 3802 /* 3803 * If we have a checkpoint for the pool we need to make sure that 3804 * the blocks that we free that are part of the checkpoint won't be 3805 * reused until the checkpoint is discarded or we revert to it. 3806 * 3807 * The checkpoint flag is passed down the metaslab_free code path 3808 * and is set whenever we want to add a block to the checkpoint's 3809 * accounting. 
That is, we "checkpoint" blocks that existed at the 3810 * time the checkpoint was created and are therefore referenced by 3811 * the checkpointed uberblock. 3812 * 3813 * Note that, we don't checkpoint any blocks if the current 3814 * syncing txg <= spa_checkpoint_txg. We want these frees to sync 3815 * normally as they will be referenced by the checkpointed uberblock. 3816 */ 3817 boolean_t checkpoint = B_FALSE; 3818 if (bp->blk_birth <= spa->spa_checkpoint_txg && 3819 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { 3820 /* 3821 * At this point, if the block is part of the checkpoint 3822 * there is no way it was created in the current txg. 3823 */ 3824 ASSERT(!now); 3825 ASSERT3U(spa_syncing_txg(spa), ==, txg); 3826 checkpoint = B_TRUE; 3827 } 3828 3829 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 3830 3831 for (int d = 0; d < ndvas; d++) { 3832 if (now) { 3833 metaslab_unalloc_dva(spa, &dva[d], txg); 3834 } else { 3835 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 3836 metaslab_free_dva(spa, &dva[d], checkpoint); 3837 } 3838 } 3839 3840 spa_config_exit(spa, SCL_FREE, FTAG); 3841 } 3842 3843 int 3844 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 3845 { 3846 const dva_t *dva = bp->blk_dva; 3847 int ndvas = BP_GET_NDVAS(bp); 3848 int error = 0; 3849 3850 ASSERT(!BP_IS_HOLE(bp)); 3851 3852 if (txg != 0) { 3853 /* 3854 * First do a dry run to make sure all DVAs are claimable, 3855 * so we don't have to unwind from partial failures below. 3856 */ 3857 if ((error = metaslab_claim(spa, bp, 0)) != 0) 3858 return (error); 3859 } 3860 3861 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 3862 3863 for (int d = 0; d < ndvas; d++) 3864 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 3865 break; 3866 3867 spa_config_exit(spa, SCL_ALLOC, FTAG); 3868 3869 ASSERT(error == 0 || txg == 0); 3870 3871 return (error); 3872 } 3873 3874 /* ARGSUSED */ 3875 static void 3876 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, 3877 uint64_t size, void *arg) 3878 { 3879 if (vd->vdev_ops == &vdev_indirect_ops) 3880 return; 3881 3882 metaslab_check_free_impl(vd, offset, size); 3883 } 3884 3885 static void 3886 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) 3887 { 3888 metaslab_t *msp; 3889 spa_t *spa = vd->vdev_spa; 3890 3891 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 3892 return; 3893 3894 if (vd->vdev_ops->vdev_op_remap != NULL) { 3895 vd->vdev_ops->vdev_op_remap(vd, offset, size, 3896 metaslab_check_free_impl_cb, NULL); 3897 return; 3898 } 3899 3900 ASSERT(vdev_is_concrete(vd)); 3901 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); 3902 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); 3903 3904 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3905 3906 mutex_enter(&msp->ms_lock); 3907 if (msp->ms_loaded) 3908 range_tree_verify(msp->ms_allocatable, offset, size); 3909 3910 range_tree_verify(msp->ms_freeing, offset, size); 3911 range_tree_verify(msp->ms_checkpointing, offset, size); 3912 range_tree_verify(msp->ms_freed, offset, size); 3913 for (int j = 0; j < TXG_DEFER_SIZE; j++) 3914 range_tree_verify(msp->ms_defer[j], offset, size); 3915 mutex_exit(&msp->ms_lock); 3916 } 3917 3918 void 3919 metaslab_check_free(spa_t *spa, const blkptr_t *bp) 3920 { 3921 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 3922 return; 3923 3924 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3925 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 3926 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 3927 vdev_t *vd = vdev_lookup_top(spa, vdev); 3928 
uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 3929 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 3930 3931 if (DVA_GET_GANG(&bp->blk_dva[i])) 3932 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 3933 3934 ASSERT3P(vd, !=, NULL); 3935 3936 metaslab_check_free_impl(vd, offset, size); 3937 } 3938 spa_config_exit(spa, SCL_VDEV, FTAG); 3939 } 3940
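/*
 * Illustrative usage sketch (not a call site in this file; the real
 * callers live in the ZIO and intent-log code). "psize", "ndvas", "txg"
 * and "zio" below are placeholders, and the zio_alloc_list_t is assumed
 * to be managed with metaslab_trace_init()/metaslab_trace_fini():
 *
 *	zio_alloc_list_t zal;
 *	metaslab_trace_init(&zal);
 *	int error = metaslab_alloc(spa, spa_normal_class(spa), psize, bp,
 *	    ndvas, txg, NULL, 0, &zal, zio);
 *	metaslab_trace_fini(&zal);
 *
 * A block allocated this way is released either by metaslab_free() with
 * now == B_FALSE from the syncing context of the txg in which it is
 * freed, or with now == B_TRUE to undo an allocation made in a txg that
 * has not yet synced.
 */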