1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2012 by Delphix. All rights reserved. 26 */ 27 28 #include <sys/zfs_context.h> 29 #include <sys/fm/fs/zfs.h> 30 #include <sys/spa.h> 31 #include <sys/spa_impl.h> 32 #include <sys/dmu.h> 33 #include <sys/dmu_tx.h> 34 #include <sys/vdev_impl.h> 35 #include <sys/uberblock_impl.h> 36 #include <sys/metaslab.h> 37 #include <sys/metaslab_impl.h> 38 #include <sys/space_map.h> 39 #include <sys/zio.h> 40 #include <sys/zap.h> 41 #include <sys/fs/zfs.h> 42 #include <sys/arc.h> 43 #include <sys/zil.h> 44 #include <sys/dsl_scan.h> 45 46 /* 47 * Virtual device management. 48 */ 49 50 static vdev_ops_t *vdev_ops_table[] = { 51 &vdev_root_ops, 52 &vdev_raidz_ops, 53 &vdev_mirror_ops, 54 &vdev_replacing_ops, 55 &vdev_spare_ops, 56 &vdev_disk_ops, 57 &vdev_file_ops, 58 &vdev_missing_ops, 59 &vdev_hole_ops, 60 NULL 61 }; 62 63 /* maximum scrub/resilver I/O queue per leaf vdev */ 64 int zfs_scrub_limit = 10; 65 66 /* 67 * Given a vdev type, return the appropriate ops vector. 
68 */ 69 static vdev_ops_t * 70 vdev_getops(const char *type) 71 { 72 vdev_ops_t *ops, **opspp; 73 74 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 75 if (strcmp(ops->vdev_op_type, type) == 0) 76 break; 77 78 return (ops); 79 } 80 81 /* 82 * Default asize function: return the MAX of psize with the asize of 83 * all children. This is what's used by anything other than RAID-Z. 84 */ 85 uint64_t 86 vdev_default_asize(vdev_t *vd, uint64_t psize) 87 { 88 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 89 uint64_t csize; 90 91 for (int c = 0; c < vd->vdev_children; c++) { 92 csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 93 asize = MAX(asize, csize); 94 } 95 96 return (asize); 97 } 98 99 /* 100 * Get the minimum allocatable size. We define the allocatable size as 101 * the vdev's asize rounded to the nearest metaslab. This allows us to 102 * replace or attach devices which don't have the same physical size but 103 * can still satisfy the same number of allocations. 104 */ 105 uint64_t 106 vdev_get_min_asize(vdev_t *vd) 107 { 108 vdev_t *pvd = vd->vdev_parent; 109 110 /* 111 * If our parent is NULL (inactive spare or cache) or is the root, 112 * just return our own asize. 113 */ 114 if (pvd == NULL) 115 return (vd->vdev_asize); 116 117 /* 118 * The top-level vdev just returns the allocatable size rounded 119 * to the nearest metaslab. 120 */ 121 if (vd == vd->vdev_top) 122 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 123 124 /* 125 * The allocatable space for a raidz vdev is N * sizeof(smallest child), 126 * so each child must provide at least 1/Nth of its asize. 
127 */ 128 if (pvd->vdev_ops == &vdev_raidz_ops) 129 return (pvd->vdev_min_asize / pvd->vdev_children); 130 131 return (pvd->vdev_min_asize); 132 } 133 134 void 135 vdev_set_min_asize(vdev_t *vd) 136 { 137 vd->vdev_min_asize = vdev_get_min_asize(vd); 138 139 for (int c = 0; c < vd->vdev_children; c++) 140 vdev_set_min_asize(vd->vdev_child[c]); 141 } 142 143 vdev_t * 144 vdev_lookup_top(spa_t *spa, uint64_t vdev) 145 { 146 vdev_t *rvd = spa->spa_root_vdev; 147 148 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 149 150 if (vdev < rvd->vdev_children) { 151 ASSERT(rvd->vdev_child[vdev] != NULL); 152 return (rvd->vdev_child[vdev]); 153 } 154 155 return (NULL); 156 } 157 158 vdev_t * 159 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 160 { 161 vdev_t *mvd; 162 163 if (vd->vdev_guid == guid) 164 return (vd); 165 166 for (int c = 0; c < vd->vdev_children; c++) 167 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 168 NULL) 169 return (mvd); 170 171 return (NULL); 172 } 173 174 void 175 vdev_add_child(vdev_t *pvd, vdev_t *cvd) 176 { 177 size_t oldsize, newsize; 178 uint64_t id = cvd->vdev_id; 179 vdev_t **newchild; 180 181 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 182 ASSERT(cvd->vdev_parent == NULL); 183 184 cvd->vdev_parent = pvd; 185 186 if (pvd == NULL) 187 return; 188 189 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 190 191 oldsize = pvd->vdev_children * sizeof (vdev_t *); 192 pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 193 newsize = pvd->vdev_children * sizeof (vdev_t *); 194 195 newchild = kmem_zalloc(newsize, KM_SLEEP); 196 if (pvd->vdev_child != NULL) { 197 bcopy(pvd->vdev_child, newchild, oldsize); 198 kmem_free(pvd->vdev_child, oldsize); 199 } 200 201 pvd->vdev_child = newchild; 202 pvd->vdev_child[id] = cvd; 203 204 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 205 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 206 207 /* 208 * Walk up all ancestors to update guid sum. 
209 */ 210 for (; pvd != NULL; pvd = pvd->vdev_parent) 211 pvd->vdev_guid_sum += cvd->vdev_guid_sum; 212 } 213 214 void 215 vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 216 { 217 int c; 218 uint_t id = cvd->vdev_id; 219 220 ASSERT(cvd->vdev_parent == pvd); 221 222 if (pvd == NULL) 223 return; 224 225 ASSERT(id < pvd->vdev_children); 226 ASSERT(pvd->vdev_child[id] == cvd); 227 228 pvd->vdev_child[id] = NULL; 229 cvd->vdev_parent = NULL; 230 231 for (c = 0; c < pvd->vdev_children; c++) 232 if (pvd->vdev_child[c]) 233 break; 234 235 if (c == pvd->vdev_children) { 236 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 237 pvd->vdev_child = NULL; 238 pvd->vdev_children = 0; 239 } 240 241 /* 242 * Walk up all ancestors to update guid sum. 243 */ 244 for (; pvd != NULL; pvd = pvd->vdev_parent) 245 pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 246 } 247 248 /* 249 * Remove any holes in the child array. 250 */ 251 void 252 vdev_compact_children(vdev_t *pvd) 253 { 254 vdev_t **newchild, *cvd; 255 int oldc = pvd->vdev_children; 256 int newc; 257 258 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 259 260 for (int c = newc = 0; c < oldc; c++) 261 if (pvd->vdev_child[c]) 262 newc++; 263 264 newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 265 266 for (int c = newc = 0; c < oldc; c++) { 267 if ((cvd = pvd->vdev_child[c]) != NULL) { 268 newchild[newc] = cvd; 269 cvd->vdev_id = newc++; 270 } 271 } 272 273 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 274 pvd->vdev_child = newchild; 275 pvd->vdev_children = newc; 276 } 277 278 /* 279 * Allocate and minimally initialize a vdev_t. 
280 */ 281 vdev_t * 282 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 283 { 284 vdev_t *vd; 285 286 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 287 288 if (spa->spa_root_vdev == NULL) { 289 ASSERT(ops == &vdev_root_ops); 290 spa->spa_root_vdev = vd; 291 spa->spa_load_guid = spa_generate_guid(NULL); 292 } 293 294 if (guid == 0 && ops != &vdev_hole_ops) { 295 if (spa->spa_root_vdev == vd) { 296 /* 297 * The root vdev's guid will also be the pool guid, 298 * which must be unique among all pools. 299 */ 300 guid = spa_generate_guid(NULL); 301 } else { 302 /* 303 * Any other vdev's guid must be unique within the pool. 304 */ 305 guid = spa_generate_guid(spa); 306 } 307 ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 308 } 309 310 vd->vdev_spa = spa; 311 vd->vdev_id = id; 312 vd->vdev_guid = guid; 313 vd->vdev_guid_sum = guid; 314 vd->vdev_ops = ops; 315 vd->vdev_state = VDEV_STATE_CLOSED; 316 vd->vdev_ishole = (ops == &vdev_hole_ops); 317 318 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 319 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 320 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 321 for (int t = 0; t < DTL_TYPES; t++) { 322 space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, 323 &vd->vdev_dtl_lock); 324 } 325 txg_list_create(&vd->vdev_ms_list, 326 offsetof(struct metaslab, ms_txg_node)); 327 txg_list_create(&vd->vdev_dtl_list, 328 offsetof(struct vdev, vdev_dtl_node)); 329 vd->vdev_stat.vs_timestamp = gethrtime(); 330 vdev_queue_init(vd); 331 vdev_cache_init(vd); 332 333 return (vd); 334 } 335 336 /* 337 * Allocate a new vdev. The 'alloctype' is used to control whether we are 338 * creating a new vdev or loading an existing one - the behavior is slightly 339 * different for each case. 
340 */ 341 int 342 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 343 int alloctype) 344 { 345 vdev_ops_t *ops; 346 char *type; 347 uint64_t guid = 0, islog, nparity; 348 vdev_t *vd; 349 350 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 351 352 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 353 return (EINVAL); 354 355 if ((ops = vdev_getops(type)) == NULL) 356 return (EINVAL); 357 358 /* 359 * If this is a load, get the vdev guid from the nvlist. 360 * Otherwise, vdev_alloc_common() will generate one for us. 361 */ 362 if (alloctype == VDEV_ALLOC_LOAD) { 363 uint64_t label_id; 364 365 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 366 label_id != id) 367 return (EINVAL); 368 369 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 370 return (EINVAL); 371 } else if (alloctype == VDEV_ALLOC_SPARE) { 372 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 373 return (EINVAL); 374 } else if (alloctype == VDEV_ALLOC_L2CACHE) { 375 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 376 return (EINVAL); 377 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 378 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 379 return (EINVAL); 380 } 381 382 /* 383 * The first allocated vdev must be of type 'root'. 384 */ 385 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 386 return (EINVAL); 387 388 /* 389 * Determine whether we're a log vdev. 390 */ 391 islog = 0; 392 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 393 if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 394 return (ENOTSUP); 395 396 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 397 return (ENOTSUP); 398 399 /* 400 * Set the nparity property for RAID-Z vdevs. 
401 */ 402 nparity = -1ULL; 403 if (ops == &vdev_raidz_ops) { 404 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 405 &nparity) == 0) { 406 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 407 return (EINVAL); 408 /* 409 * Previous versions could only support 1 or 2 parity 410 * device. 411 */ 412 if (nparity > 1 && 413 spa_version(spa) < SPA_VERSION_RAIDZ2) 414 return (ENOTSUP); 415 if (nparity > 2 && 416 spa_version(spa) < SPA_VERSION_RAIDZ3) 417 return (ENOTSUP); 418 } else { 419 /* 420 * We require the parity to be specified for SPAs that 421 * support multiple parity levels. 422 */ 423 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 424 return (EINVAL); 425 /* 426 * Otherwise, we default to 1 parity device for RAID-Z. 427 */ 428 nparity = 1; 429 } 430 } else { 431 nparity = 0; 432 } 433 ASSERT(nparity != -1ULL); 434 435 vd = vdev_alloc_common(spa, id, guid, ops); 436 437 vd->vdev_islog = islog; 438 vd->vdev_nparity = nparity; 439 440 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 441 vd->vdev_path = spa_strdup(vd->vdev_path); 442 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 443 vd->vdev_devid = spa_strdup(vd->vdev_devid); 444 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 445 &vd->vdev_physpath) == 0) 446 vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 447 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 448 vd->vdev_fru = spa_strdup(vd->vdev_fru); 449 450 /* 451 * Set the whole_disk property. If it's not specified, leave the value 452 * as -1. 453 */ 454 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 455 &vd->vdev_wholedisk) != 0) 456 vd->vdev_wholedisk = -1ULL; 457 458 /* 459 * Look for the 'not present' flag. This will only be set if the device 460 * was not present at the time of import. 461 */ 462 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 463 &vd->vdev_not_present); 464 465 /* 466 * Get the alignment requirement. 
467 */ 468 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 469 470 /* 471 * Retrieve the vdev creation time. 472 */ 473 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 474 &vd->vdev_crtxg); 475 476 /* 477 * If we're a top-level vdev, try to load the allocation parameters. 478 */ 479 if (parent && !parent->vdev_parent && 480 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 481 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 482 &vd->vdev_ms_array); 483 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 484 &vd->vdev_ms_shift); 485 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 486 &vd->vdev_asize); 487 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 488 &vd->vdev_removing); 489 } 490 491 if (parent && !parent->vdev_parent) { 492 ASSERT(alloctype == VDEV_ALLOC_LOAD || 493 alloctype == VDEV_ALLOC_ADD || 494 alloctype == VDEV_ALLOC_SPLIT || 495 alloctype == VDEV_ALLOC_ROOTPOOL); 496 vd->vdev_mg = metaslab_group_create(islog ? 497 spa_log_class(spa) : spa_normal_class(spa), vd); 498 } 499 500 /* 501 * If we're a leaf vdev, try to load the DTL object and other state. 
502 */ 503 if (vd->vdev_ops->vdev_op_leaf && 504 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 505 alloctype == VDEV_ALLOC_ROOTPOOL)) { 506 if (alloctype == VDEV_ALLOC_LOAD) { 507 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 508 &vd->vdev_dtl_smo.smo_object); 509 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 510 &vd->vdev_unspare); 511 } 512 513 if (alloctype == VDEV_ALLOC_ROOTPOOL) { 514 uint64_t spare = 0; 515 516 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 517 &spare) == 0 && spare) 518 spa_spare_add(vd); 519 } 520 521 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 522 &vd->vdev_offline); 523 524 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING, 525 &vd->vdev_resilvering); 526 527 /* 528 * When importing a pool, we want to ignore the persistent fault 529 * state, as the diagnosis made on another system may not be 530 * valid in the current context. Local vdevs will 531 * remain in the faulted state. 532 */ 533 if (spa_load_state(spa) == SPA_LOAD_OPEN) { 534 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 535 &vd->vdev_faulted); 536 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 537 &vd->vdev_degraded); 538 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 539 &vd->vdev_removed); 540 541 if (vd->vdev_faulted || vd->vdev_degraded) { 542 char *aux; 543 544 vd->vdev_label_aux = 545 VDEV_AUX_ERR_EXCEEDED; 546 if (nvlist_lookup_string(nv, 547 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 548 strcmp(aux, "external") == 0) 549 vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 550 } 551 } 552 } 553 554 /* 555 * Add ourselves to the parent's list of children. 556 */ 557 vdev_add_child(parent, vd); 558 559 *vdp = vd; 560 561 return (0); 562 } 563 564 void 565 vdev_free(vdev_t *vd) 566 { 567 spa_t *spa = vd->vdev_spa; 568 569 /* 570 * vdev_free() implies closing the vdev first. This is simpler than 571 * trying to ensure complicated semantics for all callers. 
572 */ 573 vdev_close(vd); 574 575 ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 576 ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 577 578 /* 579 * Free all children. 580 */ 581 for (int c = 0; c < vd->vdev_children; c++) 582 vdev_free(vd->vdev_child[c]); 583 584 ASSERT(vd->vdev_child == NULL); 585 ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 586 587 /* 588 * Discard allocation state. 589 */ 590 if (vd->vdev_mg != NULL) { 591 vdev_metaslab_fini(vd); 592 metaslab_group_destroy(vd->vdev_mg); 593 } 594 595 ASSERT3U(vd->vdev_stat.vs_space, ==, 0); 596 ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); 597 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 598 599 /* 600 * Remove this vdev from its parent's child list. 601 */ 602 vdev_remove_child(vd->vdev_parent, vd); 603 604 ASSERT(vd->vdev_parent == NULL); 605 606 /* 607 * Clean up vdev structure. 608 */ 609 vdev_queue_fini(vd); 610 vdev_cache_fini(vd); 611 612 if (vd->vdev_path) 613 spa_strfree(vd->vdev_path); 614 if (vd->vdev_devid) 615 spa_strfree(vd->vdev_devid); 616 if (vd->vdev_physpath) 617 spa_strfree(vd->vdev_physpath); 618 if (vd->vdev_fru) 619 spa_strfree(vd->vdev_fru); 620 621 if (vd->vdev_isspare) 622 spa_spare_remove(vd); 623 if (vd->vdev_isl2cache) 624 spa_l2cache_remove(vd); 625 626 txg_list_destroy(&vd->vdev_ms_list); 627 txg_list_destroy(&vd->vdev_dtl_list); 628 629 mutex_enter(&vd->vdev_dtl_lock); 630 for (int t = 0; t < DTL_TYPES; t++) { 631 space_map_unload(&vd->vdev_dtl[t]); 632 space_map_destroy(&vd->vdev_dtl[t]); 633 } 634 mutex_exit(&vd->vdev_dtl_lock); 635 636 mutex_destroy(&vd->vdev_dtl_lock); 637 mutex_destroy(&vd->vdev_stat_lock); 638 mutex_destroy(&vd->vdev_probe_lock); 639 640 if (vd == spa->spa_root_vdev) 641 spa->spa_root_vdev = NULL; 642 643 kmem_free(vd, sizeof (vdev_t)); 644 } 645 646 /* 647 * Transfer top-level vdev state from svd to tvd. 
648 */ 649 static void 650 vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 651 { 652 spa_t *spa = svd->vdev_spa; 653 metaslab_t *msp; 654 vdev_t *vd; 655 int t; 656 657 ASSERT(tvd == tvd->vdev_top); 658 659 tvd->vdev_ms_array = svd->vdev_ms_array; 660 tvd->vdev_ms_shift = svd->vdev_ms_shift; 661 tvd->vdev_ms_count = svd->vdev_ms_count; 662 663 svd->vdev_ms_array = 0; 664 svd->vdev_ms_shift = 0; 665 svd->vdev_ms_count = 0; 666 667 tvd->vdev_mg = svd->vdev_mg; 668 tvd->vdev_ms = svd->vdev_ms; 669 670 svd->vdev_mg = NULL; 671 svd->vdev_ms = NULL; 672 673 if (tvd->vdev_mg != NULL) 674 tvd->vdev_mg->mg_vd = tvd; 675 676 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 677 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 678 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 679 680 svd->vdev_stat.vs_alloc = 0; 681 svd->vdev_stat.vs_space = 0; 682 svd->vdev_stat.vs_dspace = 0; 683 684 for (t = 0; t < TXG_SIZE; t++) { 685 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 686 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 687 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 688 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 689 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 690 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 691 } 692 693 if (list_link_active(&svd->vdev_config_dirty_node)) { 694 vdev_config_clean(svd); 695 vdev_config_dirty(tvd); 696 } 697 698 if (list_link_active(&svd->vdev_state_dirty_node)) { 699 vdev_state_clean(svd); 700 vdev_state_dirty(tvd); 701 } 702 703 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 704 svd->vdev_deflate_ratio = 0; 705 706 tvd->vdev_islog = svd->vdev_islog; 707 svd->vdev_islog = 0; 708 } 709 710 static void 711 vdev_top_update(vdev_t *tvd, vdev_t *vd) 712 { 713 if (vd == NULL) 714 return; 715 716 vd->vdev_top = tvd; 717 718 for (int c = 0; c < vd->vdev_children; c++) 719 vdev_top_update(tvd, vd->vdev_child[c]); 720 } 721 722 /* 723 * Add a mirror/replacing vdev above an 
existing vdev. 724 */ 725 vdev_t * 726 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 727 { 728 spa_t *spa = cvd->vdev_spa; 729 vdev_t *pvd = cvd->vdev_parent; 730 vdev_t *mvd; 731 732 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 733 734 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 735 736 mvd->vdev_asize = cvd->vdev_asize; 737 mvd->vdev_min_asize = cvd->vdev_min_asize; 738 mvd->vdev_max_asize = cvd->vdev_max_asize; 739 mvd->vdev_ashift = cvd->vdev_ashift; 740 mvd->vdev_state = cvd->vdev_state; 741 mvd->vdev_crtxg = cvd->vdev_crtxg; 742 743 vdev_remove_child(pvd, cvd); 744 vdev_add_child(pvd, mvd); 745 cvd->vdev_id = mvd->vdev_children; 746 vdev_add_child(mvd, cvd); 747 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 748 749 if (mvd == mvd->vdev_top) 750 vdev_top_transfer(cvd, mvd); 751 752 return (mvd); 753 } 754 755 /* 756 * Remove a 1-way mirror/replacing vdev from the tree. 757 */ 758 void 759 vdev_remove_parent(vdev_t *cvd) 760 { 761 vdev_t *mvd = cvd->vdev_parent; 762 vdev_t *pvd = mvd->vdev_parent; 763 764 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 765 766 ASSERT(mvd->vdev_children == 1); 767 ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 768 mvd->vdev_ops == &vdev_replacing_ops || 769 mvd->vdev_ops == &vdev_spare_ops); 770 cvd->vdev_ashift = mvd->vdev_ashift; 771 772 vdev_remove_child(mvd, cvd); 773 vdev_remove_child(pvd, mvd); 774 775 /* 776 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 777 * Otherwise, we could have detached an offline device, and when we 778 * go to import the pool we'll think we have two top-level vdevs, 779 * instead of a different version of the same top-level vdev. 
780 */ 781 if (mvd->vdev_top == mvd) { 782 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 783 cvd->vdev_orig_guid = cvd->vdev_guid; 784 cvd->vdev_guid += guid_delta; 785 cvd->vdev_guid_sum += guid_delta; 786 } 787 cvd->vdev_id = mvd->vdev_id; 788 vdev_add_child(pvd, cvd); 789 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 790 791 if (cvd == cvd->vdev_top) 792 vdev_top_transfer(mvd, cvd); 793 794 ASSERT(mvd->vdev_children == 0); 795 vdev_free(mvd); 796 } 797 798 int 799 vdev_metaslab_init(vdev_t *vd, uint64_t txg) 800 { 801 spa_t *spa = vd->vdev_spa; 802 objset_t *mos = spa->spa_meta_objset; 803 uint64_t m; 804 uint64_t oldc = vd->vdev_ms_count; 805 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 806 metaslab_t **mspp; 807 int error; 808 809 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 810 811 /* 812 * This vdev is not being allocated from yet or is a hole. 813 */ 814 if (vd->vdev_ms_shift == 0) 815 return (0); 816 817 ASSERT(!vd->vdev_ishole); 818 819 /* 820 * Compute the raidz-deflation ratio. Note, we hard-code 821 * in 128k (1 << 17) because it is the current "typical" blocksize. 822 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, 823 * or we will inconsistently account for existing bp's. 
824 */ 825 vd->vdev_deflate_ratio = (1 << 17) / 826 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 827 828 ASSERT(oldc <= newc); 829 830 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 831 832 if (oldc != 0) { 833 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 834 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 835 } 836 837 vd->vdev_ms = mspp; 838 vd->vdev_ms_count = newc; 839 840 for (m = oldc; m < newc; m++) { 841 space_map_obj_t smo = { 0, 0, 0 }; 842 if (txg == 0) { 843 uint64_t object = 0; 844 error = dmu_read(mos, vd->vdev_ms_array, 845 m * sizeof (uint64_t), sizeof (uint64_t), &object, 846 DMU_READ_PREFETCH); 847 if (error) 848 return (error); 849 if (object != 0) { 850 dmu_buf_t *db; 851 error = dmu_bonus_hold(mos, object, FTAG, &db); 852 if (error) 853 return (error); 854 ASSERT3U(db->db_size, >=, sizeof (smo)); 855 bcopy(db->db_data, &smo, sizeof (smo)); 856 ASSERT3U(smo.smo_object, ==, object); 857 dmu_buf_rele(db, FTAG); 858 } 859 } 860 vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, 861 m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); 862 } 863 864 if (txg == 0) 865 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 866 867 /* 868 * If the vdev is being removed we don't activate 869 * the metaslabs since we want to ensure that no new 870 * allocations are performed on this device. 
871 */ 872 if (oldc == 0 && !vd->vdev_removing) 873 metaslab_group_activate(vd->vdev_mg); 874 875 if (txg == 0) 876 spa_config_exit(spa, SCL_ALLOC, FTAG); 877 878 return (0); 879 } 880 881 void 882 vdev_metaslab_fini(vdev_t *vd) 883 { 884 uint64_t m; 885 uint64_t count = vd->vdev_ms_count; 886 887 if (vd->vdev_ms != NULL) { 888 metaslab_group_passivate(vd->vdev_mg); 889 for (m = 0; m < count; m++) 890 if (vd->vdev_ms[m] != NULL) 891 metaslab_fini(vd->vdev_ms[m]); 892 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 893 vd->vdev_ms = NULL; 894 } 895 } 896 897 typedef struct vdev_probe_stats { 898 boolean_t vps_readable; 899 boolean_t vps_writeable; 900 int vps_flags; 901 } vdev_probe_stats_t; 902 903 static void 904 vdev_probe_done(zio_t *zio) 905 { 906 spa_t *spa = zio->io_spa; 907 vdev_t *vd = zio->io_vd; 908 vdev_probe_stats_t *vps = zio->io_private; 909 910 ASSERT(vd->vdev_probe_zio != NULL); 911 912 if (zio->io_type == ZIO_TYPE_READ) { 913 if (zio->io_error == 0) 914 vps->vps_readable = 1; 915 if (zio->io_error == 0 && spa_writeable(spa)) { 916 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 917 zio->io_offset, zio->io_size, zio->io_data, 918 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 919 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 920 } else { 921 zio_buf_free(zio->io_data, zio->io_size); 922 } 923 } else if (zio->io_type == ZIO_TYPE_WRITE) { 924 if (zio->io_error == 0) 925 vps->vps_writeable = 1; 926 zio_buf_free(zio->io_data, zio->io_size); 927 } else if (zio->io_type == ZIO_TYPE_NULL) { 928 zio_t *pio; 929 930 vd->vdev_cant_read |= !vps->vps_readable; 931 vd->vdev_cant_write |= !vps->vps_writeable; 932 933 if (vdev_readable(vd) && 934 (vdev_writeable(vd) || !spa_writeable(spa))) { 935 zio->io_error = 0; 936 } else { 937 ASSERT(zio->io_error != 0); 938 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 939 spa, vd, NULL, 0, 0); 940 zio->io_error = ENXIO; 941 } 942 943 mutex_enter(&vd->vdev_probe_lock); 944 ASSERT(vd->vdev_probe_zio == zio); 945 
vd->vdev_probe_zio = NULL; 946 mutex_exit(&vd->vdev_probe_lock); 947 948 while ((pio = zio_walk_parents(zio)) != NULL) 949 if (!vdev_accessible(vd, pio)) 950 pio->io_error = ENXIO; 951 952 kmem_free(vps, sizeof (*vps)); 953 } 954 } 955 956 /* 957 * Determine whether this device is accessible by reading and writing 958 * to several known locations: the pad regions of each vdev label 959 * but the first (which we leave alone in case it contains a VTOC). 960 */ 961 zio_t * 962 vdev_probe(vdev_t *vd, zio_t *zio) 963 { 964 spa_t *spa = vd->vdev_spa; 965 vdev_probe_stats_t *vps = NULL; 966 zio_t *pio; 967 968 ASSERT(vd->vdev_ops->vdev_op_leaf); 969 970 /* 971 * Don't probe the probe. 972 */ 973 if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 974 return (NULL); 975 976 /* 977 * To prevent 'probe storms' when a device fails, we create 978 * just one probe i/o at a time. All zios that want to probe 979 * this vdev will become parents of the probe io. 980 */ 981 mutex_enter(&vd->vdev_probe_lock); 982 983 if ((pio = vd->vdev_probe_zio) == NULL) { 984 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 985 986 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 987 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | 988 ZIO_FLAG_TRYHARD; 989 990 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 991 /* 992 * vdev_cant_read and vdev_cant_write can only 993 * transition from TRUE to FALSE when we have the 994 * SCL_ZIO lock as writer; otherwise they can only 995 * transition from FALSE to TRUE. This ensures that 996 * any zio looking at these values can assume that 997 * failures persist for the life of the I/O. That's 998 * important because when a device has intermittent 999 * connectivity problems, we want to ensure that 1000 * they're ascribed to the device (ENXIO) and not 1001 * the zio (EIO). 1002 * 1003 * Since we hold SCL_ZIO as writer here, clear both 1004 * values so the probe can reevaluate from first 1005 * principles. 
1006 */ 1007 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 1008 vd->vdev_cant_read = B_FALSE; 1009 vd->vdev_cant_write = B_FALSE; 1010 } 1011 1012 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1013 vdev_probe_done, vps, 1014 vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); 1015 1016 /* 1017 * We can't change the vdev state in this context, so we 1018 * kick off an async task to do it on our behalf. 1019 */ 1020 if (zio != NULL) { 1021 vd->vdev_probe_wanted = B_TRUE; 1022 spa_async_request(spa, SPA_ASYNC_PROBE); 1023 } 1024 } 1025 1026 if (zio != NULL) 1027 zio_add_child(zio, pio); 1028 1029 mutex_exit(&vd->vdev_probe_lock); 1030 1031 if (vps == NULL) { 1032 ASSERT(zio != NULL); 1033 return (NULL); 1034 } 1035 1036 for (int l = 1; l < VDEV_LABELS; l++) { 1037 zio_nowait(zio_read_phys(pio, vd, 1038 vdev_label_offset(vd->vdev_psize, l, 1039 offsetof(vdev_label_t, vl_pad2)), 1040 VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), 1041 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1042 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1043 } 1044 1045 if (zio == NULL) 1046 return (pio); 1047 1048 zio_nowait(pio); 1049 return (NULL); 1050 } 1051 1052 static void 1053 vdev_open_child(void *arg) 1054 { 1055 vdev_t *vd = arg; 1056 1057 vd->vdev_open_thread = curthread; 1058 vd->vdev_open_error = vdev_open(vd); 1059 vd->vdev_open_thread = NULL; 1060 } 1061 1062 boolean_t 1063 vdev_uses_zvols(vdev_t *vd) 1064 { 1065 if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, 1066 strlen(ZVOL_DIR)) == 0) 1067 return (B_TRUE); 1068 for (int c = 0; c < vd->vdev_children; c++) 1069 if (vdev_uses_zvols(vd->vdev_child[c])) 1070 return (B_TRUE); 1071 return (B_FALSE); 1072 } 1073 1074 void 1075 vdev_open_children(vdev_t *vd) 1076 { 1077 taskq_t *tq; 1078 int children = vd->vdev_children; 1079 1080 /* 1081 * in order to handle pools on top of zvols, do the opens 1082 * in a single thread so that the same thread holds the 1083 * spa_namespace_lock 1084 */ 1085 if (vdev_uses_zvols(vd)) { 1086 for (int c = 0; 
c < children; c++) 1087 vd->vdev_child[c]->vdev_open_error = 1088 vdev_open(vd->vdev_child[c]); 1089 return; 1090 } 1091 tq = taskq_create("vdev_open", children, minclsyspri, 1092 children, children, TASKQ_PREPOPULATE); 1093 1094 for (int c = 0; c < children; c++) 1095 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1096 TQ_SLEEP) != NULL); 1097 1098 taskq_destroy(tq); 1099 } 1100 1101 /* 1102 * Prepare a virtual device for access. 1103 */ 1104 int 1105 vdev_open(vdev_t *vd) 1106 { 1107 spa_t *spa = vd->vdev_spa; 1108 int error; 1109 uint64_t osize = 0; 1110 uint64_t max_osize = 0; 1111 uint64_t asize, max_asize, psize; 1112 uint64_t ashift = 0; 1113 1114 ASSERT(vd->vdev_open_thread == curthread || 1115 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1116 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 1117 vd->vdev_state == VDEV_STATE_CANT_OPEN || 1118 vd->vdev_state == VDEV_STATE_OFFLINE); 1119 1120 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1121 vd->vdev_cant_read = B_FALSE; 1122 vd->vdev_cant_write = B_FALSE; 1123 vd->vdev_min_asize = vdev_get_min_asize(vd); 1124 1125 /* 1126 * If this vdev is not removed, check its fault status. If it's 1127 * faulted, bail out of the open. 1128 */ 1129 if (!vd->vdev_removed && vd->vdev_faulted) { 1130 ASSERT(vd->vdev_children == 0); 1131 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1132 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1133 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1134 vd->vdev_label_aux); 1135 return (ENXIO); 1136 } else if (vd->vdev_offline) { 1137 ASSERT(vd->vdev_children == 0); 1138 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1139 return (ENXIO); 1140 } 1141 1142 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); 1143 1144 /* 1145 * Reset the vdev_reopening flag so that we actually close 1146 * the vdev on error. 
1147 */ 1148 vd->vdev_reopening = B_FALSE; 1149 if (zio_injection_enabled && error == 0) 1150 error = zio_handle_device_injection(vd, NULL, ENXIO); 1151 1152 if (error) { 1153 if (vd->vdev_removed && 1154 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 1155 vd->vdev_removed = B_FALSE; 1156 1157 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1158 vd->vdev_stat.vs_aux); 1159 return (error); 1160 } 1161 1162 vd->vdev_removed = B_FALSE; 1163 1164 /* 1165 * Recheck the faulted flag now that we have confirmed that 1166 * the vdev is accessible. If we're faulted, bail. 1167 */ 1168 if (vd->vdev_faulted) { 1169 ASSERT(vd->vdev_children == 0); 1170 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1171 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1172 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1173 vd->vdev_label_aux); 1174 return (ENXIO); 1175 } 1176 1177 if (vd->vdev_degraded) { 1178 ASSERT(vd->vdev_children == 0); 1179 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1180 VDEV_AUX_ERR_EXCEEDED); 1181 } else { 1182 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 1183 } 1184 1185 /* 1186 * For hole or missing vdevs we just return success. 
1187 */ 1188 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) 1189 return (0); 1190 1191 for (int c = 0; c < vd->vdev_children; c++) { 1192 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1193 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1194 VDEV_AUX_NONE); 1195 break; 1196 } 1197 } 1198 1199 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 1200 max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); 1201 1202 if (vd->vdev_children == 0) { 1203 if (osize < SPA_MINDEVSIZE) { 1204 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1205 VDEV_AUX_TOO_SMALL); 1206 return (EOVERFLOW); 1207 } 1208 psize = osize; 1209 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 1210 max_asize = max_osize - (VDEV_LABEL_START_SIZE + 1211 VDEV_LABEL_END_SIZE); 1212 } else { 1213 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1214 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1215 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1216 VDEV_AUX_TOO_SMALL); 1217 return (EOVERFLOW); 1218 } 1219 psize = 0; 1220 asize = osize; 1221 max_asize = max_osize; 1222 } 1223 1224 vd->vdev_psize = psize; 1225 1226 /* 1227 * Make sure the allocatable size hasn't shrunk. 1228 */ 1229 if (asize < vd->vdev_min_asize) { 1230 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1231 VDEV_AUX_BAD_LABEL); 1232 return (EINVAL); 1233 } 1234 1235 if (vd->vdev_asize == 0) { 1236 /* 1237 * This is the first-ever open, so use the computed values. 1238 * For testing purposes, a higher ashift can be requested. 1239 */ 1240 vd->vdev_asize = asize; 1241 vd->vdev_max_asize = max_asize; 1242 vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 1243 } else { 1244 /* 1245 * Make sure the alignment requirement hasn't increased. 
1246 */ 1247 if (ashift > vd->vdev_top->vdev_ashift) { 1248 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1249 VDEV_AUX_BAD_LABEL); 1250 return (EINVAL); 1251 } 1252 vd->vdev_max_asize = max_asize; 1253 } 1254 1255 /* 1256 * If all children are healthy and the asize has increased, 1257 * then we've experienced dynamic LUN growth. If automatic 1258 * expansion is enabled then use the additional space. 1259 */ 1260 if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && 1261 (vd->vdev_expanding || spa->spa_autoexpand)) 1262 vd->vdev_asize = asize; 1263 1264 vdev_set_min_asize(vd); 1265 1266 /* 1267 * Ensure we can issue some IO before declaring the 1268 * vdev open for business. 1269 */ 1270 if (vd->vdev_ops->vdev_op_leaf && 1271 (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 1272 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1273 VDEV_AUX_ERR_EXCEEDED); 1274 return (error); 1275 } 1276 1277 /* 1278 * If a leaf vdev has a DTL, and seems healthy, then kick off a 1279 * resilver. But don't do this if we are doing a reopen for a scrub, 1280 * since this would just restart the scrub we are already doing. 1281 */ 1282 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 1283 vdev_resilver_needed(vd, NULL, NULL)) 1284 spa_async_request(spa, SPA_ASYNC_RESILVER); 1285 1286 return (0); 1287 } 1288 1289 /* 1290 * Called once the vdevs are all opened, this routine validates the label 1291 * contents. This needs to be done before vdev_load() so that we don't 1292 * inadvertently do repair I/Os to the wrong device. 1293 * 1294 * If 'strict' is false ignore the spa guid check. This is necessary because 1295 * if the machine crashed during a re-guid the new guid might have been written 1296 * to all of the vdev labels, but not the cached config. The strict check 1297 * will be performed when the pool is opened again using the mos config. 
1298 * 1299 * This function will only return failure if one of the vdevs indicates that it 1300 * has since been destroyed or exported. This is only possible if 1301 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1302 * will be updated but the function will return 0. 1303 */ 1304 int 1305 vdev_validate(vdev_t *vd, boolean_t strict) 1306 { 1307 spa_t *spa = vd->vdev_spa; 1308 nvlist_t *label; 1309 uint64_t guid = 0, top_guid; 1310 uint64_t state; 1311 1312 for (int c = 0; c < vd->vdev_children; c++) 1313 if (vdev_validate(vd->vdev_child[c], strict) != 0) 1314 return (EBADF); 1315 1316 /* 1317 * If the device has already failed, or was marked offline, don't do 1318 * any further validation. Otherwise, label I/O will fail and we will 1319 * overwrite the previous state. 1320 */ 1321 if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 1322 uint64_t aux_guid = 0; 1323 nvlist_t *nvl; 1324 1325 if ((label = vdev_label_read_config(vd)) == NULL) { 1326 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1327 VDEV_AUX_BAD_LABEL); 1328 return (0); 1329 } 1330 1331 /* 1332 * Determine if this vdev has been split off into another 1333 * pool. If so, then refuse to open it. 
1334 */ 1335 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 1336 &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 1337 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1338 VDEV_AUX_SPLIT_POOL); 1339 nvlist_free(label); 1340 return (0); 1341 } 1342 1343 if (strict && (nvlist_lookup_uint64(label, 1344 ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || 1345 guid != spa_guid(spa))) { 1346 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1347 VDEV_AUX_CORRUPT_DATA); 1348 nvlist_free(label); 1349 return (0); 1350 } 1351 1352 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 1353 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 1354 &aux_guid) != 0) 1355 aux_guid = 0; 1356 1357 /* 1358 * If this vdev just became a top-level vdev because its 1359 * sibling was detached, it will have adopted the parent's 1360 * vdev guid -- but the label may or may not be on disk yet. 1361 * Fortunately, either version of the label will have the 1362 * same top guid, so if we're a top-level vdev, we can 1363 * safely compare to that instead. 1364 * 1365 * If we split this vdev off instead, then we also check the 1366 * original pool's guid. We don't want to consider the vdev 1367 * corrupt if it is partway through a split operation. 
1368 */ 1369 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 1370 &guid) != 0 || 1371 nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, 1372 &top_guid) != 0 || 1373 ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && 1374 (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { 1375 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1376 VDEV_AUX_CORRUPT_DATA); 1377 nvlist_free(label); 1378 return (0); 1379 } 1380 1381 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1382 &state) != 0) { 1383 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1384 VDEV_AUX_CORRUPT_DATA); 1385 nvlist_free(label); 1386 return (0); 1387 } 1388 1389 nvlist_free(label); 1390 1391 /* 1392 * If this is a verbatim import, no need to check the 1393 * state of the pool. 1394 */ 1395 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 1396 spa_load_state(spa) == SPA_LOAD_OPEN && 1397 state != POOL_STATE_ACTIVE) 1398 return (EBADF); 1399 1400 /* 1401 * If we were able to open and validate a vdev that was 1402 * previously marked permanently unavailable, clear that state 1403 * now. 1404 */ 1405 if (vd->vdev_not_present) 1406 vd->vdev_not_present = 0; 1407 } 1408 1409 return (0); 1410 } 1411 1412 /* 1413 * Close a virtual device. 1414 */ 1415 void 1416 vdev_close(vdev_t *vd) 1417 { 1418 spa_t *spa = vd->vdev_spa; 1419 vdev_t *pvd = vd->vdev_parent; 1420 1421 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1422 1423 /* 1424 * If our parent is reopening, then we are as well, unless we are 1425 * going offline. 1426 */ 1427 if (pvd != NULL && pvd->vdev_reopening) 1428 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 1429 1430 vd->vdev_ops->vdev_op_close(vd); 1431 1432 vdev_cache_purge(vd); 1433 1434 /* 1435 * We record the previous state before we close it, so that if we are 1436 * doing a reopen(), we don't generate FMA ereports if we notice that 1437 * it's still faulted. 
1438 */ 1439 vd->vdev_prevstate = vd->vdev_state; 1440 1441 if (vd->vdev_offline) 1442 vd->vdev_state = VDEV_STATE_OFFLINE; 1443 else 1444 vd->vdev_state = VDEV_STATE_CLOSED; 1445 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1446 } 1447 1448 void 1449 vdev_hold(vdev_t *vd) 1450 { 1451 spa_t *spa = vd->vdev_spa; 1452 1453 ASSERT(spa_is_root(spa)); 1454 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1455 return; 1456 1457 for (int c = 0; c < vd->vdev_children; c++) 1458 vdev_hold(vd->vdev_child[c]); 1459 1460 if (vd->vdev_ops->vdev_op_leaf) 1461 vd->vdev_ops->vdev_op_hold(vd); 1462 } 1463 1464 void 1465 vdev_rele(vdev_t *vd) 1466 { 1467 spa_t *spa = vd->vdev_spa; 1468 1469 ASSERT(spa_is_root(spa)); 1470 for (int c = 0; c < vd->vdev_children; c++) 1471 vdev_rele(vd->vdev_child[c]); 1472 1473 if (vd->vdev_ops->vdev_op_leaf) 1474 vd->vdev_ops->vdev_op_rele(vd); 1475 } 1476 1477 /* 1478 * Reopen all interior vdevs and any unopened leaves. We don't actually 1479 * reopen leaf vdevs which had previously been opened as they might deadlock 1480 * on the spa_config_lock. Instead we only obtain the leaf's physical size. 1481 * If the leaf has never been opened then open it, as usual. 1482 */ 1483 void 1484 vdev_reopen(vdev_t *vd) 1485 { 1486 spa_t *spa = vd->vdev_spa; 1487 1488 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1489 1490 /* set the reopening flag unless we're taking the vdev offline */ 1491 vd->vdev_reopening = !vd->vdev_offline; 1492 vdev_close(vd); 1493 (void) vdev_open(vd); 1494 1495 /* 1496 * Call vdev_validate() here to make sure we have the same device. 1497 * Otherwise, a device with an invalid label could be successfully 1498 * opened in response to vdev_reopen(). 
1499 */ 1500 if (vd->vdev_aux) { 1501 (void) vdev_validate_aux(vd); 1502 if (vdev_readable(vd) && vdev_writeable(vd) && 1503 vd->vdev_aux == &spa->spa_l2cache && 1504 !l2arc_vdev_present(vd)) 1505 l2arc_add_vdev(spa, vd); 1506 } else { 1507 (void) vdev_validate(vd, B_TRUE); 1508 } 1509 1510 /* 1511 * Reassess parent vdev's health. 1512 */ 1513 vdev_propagate_state(vd); 1514 } 1515 1516 int 1517 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1518 { 1519 int error; 1520 1521 /* 1522 * Normally, partial opens (e.g. of a mirror) are allowed. 1523 * For a create, however, we want to fail the request if 1524 * there are any components we can't open. 1525 */ 1526 error = vdev_open(vd); 1527 1528 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1529 vdev_close(vd); 1530 return (error ? error : ENXIO); 1531 } 1532 1533 /* 1534 * Recursively initialize all labels. 1535 */ 1536 if ((error = vdev_label_init(vd, txg, isreplacing ? 1537 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1538 vdev_close(vd); 1539 return (error); 1540 } 1541 1542 return (0); 1543 } 1544 1545 void 1546 vdev_metaslab_set_size(vdev_t *vd) 1547 { 1548 /* 1549 * Aim for roughly 200 metaslabs per vdev. 1550 */ 1551 vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); 1552 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1553 } 1554 1555 void 1556 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1557 { 1558 ASSERT(vd == vd->vdev_top); 1559 ASSERT(!vd->vdev_ishole); 1560 ASSERT(ISP2(flags)); 1561 ASSERT(spa_writeable(vd->vdev_spa)); 1562 1563 if (flags & VDD_METASLAB) 1564 (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1565 1566 if (flags & VDD_DTL) 1567 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1568 1569 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1570 } 1571 1572 /* 1573 * DTLs. 1574 * 1575 * A vdev's DTL (dirty time log) is the set of transaction groups for which 1576 * the vdev has less than perfect replication. 
 * There are four kinds of DTL:
 *
 *	DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 *	DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 *	DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 *	DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in more than 'maxfaults' children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
1609 */ 1610 void 1611 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1612 { 1613 space_map_t *sm = &vd->vdev_dtl[t]; 1614 1615 ASSERT(t < DTL_TYPES); 1616 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1617 ASSERT(spa_writeable(vd->vdev_spa)); 1618 1619 mutex_enter(sm->sm_lock); 1620 if (!space_map_contains(sm, txg, size)) 1621 space_map_add(sm, txg, size); 1622 mutex_exit(sm->sm_lock); 1623 } 1624 1625 boolean_t 1626 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1627 { 1628 space_map_t *sm = &vd->vdev_dtl[t]; 1629 boolean_t dirty = B_FALSE; 1630 1631 ASSERT(t < DTL_TYPES); 1632 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1633 1634 mutex_enter(sm->sm_lock); 1635 if (sm->sm_space != 0) 1636 dirty = space_map_contains(sm, txg, size); 1637 mutex_exit(sm->sm_lock); 1638 1639 return (dirty); 1640 } 1641 1642 boolean_t 1643 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) 1644 { 1645 space_map_t *sm = &vd->vdev_dtl[t]; 1646 boolean_t empty; 1647 1648 mutex_enter(sm->sm_lock); 1649 empty = (sm->sm_space == 0); 1650 mutex_exit(sm->sm_lock); 1651 1652 return (empty); 1653 } 1654 1655 /* 1656 * Reassess DTLs after a config change or scrub completion. 1657 */ 1658 void 1659 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 1660 { 1661 spa_t *spa = vd->vdev_spa; 1662 avl_tree_t reftree; 1663 int minref; 1664 1665 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1666 1667 for (int c = 0; c < vd->vdev_children; c++) 1668 vdev_dtl_reassess(vd->vdev_child[c], txg, 1669 scrub_txg, scrub_done); 1670 1671 if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) 1672 return; 1673 1674 if (vd->vdev_ops->vdev_op_leaf) { 1675 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 1676 1677 mutex_enter(&vd->vdev_dtl_lock); 1678 if (scrub_txg != 0 && 1679 (spa->spa_scrub_started || 1680 (scn && scn->scn_phys.scn_errors == 0))) { 1681 /* 1682 * We completed a scrub up to scrub_txg. 
If we 1683 * did it without rebooting, then the scrub dtl 1684 * will be valid, so excise the old region and 1685 * fold in the scrub dtl. Otherwise, leave the 1686 * dtl as-is if there was an error. 1687 * 1688 * There's little trick here: to excise the beginning 1689 * of the DTL_MISSING map, we put it into a reference 1690 * tree and then add a segment with refcnt -1 that 1691 * covers the range [0, scrub_txg). This means 1692 * that each txg in that range has refcnt -1 or 0. 1693 * We then add DTL_SCRUB with a refcnt of 2, so that 1694 * entries in the range [0, scrub_txg) will have a 1695 * positive refcnt -- either 1 or 2. We then convert 1696 * the reference tree into the new DTL_MISSING map. 1697 */ 1698 space_map_ref_create(&reftree); 1699 space_map_ref_add_map(&reftree, 1700 &vd->vdev_dtl[DTL_MISSING], 1); 1701 space_map_ref_add_seg(&reftree, 0, scrub_txg, -1); 1702 space_map_ref_add_map(&reftree, 1703 &vd->vdev_dtl[DTL_SCRUB], 2); 1704 space_map_ref_generate_map(&reftree, 1705 &vd->vdev_dtl[DTL_MISSING], 1); 1706 space_map_ref_destroy(&reftree); 1707 } 1708 space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 1709 space_map_walk(&vd->vdev_dtl[DTL_MISSING], 1710 space_map_add, &vd->vdev_dtl[DTL_PARTIAL]); 1711 if (scrub_done) 1712 space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 1713 space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 1714 if (!vdev_readable(vd)) 1715 space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 1716 else 1717 space_map_walk(&vd->vdev_dtl[DTL_MISSING], 1718 space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); 1719 mutex_exit(&vd->vdev_dtl_lock); 1720 1721 if (txg != 0) 1722 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1723 return; 1724 } 1725 1726 mutex_enter(&vd->vdev_dtl_lock); 1727 for (int t = 0; t < DTL_TYPES; t++) { 1728 /* account for child's outage in parent's missing map */ 1729 int s = (t == DTL_MISSING) ? 
DTL_OUTAGE: t; 1730 if (t == DTL_SCRUB) 1731 continue; /* leaf vdevs only */ 1732 if (t == DTL_PARTIAL) 1733 minref = 1; /* i.e. non-zero */ 1734 else if (vd->vdev_nparity != 0) 1735 minref = vd->vdev_nparity + 1; /* RAID-Z */ 1736 else 1737 minref = vd->vdev_children; /* any kind of mirror */ 1738 space_map_ref_create(&reftree); 1739 for (int c = 0; c < vd->vdev_children; c++) { 1740 vdev_t *cvd = vd->vdev_child[c]; 1741 mutex_enter(&cvd->vdev_dtl_lock); 1742 space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); 1743 mutex_exit(&cvd->vdev_dtl_lock); 1744 } 1745 space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); 1746 space_map_ref_destroy(&reftree); 1747 } 1748 mutex_exit(&vd->vdev_dtl_lock); 1749 } 1750 1751 static int 1752 vdev_dtl_load(vdev_t *vd) 1753 { 1754 spa_t *spa = vd->vdev_spa; 1755 space_map_obj_t *smo = &vd->vdev_dtl_smo; 1756 objset_t *mos = spa->spa_meta_objset; 1757 dmu_buf_t *db; 1758 int error; 1759 1760 ASSERT(vd->vdev_children == 0); 1761 1762 if (smo->smo_object == 0) 1763 return (0); 1764 1765 ASSERT(!vd->vdev_ishole); 1766 1767 if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) 1768 return (error); 1769 1770 ASSERT3U(db->db_size, >=, sizeof (*smo)); 1771 bcopy(db->db_data, smo, sizeof (*smo)); 1772 dmu_buf_rele(db, FTAG); 1773 1774 mutex_enter(&vd->vdev_dtl_lock); 1775 error = space_map_load(&vd->vdev_dtl[DTL_MISSING], 1776 NULL, SM_ALLOC, smo, mos); 1777 mutex_exit(&vd->vdev_dtl_lock); 1778 1779 return (error); 1780 } 1781 1782 void 1783 vdev_dtl_sync(vdev_t *vd, uint64_t txg) 1784 { 1785 spa_t *spa = vd->vdev_spa; 1786 space_map_obj_t *smo = &vd->vdev_dtl_smo; 1787 space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; 1788 objset_t *mos = spa->spa_meta_objset; 1789 space_map_t smsync; 1790 kmutex_t smlock; 1791 dmu_buf_t *db; 1792 dmu_tx_t *tx; 1793 1794 ASSERT(!vd->vdev_ishole); 1795 1796 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1797 1798 if (vd->vdev_detached) { 1799 if (smo->smo_object != 0) { 1800 int 
err = dmu_object_free(mos, smo->smo_object, tx); 1801 ASSERT3U(err, ==, 0); 1802 smo->smo_object = 0; 1803 } 1804 dmu_tx_commit(tx); 1805 return; 1806 } 1807 1808 if (smo->smo_object == 0) { 1809 ASSERT(smo->smo_objsize == 0); 1810 ASSERT(smo->smo_alloc == 0); 1811 smo->smo_object = dmu_object_alloc(mos, 1812 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 1813 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 1814 ASSERT(smo->smo_object != 0); 1815 vdev_config_dirty(vd->vdev_top); 1816 } 1817 1818 mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); 1819 1820 space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, 1821 &smlock); 1822 1823 mutex_enter(&smlock); 1824 1825 mutex_enter(&vd->vdev_dtl_lock); 1826 space_map_walk(sm, space_map_add, &smsync); 1827 mutex_exit(&vd->vdev_dtl_lock); 1828 1829 space_map_truncate(smo, mos, tx); 1830 space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); 1831 1832 space_map_destroy(&smsync); 1833 1834 mutex_exit(&smlock); 1835 mutex_destroy(&smlock); 1836 1837 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1838 dmu_buf_will_dirty(db, tx); 1839 ASSERT3U(db->db_size, >=, sizeof (*smo)); 1840 bcopy(smo, db->db_data, sizeof (*smo)); 1841 dmu_buf_rele(db, FTAG); 1842 1843 dmu_tx_commit(tx); 1844 } 1845 1846 /* 1847 * Determine whether the specified vdev can be offlined/detached/removed 1848 * without losing data. 1849 */ 1850 boolean_t 1851 vdev_dtl_required(vdev_t *vd) 1852 { 1853 spa_t *spa = vd->vdev_spa; 1854 vdev_t *tvd = vd->vdev_top; 1855 uint8_t cant_read = vd->vdev_cant_read; 1856 boolean_t required; 1857 1858 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1859 1860 if (vd == spa->spa_root_vdev || vd == tvd) 1861 return (B_TRUE); 1862 1863 /* 1864 * Temporarily mark the device as unreadable, and then determine 1865 * whether this results in any DTL outages in the top-level vdev. 1866 * If not, we can safely offline/detach/remove the device. 
1867 */ 1868 vd->vdev_cant_read = B_TRUE; 1869 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 1870 required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 1871 vd->vdev_cant_read = cant_read; 1872 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 1873 1874 if (!required && zio_injection_enabled) 1875 required = !!zio_handle_device_injection(vd, NULL, ECHILD); 1876 1877 return (required); 1878 } 1879 1880 /* 1881 * Determine if resilver is needed, and if so the txg range. 1882 */ 1883 boolean_t 1884 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 1885 { 1886 boolean_t needed = B_FALSE; 1887 uint64_t thismin = UINT64_MAX; 1888 uint64_t thismax = 0; 1889 1890 if (vd->vdev_children == 0) { 1891 mutex_enter(&vd->vdev_dtl_lock); 1892 if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && 1893 vdev_writeable(vd)) { 1894 space_seg_t *ss; 1895 1896 ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); 1897 thismin = ss->ss_start - 1; 1898 ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); 1899 thismax = ss->ss_end; 1900 needed = B_TRUE; 1901 } 1902 mutex_exit(&vd->vdev_dtl_lock); 1903 } else { 1904 for (int c = 0; c < vd->vdev_children; c++) { 1905 vdev_t *cvd = vd->vdev_child[c]; 1906 uint64_t cmin, cmax; 1907 1908 if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 1909 thismin = MIN(thismin, cmin); 1910 thismax = MAX(thismax, cmax); 1911 needed = B_TRUE; 1912 } 1913 } 1914 } 1915 1916 if (needed && minp) { 1917 *minp = thismin; 1918 *maxp = thismax; 1919 } 1920 return (needed); 1921 } 1922 1923 void 1924 vdev_load(vdev_t *vd) 1925 { 1926 /* 1927 * Recursively load all children. 1928 */ 1929 for (int c = 0; c < vd->vdev_children; c++) 1930 vdev_load(vd->vdev_child[c]); 1931 1932 /* 1933 * If this is a top-level vdev, initialize its metaslabs. 
1934 */ 1935 if (vd == vd->vdev_top && !vd->vdev_ishole && 1936 (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 1937 vdev_metaslab_init(vd, 0) != 0)) 1938 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1939 VDEV_AUX_CORRUPT_DATA); 1940 1941 /* 1942 * If this is a leaf vdev, load its DTL. 1943 */ 1944 if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 1945 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1946 VDEV_AUX_CORRUPT_DATA); 1947 } 1948 1949 /* 1950 * The special vdev case is used for hot spares and l2cache devices. Its 1951 * sole purpose it to set the vdev state for the associated vdev. To do this, 1952 * we make sure that we can open the underlying device, then try to read the 1953 * label, and make sure that the label is sane and that it hasn't been 1954 * repurposed to another pool. 1955 */ 1956 int 1957 vdev_validate_aux(vdev_t *vd) 1958 { 1959 nvlist_t *label; 1960 uint64_t guid, version; 1961 uint64_t state; 1962 1963 if (!vdev_readable(vd)) 1964 return (0); 1965 1966 if ((label = vdev_label_read_config(vd)) == NULL) { 1967 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1968 VDEV_AUX_CORRUPT_DATA); 1969 return (-1); 1970 } 1971 1972 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 1973 version > SPA_VERSION || 1974 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 1975 guid != vd->vdev_guid || 1976 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 1977 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1978 VDEV_AUX_CORRUPT_DATA); 1979 nvlist_free(label); 1980 return (-1); 1981 } 1982 1983 /* 1984 * We don't actually check the pool state here. If it's in fact in 1985 * use by another pool, we update this fact on the fly when requested. 
1986 */ 1987 nvlist_free(label); 1988 return (0); 1989 } 1990 1991 void 1992 vdev_remove(vdev_t *vd, uint64_t txg) 1993 { 1994 spa_t *spa = vd->vdev_spa; 1995 objset_t *mos = spa->spa_meta_objset; 1996 dmu_tx_t *tx; 1997 1998 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1999 2000 if (vd->vdev_dtl_smo.smo_object) { 2001 ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); 2002 (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); 2003 vd->vdev_dtl_smo.smo_object = 0; 2004 } 2005 2006 if (vd->vdev_ms != NULL) { 2007 for (int m = 0; m < vd->vdev_ms_count; m++) { 2008 metaslab_t *msp = vd->vdev_ms[m]; 2009 2010 if (msp == NULL || msp->ms_smo.smo_object == 0) 2011 continue; 2012 2013 ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); 2014 (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); 2015 msp->ms_smo.smo_object = 0; 2016 } 2017 } 2018 2019 if (vd->vdev_ms_array) { 2020 (void) dmu_object_free(mos, vd->vdev_ms_array, tx); 2021 vd->vdev_ms_array = 0; 2022 vd->vdev_ms_shift = 0; 2023 } 2024 dmu_tx_commit(tx); 2025 } 2026 2027 void 2028 vdev_sync_done(vdev_t *vd, uint64_t txg) 2029 { 2030 metaslab_t *msp; 2031 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2032 2033 ASSERT(!vd->vdev_ishole); 2034 2035 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2036 metaslab_sync_done(msp, txg); 2037 2038 if (reassess) 2039 metaslab_sync_reassess(vd->vdev_mg); 2040 } 2041 2042 void 2043 vdev_sync(vdev_t *vd, uint64_t txg) 2044 { 2045 spa_t *spa = vd->vdev_spa; 2046 vdev_t *lvd; 2047 metaslab_t *msp; 2048 dmu_tx_t *tx; 2049 2050 ASSERT(!vd->vdev_ishole); 2051 2052 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 2053 ASSERT(vd == vd->vdev_top); 2054 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2055 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2056 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 2057 ASSERT(vd->vdev_ms_array != 0); 2058 vdev_config_dirty(vd); 2059 dmu_tx_commit(tx); 2060 } 2061 2062 /* 2063 * 
Remove the metadata associated with this vdev once it's empty. 2064 */ 2065 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) 2066 vdev_remove(vd, txg); 2067 2068 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2069 metaslab_sync(msp, txg); 2070 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2071 } 2072 2073 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2074 vdev_dtl_sync(lvd, txg); 2075 2076 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2077 } 2078 2079 uint64_t 2080 vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 2081 { 2082 return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2083 } 2084 2085 /* 2086 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 2087 * not be opened, and no I/O is attempted. 2088 */ 2089 int 2090 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2091 { 2092 vdev_t *vd, *tvd; 2093 2094 spa_vdev_state_enter(spa, SCL_NONE); 2095 2096 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2097 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2098 2099 if (!vd->vdev_ops->vdev_op_leaf) 2100 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2101 2102 tvd = vd->vdev_top; 2103 2104 /* 2105 * We don't directly use the aux state here, but if we do a 2106 * vdev_reopen(), we need this value to be present to remember why we 2107 * were faulted. 2108 */ 2109 vd->vdev_label_aux = aux; 2110 2111 /* 2112 * Faulted state takes precedence over degraded. 2113 */ 2114 vd->vdev_delayed_close = B_FALSE; 2115 vd->vdev_faulted = 1ULL; 2116 vd->vdev_degraded = 0ULL; 2117 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2118 2119 /* 2120 * If this device has the only valid copy of the data, then 2121 * back off and simply mark the vdev as degraded instead. 
2122 */ 2123 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 2124 vd->vdev_degraded = 1ULL; 2125 vd->vdev_faulted = 0ULL; 2126 2127 /* 2128 * If we reopen the device and it's not dead, only then do we 2129 * mark it degraded. 2130 */ 2131 vdev_reopen(tvd); 2132 2133 if (vdev_readable(vd)) 2134 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2135 } 2136 2137 return (spa_vdev_state_exit(spa, vd, 0)); 2138 } 2139 2140 /* 2141 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2142 * user that something is wrong. The vdev continues to operate as normal as far 2143 * as I/O is concerned. 2144 */ 2145 int 2146 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2147 { 2148 vdev_t *vd; 2149 2150 spa_vdev_state_enter(spa, SCL_NONE); 2151 2152 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2153 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2154 2155 if (!vd->vdev_ops->vdev_op_leaf) 2156 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2157 2158 /* 2159 * If the vdev is already faulted, then don't do anything. 2160 */ 2161 if (vd->vdev_faulted || vd->vdev_degraded) 2162 return (spa_vdev_state_exit(spa, NULL, 0)); 2163 2164 vd->vdev_degraded = 1ULL; 2165 if (!vdev_is_dead(vd)) 2166 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2167 aux); 2168 2169 return (spa_vdev_state_exit(spa, vd, 0)); 2170 } 2171 2172 /* 2173 * Online the given vdev. If 'unspare' is set, it implies two things. First, 2174 * any attached spare device should be detached when the device finishes 2175 * resilvering. Second, the online should be treated like a 'test' online case, 2176 * so no FMA events are generated if the device fails to open. 
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	 * Clear both offline flags and latch the caller's request flags on
	 * the vdev for the duration of the reopen below.
	 */
	tvd = vd->vdev_top;
	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

	/* XXX - L2ARC 1.0 does not support expansion */
	if (!vd->vdev_aux) {
		/* Flag this vdev and every ancestor below the root. */
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
	}

	vdev_reopen(tvd);
	/* The request flags only apply to the reopen above; reset them. */
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = B_FALSE;
	}

	/* Report the post-reopen state to the caller if it asked for it. */
	if (newstate)
		*newstate = vd->vdev_state;
	/*
	 * Only request an unspare if the device came back alive and it is
	 * the first child of a spare vdev.
	 */
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

		/* XXX - L2ARC 1.0 does not support expansion */
		if (vd->vdev_aux)
			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}
	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Offline the leaf vdev identified by 'guid'.  This is the body of
 * vdev_offline(), which calls it with spa->spa_vdev_top_lock held.
 *
 * Every return path goes through spa_vdev_state_exit(), which is handed
 * ENODEV when the guid lookup fails, ENOTSUP for a non-leaf vdev, EBUSY when
 * the device cannot be spared (see below), the error from spa_offline_log(),
 * or 0.  If ZFS_OFFLINE_TEMPORARY is set in 'flags', vdev_tmpoffline is set
 * on the vdev.
 */
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	/*
	 * NOTE(review): presumably the config generation we expect to see
	 * after our own state exit/enter cycle in the slog path below if
	 * nobody else changed the config -- confirm against
	 * spa_vdev_state_exit().
	 */
	generation = spa->spa_config_generation + 1;

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device has the only valid copy of some data,
		 * don't allow it to be offlined. Log devices are always
		 * expendable.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_dtl_required(vd))
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

		/*
		 * If the top-level is a slog and it has had allocations
		 * then proceed.  We check that the vdev's metaslab group
		 * is not NULL since it's possible that we may have just
		 * added this vdev but not yet initialized its metaslabs.
		 */
		if (tvd->vdev_islog && mg != NULL) {
			/*
			 * Prevent any future allocations.
			 */
			metaslab_group_passivate(mg);
			(void) spa_vdev_state_exit(spa, vd, 0);

			/* Called with the state lock dropped. */
			error = spa_offline_log(spa);

			spa_vdev_state_enter(spa, SCL_ALLOC);

			/*
			 * Check to see if the config has changed.
			 * On error we give up; on a config change we
			 * restart from the guid lookup.  Either way the
			 * metaslab group is reactivated first.
			 */
			if (error || generation != spa->spa_config_generation) {
				metaslab_group_activate(mg);
				if (error)
					return (spa_vdev_state_exit(spa,
					    vd, error));
				(void) spa_vdev_state_exit(spa, vd, 0);
				goto top;
			}
			ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
		}

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If the top-level vdev is a log device then just offline
		 * it. Otherwise, if this action results in the top-level
		 * vdev becoming unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(tvd);

		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_is_dead(tvd)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(tvd);
			return (spa_vdev_state_exit(spa, NULL, EBUSY));
		}

		/*
		 * Add the device back into the metaslab rotor so that
		 * once we online the device it's open for business.
		 */
		if (tvd->vdev_islog && mg != NULL)
			metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Offline the vdev identified by 'guid'.  Serializes on
 * spa->spa_vdev_top_lock around vdev_offline_locked() and returns its result.
 */
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
		vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	/* Recurse so that the whole subtree below 'vd' is cleared. */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.  We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {

		/*
		 * When reopening in response to a clear event, it may be due to
		 * a fmadm repair request.  In this case, if the device is
		 * still broken, we want to still post the ereport again.
		 */
		vd->vdev_forcefault = B_TRUE;

		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

		/* Reopen the whole top-level vdev (or the root itself). */
		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

		vd->vdev_forcefault = B_FALSE;

		if (vd != rvd && vdev_writeable(vd->vdev_top))
			vdev_state_dirty(vd->vdev_top);

		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	 * When clearing a FMA-diagnosed fault, we always want to
	 * unspare the device, as we assume that the original spare was
	 * done in response to the FMA fault.
	 */
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;
}

/*
 * Return B_TRUE if the vdev cannot be used: its state is below
 * VDEV_STATE_DEGRADED, it is a hole, or it uses the 'missing' ops vector.
 */
boolean_t
vdev_is_dead(vdev_t *vd)
{
	/*
	 * Holes and missing devices are always considered "dead".
	 * This simplifies the code since we don't have to check for
	 * these types of devices in the various code paths.
	 * Instead we rely on the fact that we skip over dead devices
	 * before issuing I/O to them.
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
	    vd->vdev_ops == &vdev_missing_ops);
}

/*
 * Return B_TRUE if the vdev is not dead and not flagged vdev_cant_read.
 */
boolean_t
vdev_readable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}

/*
 * Return B_TRUE if the vdev is not dead and not flagged vdev_cant_write.
 */
boolean_t
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
}

/*
 * Return B_TRUE if new allocations may be directed at this vdev.  Unlike
 * vdev_writeable(), a vdev in VDEV_STATE_CLOSED is accepted here.
 */
boolean_t
vdev_allocatable(vdev_t *vd)
{
	uint64_t state = vd->vdev_state;

	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
	 * fails to reopen then we'll catch it later when we're holding
	 * the proper locks.  Note that we have to get the vdev state
	 * in a local variable because although it changes atomically,
	 * we're asking two separate questions about it.
	 */
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write && !vd->vdev_ishole);
}

/*
 * Return B_TRUE if 'zio' may be issued to 'vd' (which must be zio->io_vd).
 * A dead vdev, or one with vdev_remove_wanted set, is never accessible.
 * Otherwise reads are refused when vdev_cant_read is set, writes when
 * vdev_cant_write is set, and every other zio type is allowed.
 */
boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
		return (B_FALSE);

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);

	return (B_TRUE);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;

	/* Snapshot the stats under the lock, then fix up derived fields. */
	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	/* Report time elapsed since the stored timestamp, not the raw value. */
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_min_asize(vd);
	/* For leaves, add back the space taken by the vdev labels. */
	if (vd->vdev_ops->vdev_op_leaf)
		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			/*
			 * NOTE(review): this takes the root's stat lock
			 * (vd == rvd here), not cvd's, while reading and
			 * updating the child's stats -- confirm intended.
			 */
			mutex_enter(&vd->vdev_stat_lock);
			for (int t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			cvs->vs_scan_removing = cvd->vdev_removing;
			mutex_exit(&vd->vdev_stat_lock);
		}
	}
}

/*
 * Zero the space accounting fields (vs_space, vs_dspace, vs_alloc) of the
 * vdev's stats.  The I/O and error counters are left untouched.
 */
void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Reset vs_scan_processed to zero on this vdev and, recursively, on all of
 * its descendants.
 */
void
vdev_scan_stat_init(vdev_t *vd)
{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_scan_stat_init(vd->vdev_child[c]);

	mutex_enter(&vd->vdev_stat_lock);
	vs->vs_scan_processed = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Account a zio against the statistics of its vdev (or of the root vdev if
 * the zio has no io_vd).  'psize' is the byte count credited to the op.
 *
 * Successful i/o bumps the per-type op and byte counters, plus the scan and
 * self-heal byte counters for repair i/o.  Failed i/o bumps the read,
 * write or checksum error counters and, for qualifying writes, dirties the
 * DTLs so the missed txg is recorded.
 */
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				dsl_scan_phys_t *scn_phys =
				    &spa->spa_dsl_pool->dp_scan->scn_phys;
				uint64_t *processed = &scn_phys->scn_processed;

				/* XXX cleanup? */
				/*
				 * The pool-wide scan counter is bumped only
				 * for leaves; the per-vdev counter is bumped
				 * at every level.
				 */
				if (vd->vdev_ops->vdev_op_leaf)
					atomic_add_64(processed, psize);
				vs->vs_scan_processed += psize;
			}

			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		vs->vs_ops[type]++;
		vs->vs_bytes[type] += psize;

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	/* Everything below handles failed i/o (io_error != 0). */
	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent log writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	/* Errors are only charged to vdevs that are not already dead. */
	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
		if (zio->io_error == ECKSUM)
			vs->vs_checksum_errors++;
		else
			vs->vs_read_errors++;
	}
	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
		vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	if (type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCAN_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born.  In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			/* Already recorded as missing; nothing more to do. */
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			/* Mark the leaf and every ancestor below the root. */
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}

/*
 * Update the in-core space usage stats for this vdev, its metaslab class,
 * and the root vdev.
 *
 * 'vd' must be a top-level vdev.  The three deltas are signed and are added
 * to vs_alloc, (passed through to the metaslab class), and vs_space
 * respectively; the deflated space delta is derived from 'space_delta'.
 * The root vdev's stats are only updated when the vdev belongs to the
 * pool's normal metaslab class.
 */
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_class_t *mc = mg ? mg->mg_class : NULL;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * childrens', thus not accurate enough for us.
	 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	if (mc == spa_normal_class(spa)) {
		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}

	if (mc != NULL) {
		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		metaslab_class_space_update(mc,
		    alloc_delta, defer_delta, space_delta, dspace_delta);
	}
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_writeable(spa));

	/*
	 * If this is an aux vdev (as with l2cache and spare devices), then we
	 * update the vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		/* Find this vdev's index within the aux vdev array. */
		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		if (c == sav->sav_count) {
			/*
			 * We're being removed.  There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);
			return;
		}

		sav->sav_sync = B_TRUE;

		/*
		 * The aux config holds either an l2cache array or a spares
		 * array; try the former and fall back to the latter.
		 */
		if (nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
		}

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

		return;
	}

	/*
	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		/* Holes and vdevs already on the list are not (re)inserted. */
		if (!list_link_active(&vd->vdev_config_dirty_node) &&
		    !vd->vdev_ishole)
			list_insert_head(&spa->spa_config_dirty_list, vd);
	}
}

/*
 * Remove a vdev from the spa's config dirty list.  The vdev must currently
 * be on the list, and the caller must satisfy the same SCL_CONFIG locking
 * rule that vdev_config_dirty() asserts.
 */
void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
}

/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty().
 * We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
void
vdev_state_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_writeable(spa));
	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock.  The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	/* Holes and vdevs already on the list are not (re)inserted. */
	if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
		list_insert_head(&spa->spa_state_dirty_list, vd);
}

/*
 * Remove a vdev from the spa's state dirty list.  The vdev must currently
 * be on the list, and the caller must satisfy the same SCL_STATE locking
 * rule that vdev_state_dirty() asserts.
 */
void
vdev_state_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 *
 * For an interior vdev, count how many children are faulted and how many
 * are degraded, hand those counts to the vdev's vdev_op_state_change
 * callback, and then repeat for the parent, all the way up the tree.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes into the decision.
			 */
			if (child->vdev_ishole)
				continue;

			/*
			 * An unreadable child always counts as faulted; an
			 * unwriteable one only does if the pool is writeable.
			 */
			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	/* No state transition: just refresh the aux reason and return. */
	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	/* Remember the old state; it is passed along with any ereport. */
	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	/*
	 * If we have brought this vdev back into service, we need
	 * to notify fmd so that it can gracefully repair any outstanding
	 * cases due to a missing device.  We do this in all cases, even those
	 * that probably don't correlate to a repaired fault.  This is sure to
	 * catch all cases, and we let the zfs-retire agent sort it out.  If
	 * this is a transient state it's OK, as the retire agent will
	 * double-check the state of the vdev before repairing it.
	 */
	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
	    vd->vdev_prevstate != state)
		zfs_post_state_change(spa, vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			/* Map the aux reason onto an ereport class. */
			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool. Currently, we do not support RAID-Z or partial configuration.
 * In addition, only a single top-level vdev is allowed and none of the leaves
 * can be wholedisks.
3043 */ 3044 boolean_t 3045 vdev_is_bootable(vdev_t *vd) 3046 { 3047 if (!vd->vdev_ops->vdev_op_leaf) { 3048 char *vdev_type = vd->vdev_ops->vdev_op_type; 3049 3050 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3051 vd->vdev_children > 1) { 3052 return (B_FALSE); 3053 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 3054 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 3055 return (B_FALSE); 3056 } 3057 } else if (vd->vdev_wholedisk == 1) { 3058 return (B_FALSE); 3059 } 3060 3061 for (int c = 0; c < vd->vdev_children; c++) { 3062 if (!vdev_is_bootable(vd->vdev_child[c])) 3063 return (B_FALSE); 3064 } 3065 return (B_TRUE); 3066 } 3067 3068 /* 3069 * Load the state from the original vdev tree (ovd) which 3070 * we've retrieved from the MOS config object. If the original 3071 * vdev was offline or faulted then we transfer that state to the 3072 * device in the current vdev tree (nvd). 3073 */ 3074 void 3075 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3076 { 3077 spa_t *spa = nvd->vdev_spa; 3078 3079 ASSERT(nvd->vdev_top->vdev_islog); 3080 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3081 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3082 3083 for (int c = 0; c < nvd->vdev_children; c++) 3084 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3085 3086 if (nvd->vdev_ops->vdev_op_leaf) { 3087 /* 3088 * Restore the persistent vdev state 3089 */ 3090 nvd->vdev_offline = ovd->vdev_offline; 3091 nvd->vdev_faulted = ovd->vdev_faulted; 3092 nvd->vdev_degraded = ovd->vdev_degraded; 3093 nvd->vdev_removed = ovd->vdev_removed; 3094 } 3095 } 3096 3097 /* 3098 * Determine if a log device has valid content. If the vdev was 3099 * removed or faulted in the MOS config then we know that 3100 * the content on the log device has already been written to the pool. 
3101 */ 3102 boolean_t 3103 vdev_log_state_valid(vdev_t *vd) 3104 { 3105 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3106 !vd->vdev_removed) 3107 return (B_TRUE); 3108 3109 for (int c = 0; c < vd->vdev_children; c++) 3110 if (vdev_log_state_valid(vd->vdev_child[c])) 3111 return (B_TRUE); 3112 3113 return (B_FALSE); 3114 } 3115 3116 /* 3117 * Expand a vdev if possible. 3118 */ 3119 void 3120 vdev_expand(vdev_t *vd, uint64_t txg) 3121 { 3122 ASSERT(vd->vdev_top == vd); 3123 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3124 3125 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3126 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3127 vdev_config_dirty(vd); 3128 } 3129 } 3130 3131 /* 3132 * Split a vdev. 3133 */ 3134 void 3135 vdev_split(vdev_t *vd) 3136 { 3137 vdev_t *cvd, *pvd = vd->vdev_parent; 3138 3139 vdev_remove_child(pvd, vd); 3140 vdev_compact_children(pvd); 3141 3142 cvd = pvd->vdev_child[0]; 3143 if (pvd->vdev_children == 1) { 3144 vdev_remove_parent(cvd); 3145 cvd->vdev_splitting = B_TRUE; 3146 } 3147 vdev_propagate_state(cvd); 3148 } 3149