/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
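 * This brings up everything spa_load() and spa_sync() rely on: the
 * per-type I/O taskqs, the normal metaslab class, the traverse lock,
 * the dirty vdev list, and the error-list AVL trees.  spa_deactivate()
 * is its exact inverse.
 *
 * A sketch of the usual pairing (as in spa_open_common() and
 * spa_import()):
 *
 *	spa = spa_add(pool);
 *	spa_activate(spa);
 *	error = spa_load(spa, config, state, mosconfig);
 *	...
 *	spa_unload(spa);
 *	spa_deactivate(spa);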
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
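	 * Cycling the config lock as writer blocks us until every I/O
	 * that still holds it as reader has drained.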
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	spa->spa_async_suspended = 0;
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	uint64_t config_cache_txg = spa->spa_config_txg;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t pool_guid;
	zio_t *zio;

	spa->spa_load_state = state;
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (rvd == NULL) {
		error = EINVAL;
		goto out;
	}

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > UBERBLOCK_VERSION) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
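	 * From here on, failures are reported by marking the root vdev
	 * CANT_OPEN with VDEV_AUX_CORRUPT_DATA, so callers can tell a
	 * damaged pool from a missing one.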
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object, FTAG, &db));
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the vdev state for all top level vdevs.  We need to grab the
	 * config lock because all label I/O is done with the
	 * ZIO_FLAG_CONFIG_HELD flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	if ((error = vdev_load(rvd)) != 0) {
		spa_config_exit(spa, FTAG);
		goto out;
	}
	spa_config_exit(spa, FTAG);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Claim log blocks that haven't been committed yet, and update all
	 * top-level vdevs to sync any config changes found in vdev_load().
	 * This must all happen in a single txg.
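	 * (Claiming marks the intent-log blocks as allocated so they are
	 * neither leaked nor overwritten before the log can be replayed.)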
	 */
	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		int c;
		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		vdev_config_dirty(rvd);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale relative to the mosconfig,
		 * sync the config cache.
		 */
		if (config_cache_txg != spa->spa_config_txg)
			spa_config_sync();

		/*
		 * If we have top-level vdevs that were added but have
		 * not yet been prepared for allocation, do that now.
		 * (It's safe now because the config cache is up to date,
		 * so it will be able to translate the new DVAs.)
		 * See comments in spa_vdev_add() for full details.
		 */
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			if (tvd->vdev_ms_array == 0) {
				uint64_t txg = spa_last_synced_txg(spa) + 1;
				ASSERT(tvd->vdev_ms_shift == 0);
				spa_config_enter(spa, RW_WRITER, FTAG);
				vdev_init(tvd, txg);
				vdev_config_dirty(tvd);
				spa_config_set(spa,
				    spa_config_generate(spa, rvd, txg, 0));
				spa_config_exit(spa, FTAG);
				txg_wait_synced(spa->spa_dsl_pool, txg);
				ASSERT(tvd->vdev_ms_shift != 0);
				ASSERT(tvd->vdev_ms_array != 0);
				spa_config_sync();
			}
		}
	}

	error = 0;
out:
	if (error)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config,
		    SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
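			 * (spa_config_sync() below rewrites the
			 * cache without this pool.)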
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL)
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
	spa_t *spa;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}
	spa = spa_add(pool);

	/*
	 * Allocate a new spa_t structure.
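	 * spa_add() above has already allocated it and linked it into the
	 * namespace; spa_activate() just brings up its taskqs, lists, and
	 * locks before we construct the vdev tree.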
	 */
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_ubsync = spa->spa_uberblock;

	error = spa_vdev_add(spa, nvroot);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (altroot != NULL) {
		spa->spa_root = spa_strdup(altroot);
		atomic_add_32(&spa_active_count, 1);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
	 * so that we don't try to open the pool if the config is damaged.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot != NULL) {
		atomic_add_32(&spa_active_count, 1);
		spa->spa_root = spa_strdup(altroot);
	}

	/*
	 * Initialize the config based on the in-core state.
	 */
	config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);

	spa_config_set(spa, config);

	/*
	 * Sync the configuration cache.
	 */
	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
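	 * (We've already dropped the namespace lock; spa_scrub() does its
	 * own locking via spa_scrub_lock.)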
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME);

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	/*
	 * Initialize the spa_t structure.
	 */
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
	 * so we don't try to open the pool if the config is damaged.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
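		 * (Injection references are the ones taken via
		 * spa_inject_addref() and tracked in spa_inject_ref.)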
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		if (spa->spa_root != NULL)
			atomic_add_32(&spa_active_count, -1);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa->spa_state = new_state;
			vdev_config_dirty(spa->spa_root_vdev);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, c0, children, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (rvd == NULL) {			/* spa_create() */
		rvd = vd;
		c0 = 0;
	} else {
		c0 = rvd->vdev_children;
	}

	ASSERT(spa->spa_root_vdev == rvd);

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	children = vd->vdev_children;

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < children; c++) {
		tvd = vd->vdev_child[c];
		if (vd != rvd) {
			vdev_remove_child(vd, tvd);
			tvd->vdev_id = c0 + c;
			vdev_add_child(rvd, tvd);
		}
		vdev_config_dirty(tvd);
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * initialize the metaslabs; and sync the config cache again.
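	 * (A top-level vdev whose vdev_ms_array is still zero is the
	 * on-disk marker of this intermediate state.)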
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	if (vd != rvd) {
		(void) spa_vdev_exit(spa, vd, txg, 0);
		txg = spa_vdev_enter(spa);
		vd = NULL;
	}

	/*
	 * Now that the config is safely on disk, we can use the new space.
	 */
	for (c = 0; c < children; c++) {
		tvd = rvd->vdev_child[c0 + c];
		ASSERT(tvd->vdev_ms_array == 0);
		vdev_init(tvd, txg);
		vdev_config_dirty(tvd);
	}

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
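	 * (The "<path>/old" name written below no longer resolves to a
	 * real device, which is what makes oldvd unopenable.)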
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).  We ignore
	 * the error here because the detach still succeeded - we just weren't
	 * able to reinitialize the metaslabs.  This pool is in for a world of
	 * hurt, in any case.
	 */
	(void) vdev_metaslab_init(tvd, txg);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	vd->vdev_detached = B_TRUE;
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);

	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Find any device that's done replacing, so we can detach it.
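 * A replacing vdev is "done" once its second child (the new device) has
 * an empty DTL: nothing is missing and nothing remains to be scrubbed.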
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	uint64_t guid;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

void
spa_scrub_throttle(spa_t *spa, int direction)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_throttled += direction;
	ASSERT(spa->spa_scrub_throttled >= 0);
	if (spa->spa_scrub_throttled == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
	if (--spa->spa_scrub_inflight == 0) {
		cv_broadcast(&spa->spa_scrub_io_cv);
		ASSERT(spa->spa_scrub_throttled == 0);
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

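	/*
	 * Scrub I/O must never take down the pool: failures are tallied
	 * by spa_scrub_io_done() instead, so every scrub read may fail.
	 */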
	flags |= ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

	if (bc->bc_errno || vd == NULL) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		if (vd != NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_scrub_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	/*
	 * Keep track of how much data we've examined so that
	 * zpool(1M) status can make useful progress reports.
	 */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
		if (DVA_GET_GANG(&bp->blk_dva[0])) {
			/*
			 * Gang members may be spread across multiple vdevs,
			 * so the best we can do is look at the pool-wide DTL.
			 * XXX -- it would be better to change our allocation
			 * policy to ensure that this can't happen.
			 */
			vd = spa->spa_root_vdev;
		}
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
			    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
		}
	} else {
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	}

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);
	ASSERT(spa->spa_scrub_throttled == 0);

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;

		while (spa->spa_scrub_throttled > 0)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	if (spa->spa_scrub_stop)
		error = EINTR;

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error during
	 * a resilver, we won't resilver the data properly to the target.  But
	 * if the damage is permanent (more likely) we will resilver forever,
	 * which isn't really acceptable.  Since there is enough information for
	 * the user to know what has failed and why, this seems like a more
	 * tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;
	int advance = ADVANCE_PRE | ADVANCE_ZIL;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do.
		 */
		if (type == POOL_SCRUB_RESILVER)
			type = POOL_SCRUB_NONE;
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
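		 * (maxtxg was primed to spa_last_synced_txg(spa) + 1 above,
		 * so the MIN below can only narrow the interval.)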
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);

		advance |= ADVANCE_PRUNE;
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    advance, ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
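	 * Clearing spa_async_thread under spa_async_lock is what
	 * spa_async_suspend() and spa_async_dispatch() key off of.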
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL)
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	spa_config_set(spa, config);

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
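 * Each pass picks up whatever was dirtied during the previous pass,
 * and the loop exits on the first pass that dirties no vdevs.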
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		spa_errlog_sync(spa, txg);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 */
	VERIFY(0 == spa_sync_labels(spa, txg));

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since spa_sync_labels().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
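 * The open reference keeps the spa_t from being exported or destroyed
 * while we wait for the txg to sync.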
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

int
spa_busy(void)
{
	return (spa_active_count != 0);
}

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}