/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>

#ifdef	_KERNEL
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/*	ISSUE	INTR	*/
	{	1,	1	},	/* ZIO_TYPE_NULL */
	{	8,	8	},	/* ZIO_TYPE_READ */
	{	8,	8	},	/* ZIO_TYPE_WRITE */
	{	1,	1	},	/* ZIO_TYPE_FREE */
	{	1,	1	},	/* ZIO_TYPE_CLAIM */
	{	1,	1	},	/* ZIO_TYPE_IOCTL */
};

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
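
/*
 * Illustrative sketch (not part of this file's logic): a call such as
 *
 *	spa_prop_add_list(nvl, ZPOOL_PROP_SIZE, NULL, 1024, ZPROP_SRC_NONE);
 *
 * leaves an entry in 'nvl' of the shape
 *
 *	"size" -> { ZPROP_SOURCE = ZPROP_SRC_NONE, ZPROP_VALUE = 1024 }
 *
 * which is the layout that zprop-based consumers decode.
 */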

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t used;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		size = spa_get_space(spa);
		used = spa_get_alloc(spa);
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
		    size - used, src);

		cap = (size == 0) ? 0 : (used * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}
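
	/*
	 * Illustrative sketch of the standard ZAP cursor idiom used below
	 * (hypothetical objset 'os' and object 'obj'; not part of this
	 * function's logic):
	 *
	 *	for (zap_cursor_init(&zc, os, obj);
	 *	    zap_cursor_retrieve(&zc, &za) == 0;
	 *	    zap_cursor_advance(&zc))
	 *		... inspect za.za_name, za.za_integer_length ...
	 *	zap_cursor_fini(&zc);
	 */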

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
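
/*
 * Illustrative caller sketch (hypothetical; error handling elided):
 *
 *	nvlist_t *props;
 *
 *	if (spa_prop_get(spa, &props) == 0) {
 *		... consume props ...
 *		nvlist_free(props);
 *	}
 *
 * On success the caller owns 'props' and must free it.
 */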

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval,
				    DMU_OST_ZFS,
				    DS_MODE_USER | DS_MODE_READONLY, &os))
					break;

				/* We don't support gzip bootable datasets */
				if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_close(os);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed.  This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked).  We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
			continue;

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}
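
/*
 * Illustrative sketch (hypothetical caller; not part of this file's logic)
 * of setting a pool property through spa_prop_set() above:
 *
 *	nvlist_t *nvp;
 *
 *	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(nvp,
 *	    zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1) == 0);
 *	error = spa_prop_set(spa, nvp);
 *	nvlist_free(nvp);
 *
 * Properties other than cachefile and altroot are pushed to disk via the
 * spa_sync_props() sync task.
 */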

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa->spa_zio_taskq[t][q] = taskq_create("spa_zio",
			    zio_taskq_threads[t][q], maxclsyspri, 50,
			    INT_MAX, TASKQ_PREPOPULATE);
		}
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}
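
/*
 * Illustrative sketch (hypothetical; mirrors the nvlist layout that
 * spa_config_parse() below consumes) of a two-way mirror configuration:
 *
 *	vdev_tree: {
 *		type = "root"
 *		children = [ {
 *			type = "mirror"
 *			children = [
 *				{ type = "disk", path = "/dev/dsk/c0t0d0s0" },
 *				{ type = "disk", path = "/dev/dsk/c0t1d0s0" } ]
 *		} ]
 *	}
 *
 * The device paths are hypothetical; the nested ZPOOL_CONFIG_CHILDREN
 * arrays are what drive the recursion below.
 */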

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}
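
/*
 * Illustrative teardown sketch (hypothetical caller): the failure paths in
 * spa_open_common() and spa_create() below unwind a pool in this order:
 *
 *	spa_unload(spa);
 *	spa_deactivate(spa);
 *	spa_remove(spa);
 *
 * i.e., undo spa_load()/spa_activate() before dropping the namespace entry.
 */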

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in
	 * the active configuration, then we also mark this vdev as an active
	 * spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were
			 * successfully able to load the vdev.  Otherwise,
			 * importing a pool with a bad active spare would
			 * result in strange behavior, because multiple pools
			 * would think the spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra
			 * complexity it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool.  When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid, size;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd)) {
				size = vdev_get_rsize(vd);
				l2arc_add_vdev(spa, vd,
				    VDEV_LABEL_START_SIZE,
				    size - VDEV_LABEL_START_SIZE);
			}
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;

	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa->spa_log_state = SPA_LOG_MISSING;
			return (1);
		}
		break;

	case SPA_LOG_CLEAR:
		(void) dmu_objset_find(spa->spa_name, zil_clear_log_chain,
		    NULL, DS_FIND_CHILDREN);
		break;
	}
	spa->spa_log_state = SPA_LOG_GOOD;
	return (0);
}
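
/*
 * Illustrative sketch (drawn from the load path below): spa_load() consumes
 * spa_check_logs() as a simple boolean:
 *
 *	if (spa_check_logs(spa)) {
 *		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
 *		    VDEV_AUX_BAD_LOG);
 *		error = ENXIO;
 *	}
 *
 * A nonzero return means an intent-log chain could not be verified.
 */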

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	uint64_t autoreplace = 0;
	int orig_mode = spa->spa_mode;
	char *ereport = FM_EREPORT_ZFS_POOL;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	if (spa->spa_async_zio_root == NULL)
		spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	/*
	 * We need to validate the vdev labels against the configuration that
	 * we have in hand, which is dependent on the setting of mosconfig.  If
	 * mosconfig is true then we're validating the vdev labels based on
	 * that config.  Otherwise, we're validating against the cached config
	 * (zpool.cache) that was read when we loaded the zfs module, and then
	 * later we will recursively call spa_load() and validate against
	 * the vdev config.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_validate(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(NULL, rvd, ub);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

#ifdef	_KERNEL
			myhostid = zone_get_hostid(NULL);
#else	/* _KERNEL */
			/*
			 * We're emulating the system's hostid in userland, so
			 * we can't use zone_get_hostid().
			 */
			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
#endif	/* _KERNEL */
			if (hostid != 0 && myhostid != 0 &&
			    hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa_name(spa), hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa, orig_mode);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
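
	/*
	 * The remaining MOS directory lookups below all follow the same
	 * pattern as the deflate lookup above; illustrative sketch ('name'
	 * and 'valp' are placeholders):
	 *
	 *	error = zap_lookup(spa->spa_meta_objset,
	 *	    DMU_POOL_DIRECTORY_OBJECT, name,
	 *	    sizeof (uint64_t), 1, valp);
	 *
	 * ENOENT simply means an older pool that predates the feature; any
	 * other error is treated as corrupt data.
	 */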

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1,
	    &spa->spa_spares.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares.sav_object,
		    &spa->spa_spares.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Load any level 2 ARC devices for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE,
			    VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for
	 * any unopenable vdevs so that the normal autoreplace handler can
	 * take over.
	 */
	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if (spa_writeable(spa)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable
		 * yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);
	}

	error = 0;
out:
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and
	 * ends up calling spa_open() again.  The real fix is to figure out how
	 * to avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync
			 * and we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			spa->spa_last_open_failed = B_FALSE;
		}
	}

	spa_open_ref(spa, tag);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as active spares.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}
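
/*
 * Illustrative sketch of the packed-stats idiom shared by spa_add_spares()
 * above and spa_add_l2cache() below: the vdev_stat_t lives inside the
 * ZPOOL_CONFIG_STATS uint64 array and is recovered by a cast (hypothetical
 * nvlist 'nv'):
 *
 *	vdev_stat_t *vs;
 *	uint_t vsc;
 *
 *	VERIFY(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
 *	    (uint64_t **)&vs, &vsc) == 0);
 */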

/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */

		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
			vdev_get_stats(vd, vs);
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot,
    size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we
	 * cheat and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}
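
/*
 * Illustrative caller sketch (hypothetical pool name; error handling
 * elided):
 *
 *	nvlist_t *config;
 *	char altroot[MAXPATHLEN];
 *
 *	(void) spa_get_stats("tank", &config, altroot, sizeof (altroot));
 *
 * 'config' may be set even when the open fails, reflecting the vdev open
 * states gathered during the attempt.
 */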

/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef	_KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}
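
/*
 * Illustrative sketch (drawn from the pool-creation path below): validating
 * spares and l2cache devices while holding all config locks:
 *
 *	if ((error = spa_validate_aux(spa, nvroot, txg,
 *	    VDEV_ALLOC_ADD)) != 0)
 *		... unwind and return error ...
 */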

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
		if (vd->vdev_isl2cache)
			spa_l2cache_remove(vd);
		vdev_clear_stats(vd);
		(void) vdev_close(vd);
	}
}
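
/*
 * Illustrative sketch (hypothetical caller) of creating a pool via
 * spa_create() below, given an already-built 'nvroot' vdev tree:
 *
 *	error = spa_create("tank", nvroot, props, history_str, zplprops);
 *
 * The pool name is hypothetical; 'props' and 'history_str' may be NULL.
 */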
2080 */ 2081 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2082 &spares, &nspares) == 0) { 2083 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2084 KM_SLEEP) == 0); 2085 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2086 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2087 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2088 spa_load_spares(spa); 2089 spa_config_exit(spa, SCL_ALL, FTAG); 2090 spa->spa_spares.sav_sync = B_TRUE; 2091 } 2092 2093 /* 2094 * Get the list of level 2 cache devices, if specified. 2095 */ 2096 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2097 &l2cache, &nl2cache) == 0) { 2098 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2099 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2100 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2101 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2102 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2103 spa_load_l2cache(spa); 2104 spa_config_exit(spa, SCL_ALL, FTAG); 2105 spa->spa_l2cache.sav_sync = B_TRUE; 2106 } 2107 2108 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2109 spa->spa_meta_objset = dp->dp_meta_objset; 2110 2111 tx = dmu_tx_create_assigned(dp, txg); 2112 2113 /* 2114 * Create the pool config object. 2115 */ 2116 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2117 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2118 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2119 2120 if (zap_add(spa->spa_meta_objset, 2121 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2122 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2123 cmn_err(CE_PANIC, "failed to add pool config"); 2124 } 2125 2126 /* Newly created pools with the right version are always deflated. */ 2127 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2128 spa->spa_deflate = TRUE; 2129 if (zap_add(spa->spa_meta_objset, 2130 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2131 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2132 cmn_err(CE_PANIC, "failed to add deflate"); 2133 } 2134 } 2135 2136 /* 2137 * Create the deferred-free bplist object. Turn off compression 2138 * because sync-to-convergence takes longer if the blocksize 2139 * keeps changing. 2140 */ 2141 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2142 1 << 14, tx); 2143 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2144 ZIO_COMPRESS_OFF, tx); 2145 2146 if (zap_add(spa->spa_meta_objset, 2147 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2148 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2149 cmn_err(CE_PANIC, "failed to add bplist"); 2150 } 2151 2152 /* 2153 * Create the pool's history object. 2154 */ 2155 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2156 spa_history_create_obj(spa, tx); 2157 2158 /* 2159 * Set pool properties. 2160 */ 2161 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2162 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2163 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2164 if (props != NULL) { 2165 spa_configfile_set(spa, props, B_FALSE); 2166 spa_sync_props(spa, props, CRED(), tx); 2167 } 2168 2169 dmu_tx_commit(tx); 2170 2171 spa->spa_sync_on = B_TRUE; 2172 txg_sync_start(spa->spa_dsl_pool); 2173 2174 /* 2175 * We explicitly wait for the first transaction to complete so that our 2176 * bean counters are appropriately updated. 
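	 *
	 * [Editor's note -- not in the original source: the "bean counters"
	 * are the pool's space accounting figures, which only become valid
	 * once txg TXG_INITIAL has synced; spa_minref is snapshotted a few
	 * lines below on the assumption that the pool is quiescent here.]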
2177 */ 2178 txg_wait_synced(spa->spa_dsl_pool, txg); 2179 2180 spa_config_sync(spa, B_FALSE, B_TRUE); 2181 2182 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2183 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2184 2185 spa->spa_minref = refcount_count(&spa->spa_refcount); 2186 2187 mutex_exit(&spa_namespace_lock); 2188 2189 return (0); 2190 } 2191 2192 #ifdef _KERNEL 2193 /* 2194 * Build a "root" vdev for a top level vdev read in from a rootpool 2195 * device label. 2196 */ 2197 static void 2198 spa_build_rootpool_config(nvlist_t *config) 2199 { 2200 nvlist_t *nvtop, *nvroot; 2201 uint64_t pgid; 2202 2203 /* 2204 * Add this top-level vdev to the child array. 2205 */ 2206 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2207 == 0); 2208 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2209 == 0); 2210 2211 /* 2212 * Put this pool's top-level vdevs into a root vdev. 2213 */ 2214 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2215 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2216 == 0); 2217 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2218 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2219 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2220 &nvtop, 1) == 0); 2221 2222 /* 2223 * Replace the existing vdev_tree with the new root vdev in 2224 * this pool's configuration (remove the old, add the new). 2225 */ 2226 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2227 nvlist_free(nvroot); 2228 } 2229 2230 /* 2231 * Get the root pool information from the root disk, then import the root pool 2232 * during the system boot up time. 2233 */ 2234 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2235 2236 int 2237 spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, 2238 uint64_t *besttxg) 2239 { 2240 nvlist_t *config; 2241 uint64_t txg; 2242 int error; 2243 2244 if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) 2245 return (error); 2246 2247 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2248 2249 if (bestconf != NULL) 2250 *bestconf = config; 2251 else 2252 nvlist_free(config); 2253 *besttxg = txg; 2254 return (0); 2255 } 2256 2257 boolean_t 2258 spa_rootdev_validate(nvlist_t *nv) 2259 { 2260 uint64_t ival; 2261 2262 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2263 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2264 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2265 return (B_FALSE); 2266 2267 return (B_TRUE); 2268 } 2269 2270 2271 /* 2272 * Given the boot device's physical path or devid, check if the device 2273 * is in a valid state. If so, return the configuration from the vdev 2274 * label. 
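 *
 * [Editor's sketch -- not in the original source. A hypothetical boot-time
 * caller, using the example device path quoted in a comment further below:
 *
 *	nvlist_t *conf;
 *
 *	if (spa_get_rootconf("/pci@1f,0/ide@d/disk@0,0:a", NULL,
 *	    &conf) == 0) {
 *		... use conf ...
 *		nvlist_free(conf);
 *	}
 *
 * On success the vdev-label config is returned through 'conf'.]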
2275  */
2276 int
2277 spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
2278 {
2279 	nvlist_t *conf = NULL;
2280 	uint64_t txg = 0;
2281 	nvlist_t *nvtop, **child;
2282 	char *type;
2283 	char *bootpath = NULL;
2284 	uint_t children, c;
2285 	char *tmp;
2286 	int error;
2287 
2288 	if (devpath && ((tmp = strchr(devpath, ' ')) != NULL))
2289 		*tmp = '\0';
2290 	if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) {
2291 		cmn_err(CE_NOTE, "error reading device label");
2292 		return (error);
2293 	}
2294 	if (txg == 0) {
2295 		cmn_err(CE_NOTE, "this device is detached");
2296 		nvlist_free(conf);
2297 		return (EINVAL);
2298 	}
2299 
2300 	VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE,
2301 	    &nvtop) == 0);
2302 	VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0);
2303 
2304 	if (strcmp(type, VDEV_TYPE_DISK) == 0) {
2305 		if (spa_rootdev_validate(nvtop)) {
2306 			goto out;
2307 		} else {
2308 			nvlist_free(conf);
2309 			return (EINVAL);
2310 		}
2311 	}
2312 
2313 	ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0);
2314 
2315 	VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN,
2316 	    &child, &children) == 0);
2317 
2318 	/*
2319 	 * Go through the vdevs in the mirror to see if the given device
2320 	 * has the most recent txg. Only the device with the most
2321 	 * recent txg has valid information and should be booted.
2322 	 */
2323 	for (c = 0; c < children; c++) {
2324 		char *cdevid, *cpath;
2325 		uint64_t tmptxg;
2326 
2327 		cpath = NULL;
2328 		cdevid = NULL;
2329 		if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
2330 		    &cpath) != 0 && nvlist_lookup_string(child[c],
2331 		    ZPOOL_CONFIG_DEVID, &cdevid) != 0)
2332 			return (EINVAL);
2333 		if ((spa_check_rootconf(cpath, cdevid, NULL,
2334 		    &tmptxg) == 0) && (tmptxg > txg)) {
2335 			txg = tmptxg;
2336 			VERIFY(nvlist_lookup_string(child[c],
2337 			    ZPOOL_CONFIG_PATH, &bootpath) == 0);
2338 		}
2339 	}
2340 
2341 	/* Does the best device match the one we've booted from? */
2342 	if (bootpath) {
2343 		cmn_err(CE_NOTE, "try booting from '%s'", bootpath);
2344 		return (EINVAL);
2345 	}
2346 out:
2347 	*bestconf = conf;
2348 	return (0);
2349 }
2350 
2351 /*
2352  * Import a root pool.
2353  *
2354  * For x86, devpath_list will consist of the devid and/or physpath name of
2355  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
2356  * The GRUB "findroot" command will return the vdev we should boot.
2357  *
2358  * For SPARC, devpath_list consists of the physpath name of the booting
2359  * device, whether the root pool is a single-device pool or a mirrored pool.
2360  * e.g.
2361  *	"/pci@1f,0/ide@d/disk@0,0:a"
2362  */
2363 int
2364 spa_import_rootpool(char *devpath, char *devid)
2365 {
2366 	nvlist_t *conf = NULL;
2367 	char *pname;
2368 	int error;
2369 	spa_t *spa;
2370 
2371 	/*
2372 	 * Get the vdev pathname and configuration from the most
2373 	 * recently updated vdev (highest txg).
2374 	 */
2375 	if (error = spa_get_rootconf(devpath, devid, &conf))
2376 		goto msg_out;
2377 
2378 	/*
2379 	 * Add type "root" vdev to the config.
2380 	 */
2381 	spa_build_rootpool_config(conf);
2382 
2383 	VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
2384 
2385 	mutex_enter(&spa_namespace_lock);
2386 	if ((spa = spa_lookup(pname)) != NULL) {
2387 		/*
2388 		 * Remove the existing root pool from the namespace so that we
2389 		 * can replace it with the correct config we just read in.
2390 */ 2391 spa_remove(spa); 2392 } 2393 2394 spa = spa_add(pname, NULL); 2395 2396 spa->spa_is_root = B_TRUE; 2397 VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0); 2398 mutex_exit(&spa_namespace_lock); 2399 2400 nvlist_free(conf); 2401 return (0); 2402 2403 msg_out: 2404 cmn_err(CE_NOTE, "\n" 2405 " *************************************************** \n" 2406 " * This device is not bootable! * \n" 2407 " * It is either offlined or detached or faulted. * \n" 2408 " * Please try to boot from a different device. * \n" 2409 " *************************************************** "); 2410 2411 return (error); 2412 } 2413 #endif 2414 2415 /* 2416 * Take a pool and insert it into the namespace as if it had been loaded at 2417 * boot. 2418 */ 2419 int 2420 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2421 { 2422 spa_t *spa; 2423 char *altroot = NULL; 2424 2425 mutex_enter(&spa_namespace_lock); 2426 if (spa_lookup(pool) != NULL) { 2427 mutex_exit(&spa_namespace_lock); 2428 return (EEXIST); 2429 } 2430 2431 (void) nvlist_lookup_string(props, 2432 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2433 spa = spa_add(pool, altroot); 2434 2435 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2436 2437 if (props != NULL) 2438 spa_configfile_set(spa, props, B_FALSE); 2439 2440 spa_config_sync(spa, B_FALSE, B_TRUE); 2441 2442 mutex_exit(&spa_namespace_lock); 2443 2444 return (0); 2445 } 2446 2447 /* 2448 * Import a non-root pool into the system. 2449 */ 2450 int 2451 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2452 { 2453 spa_t *spa; 2454 char *altroot = NULL; 2455 int error; 2456 nvlist_t *nvroot; 2457 nvlist_t **spares, **l2cache; 2458 uint_t nspares, nl2cache; 2459 2460 /* 2461 * If a pool with this name exists, return failure. 2462 */ 2463 mutex_enter(&spa_namespace_lock); 2464 if ((spa = spa_lookup(pool)) != NULL) { 2465 mutex_exit(&spa_namespace_lock); 2466 return (EEXIST); 2467 } 2468 2469 /* 2470 * Create and initialize the spa structure. 2471 */ 2472 (void) nvlist_lookup_string(props, 2473 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2474 spa = spa_add(pool, altroot); 2475 spa_activate(spa, spa_mode_global); 2476 2477 /* 2478 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2479 * because the user-supplied config is actually the one to trust when 2480 * doing an import. 2481 */ 2482 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 2483 2484 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2485 /* 2486 * Toss any existing sparelist, as it doesn't have any validity 2487 * anymore, and conflicts with spa_has_spare(). 
2488 */ 2489 if (spa->spa_spares.sav_config) { 2490 nvlist_free(spa->spa_spares.sav_config); 2491 spa->spa_spares.sav_config = NULL; 2492 spa_load_spares(spa); 2493 } 2494 if (spa->spa_l2cache.sav_config) { 2495 nvlist_free(spa->spa_l2cache.sav_config); 2496 spa->spa_l2cache.sav_config = NULL; 2497 spa_load_l2cache(spa); 2498 } 2499 2500 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2501 &nvroot) == 0); 2502 if (error == 0) 2503 error = spa_validate_aux(spa, nvroot, -1ULL, 2504 VDEV_ALLOC_SPARE); 2505 if (error == 0) 2506 error = spa_validate_aux(spa, nvroot, -1ULL, 2507 VDEV_ALLOC_L2CACHE); 2508 spa_config_exit(spa, SCL_ALL, FTAG); 2509 2510 if (props != NULL) 2511 spa_configfile_set(spa, props, B_FALSE); 2512 2513 if (error != 0 || (props && spa_writeable(spa) && 2514 (error = spa_prop_set(spa, props)))) { 2515 spa_unload(spa); 2516 spa_deactivate(spa); 2517 spa_remove(spa); 2518 mutex_exit(&spa_namespace_lock); 2519 return (error); 2520 } 2521 2522 /* 2523 * Override any spares and level 2 cache devices as specified by 2524 * the user, as these may have correct device names/devids, etc. 2525 */ 2526 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2527 &spares, &nspares) == 0) { 2528 if (spa->spa_spares.sav_config) 2529 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2530 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2531 else 2532 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2533 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2534 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2535 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2536 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2537 spa_load_spares(spa); 2538 spa_config_exit(spa, SCL_ALL, FTAG); 2539 spa->spa_spares.sav_sync = B_TRUE; 2540 } 2541 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2542 &l2cache, &nl2cache) == 0) { 2543 if (spa->spa_l2cache.sav_config) 2544 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2545 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2546 else 2547 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2548 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2549 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2550 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2551 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2552 spa_load_l2cache(spa); 2553 spa_config_exit(spa, SCL_ALL, FTAG); 2554 spa->spa_l2cache.sav_sync = B_TRUE; 2555 } 2556 2557 if (spa_writeable(spa)) { 2558 /* 2559 * Update the config cache to include the newly-imported pool. 2560 */ 2561 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, B_FALSE); 2562 } 2563 2564 mutex_exit(&spa_namespace_lock); 2565 2566 return (0); 2567 } 2568 2569 2570 /* 2571 * This (illegal) pool name is used when temporarily importing a spa_t in order 2572 * to get the vdev stats associated with the imported devices. 2573 */ 2574 #define TRYIMPORT_NAME "$import" 2575 2576 nvlist_t * 2577 spa_tryimport(nvlist_t *tryconfig) 2578 { 2579 nvlist_t *config = NULL; 2580 char *poolname; 2581 spa_t *spa; 2582 uint64_t state; 2583 int error; 2584 2585 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2586 return (NULL); 2587 2588 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2589 return (NULL); 2590 2591 /* 2592 * Create and initialize the spa structure. 2593 */ 2594 mutex_enter(&spa_namespace_lock); 2595 spa = spa_add(TRYIMPORT_NAME, NULL); 2596 spa_activate(spa, FREAD); 2597 2598 /* 2599 * Pass off the heavy lifting to spa_load(). 
2600 * Pass TRUE for mosconfig because the user-supplied config 2601 * is actually the one to trust when doing an import. 2602 */ 2603 error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2604 2605 /* 2606 * If 'tryconfig' was at least parsable, return the current config. 2607 */ 2608 if (spa->spa_root_vdev != NULL) { 2609 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2610 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2611 poolname) == 0); 2612 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2613 state) == 0); 2614 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2615 spa->spa_uberblock.ub_timestamp) == 0); 2616 2617 /* 2618 * If the bootfs property exists on this pool then we 2619 * copy it out so that external consumers can tell which 2620 * pools are bootable. 2621 */ 2622 if ((!error || error == EEXIST) && spa->spa_bootfs) { 2623 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2624 2625 /* 2626 * We have to play games with the name since the 2627 * pool was opened as TRYIMPORT_NAME. 2628 */ 2629 if (dsl_dsobj_to_dsname(spa_name(spa), 2630 spa->spa_bootfs, tmpname) == 0) { 2631 char *cp; 2632 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2633 2634 cp = strchr(tmpname, '/'); 2635 if (cp == NULL) { 2636 (void) strlcpy(dsname, tmpname, 2637 MAXPATHLEN); 2638 } else { 2639 (void) snprintf(dsname, MAXPATHLEN, 2640 "%s/%s", poolname, ++cp); 2641 } 2642 VERIFY(nvlist_add_string(config, 2643 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2644 kmem_free(dsname, MAXPATHLEN); 2645 } 2646 kmem_free(tmpname, MAXPATHLEN); 2647 } 2648 2649 /* 2650 * Add the list of hot spares and level 2 cache devices. 2651 */ 2652 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2653 spa_add_spares(spa, config); 2654 spa_add_l2cache(spa, config); 2655 spa_config_exit(spa, SCL_CONFIG, FTAG); 2656 } 2657 2658 spa_unload(spa); 2659 spa_deactivate(spa); 2660 spa_remove(spa); 2661 mutex_exit(&spa_namespace_lock); 2662 2663 return (config); 2664 } 2665 2666 /* 2667 * Pool export/destroy 2668 * 2669 * The act of destroying or exporting a pool is very simple. We make sure there 2670 * is no more pending I/O and any references to the pool are gone. Then, we 2671 * update the pool state and sync all the labels to disk, removing the 2672 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 2673 * we don't sync the labels or remove the configuration cache. 2674 */ 2675 static int 2676 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2677 boolean_t force, boolean_t hardforce) 2678 { 2679 spa_t *spa; 2680 2681 if (oldconfig) 2682 *oldconfig = NULL; 2683 2684 if (!(spa_mode_global & FWRITE)) 2685 return (EROFS); 2686 2687 mutex_enter(&spa_namespace_lock); 2688 if ((spa = spa_lookup(pool)) == NULL) { 2689 mutex_exit(&spa_namespace_lock); 2690 return (ENOENT); 2691 } 2692 2693 /* 2694 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2695 * reacquire the namespace lock, and see if we can export. 2696 */ 2697 spa_open_ref(spa, FTAG); 2698 mutex_exit(&spa_namespace_lock); 2699 spa_async_suspend(spa); 2700 mutex_enter(&spa_namespace_lock); 2701 spa_close(spa, FTAG); 2702 2703 /* 2704 * The pool will be in core if it's openable, 2705 * in which case we can modify its state. 2706 */ 2707 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2708 /* 2709 * Objsets may be open only because they're dirty, so we 2710 * have to force it to sync before checking spa_refcnt. 
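		 *
		 * [Editor's note -- not in the original source: a txg
		 * argument of 0 to txg_wait_synced(), as used below, is
		 * understood to mean "wait until everything currently
		 * dirty has synced", not a literal transaction group.]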
2711 		 */
2712 		txg_wait_synced(spa->spa_dsl_pool, 0);
2713 
2714 		/*
2715 		 * A pool cannot be exported or destroyed if there are active
2716 		 * references. If we are resetting a pool, allow references by
2717 		 * fault injection handlers.
2718 		 */
2719 		if (!spa_refcount_zero(spa) ||
2720 		    (spa->spa_inject_ref != 0 &&
2721 		    new_state != POOL_STATE_UNINITIALIZED)) {
2722 			spa_async_resume(spa);
2723 			mutex_exit(&spa_namespace_lock);
2724 			return (EBUSY);
2725 		}
2726 
2727 		/*
2728 		 * A pool cannot be exported if it has an active shared spare.
2729 		 * This is to prevent other pools from stealing the active
2730 		 * spare from an exported pool. However, the user can still
2731 		 * forcibly export such a pool if desired.
2732 		 */
2733 		if (!force && new_state == POOL_STATE_EXPORTED &&
2734 		    spa_has_active_shared_spare(spa)) {
2735 			spa_async_resume(spa);
2736 			mutex_exit(&spa_namespace_lock);
2737 			return (EXDEV);
2738 		}
2739 
2740 		/*
2741 		 * We want this to be reflected on every label,
2742 		 * so mark them all dirty. spa_unload() will do the
2743 		 * final sync that pushes these changes out.
2744 		 */
2745 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
2746 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2747 			spa->spa_state = new_state;
2748 			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2749 			vdev_config_dirty(spa->spa_root_vdev);
2750 			spa_config_exit(spa, SCL_ALL, FTAG);
2751 		}
2752 	}
2753 
2754 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2755 
2756 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2757 		spa_unload(spa);
2758 		spa_deactivate(spa);
2759 	}
2760 
2761 	if (oldconfig && spa->spa_config)
2762 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2763 
2764 	if (new_state != POOL_STATE_UNINITIALIZED) {
2765 		if (!hardforce)
2766 			spa_config_sync(spa, B_TRUE, B_TRUE);
2767 		spa_remove(spa);
2768 	}
2769 	mutex_exit(&spa_namespace_lock);
2770 
2771 	return (0);
2772 }
2773 
2774 /*
2775  * Destroy a storage pool.
2776  */
2777 int
2778 spa_destroy(char *pool)
2779 {
2780 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
2781 	    B_FALSE, B_FALSE));
2782 }
2783 
2784 /*
2785  * Export a storage pool.
2786  */
2787 int
2788 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
2789     boolean_t hardforce)
2790 {
2791 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
2792 	    force, hardforce));
2793 }
2794 
2795 /*
2796  * Similar to spa_export(), this unloads the spa_t without actually removing it
2797  * from the namespace in any way.
2798  */
2799 int
2800 spa_reset(char *pool)
2801 {
2802 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
2803 	    B_FALSE, B_FALSE));
2804 }
2805 
2806 /*
2807  * ==========================================================================
2808  * Device manipulation
2809  * ==========================================================================
2810  */
2811 
2812 /*
2813  * Add a device to a storage pool.
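 *
 * [Editor's note -- not in the original source: like the other device
 * manipulation entry points that follow, this routine is bracketed by the
 * spa_vdev_enter()/spa_vdev_exit() idiom, roughly:
 *
 *	txg = spa_vdev_enter(spa);	(grab locks, pick the next txg)
 *	... modify the vdev tree ...
 *	return (spa_vdev_exit(spa, vd, txg, error));	(sync and unlock)
 *
 * so every error path must still return through spa_vdev_exit().]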
2814 */ 2815 int 2816 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2817 { 2818 uint64_t txg; 2819 int error; 2820 vdev_t *rvd = spa->spa_root_vdev; 2821 vdev_t *vd, *tvd; 2822 nvlist_t **spares, **l2cache; 2823 uint_t nspares, nl2cache; 2824 2825 txg = spa_vdev_enter(spa); 2826 2827 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2828 VDEV_ALLOC_ADD)) != 0) 2829 return (spa_vdev_exit(spa, NULL, txg, error)); 2830 2831 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 2832 2833 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2834 &nspares) != 0) 2835 nspares = 0; 2836 2837 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2838 &nl2cache) != 0) 2839 nl2cache = 0; 2840 2841 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 2842 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2843 2844 if (vd->vdev_children != 0 && 2845 (error = vdev_create(vd, txg, B_FALSE)) != 0) 2846 return (spa_vdev_exit(spa, vd, txg, error)); 2847 2848 /* 2849 * We must validate the spares and l2cache devices after checking the 2850 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2851 */ 2852 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 2853 return (spa_vdev_exit(spa, vd, txg, error)); 2854 2855 /* 2856 * Transfer each new top-level vdev from vd to rvd. 2857 */ 2858 for (int c = 0; c < vd->vdev_children; c++) { 2859 tvd = vd->vdev_child[c]; 2860 vdev_remove_child(vd, tvd); 2861 tvd->vdev_id = rvd->vdev_children; 2862 vdev_add_child(rvd, tvd); 2863 vdev_config_dirty(tvd); 2864 } 2865 2866 if (nspares != 0) { 2867 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2868 ZPOOL_CONFIG_SPARES); 2869 spa_load_spares(spa); 2870 spa->spa_spares.sav_sync = B_TRUE; 2871 } 2872 2873 if (nl2cache != 0) { 2874 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2875 ZPOOL_CONFIG_L2CACHE); 2876 spa_load_l2cache(spa); 2877 spa->spa_l2cache.sav_sync = B_TRUE; 2878 } 2879 2880 /* 2881 * We have to be careful when adding new vdevs to an existing pool. 2882 * If other threads start allocating from these vdevs before we 2883 * sync the config cache, and we lose power, then upon reboot we may 2884 * fail to open the pool because there are DVAs that the config cache 2885 * can't translate. Therefore, we first add the vdevs without 2886 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2887 * and then let spa_config_update() initialize the new metaslabs. 2888 * 2889 * spa_load() checks for added-but-not-initialized vdevs, so that 2890 * if we lose power at any point in this sequence, the remaining 2891 * steps will be completed the next time we load the pool. 2892 */ 2893 (void) spa_vdev_exit(spa, vd, txg, 0); 2894 2895 mutex_enter(&spa_namespace_lock); 2896 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2897 mutex_exit(&spa_namespace_lock); 2898 2899 return (0); 2900 } 2901 2902 /* 2903 * Attach a device to a mirror. The arguments are the path to any device 2904 * in the mirror, and the nvroot for the new device. If the path specifies 2905 * a device that is not mirrored, we automatically insert the mirror vdev. 
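 *
 * [Editor's illustration -- not in the original source: in the M(A,B)
 * notation used later in this file, attaching C to a plain device A
 * yields M(A,C), while attaching C to an existing mirror M(A,B) yields
 * M(A,B,C).]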
2906 * 2907 * If 'replacing' is specified, the new device is intended to replace the 2908 * existing device; in this case the two devices are made into their own 2909 * mirror using the 'replacing' vdev, which is functionally identical to 2910 * the mirror vdev (it actually reuses all the same ops) but has a few 2911 * extra rules: you can't attach to it after it's been created, and upon 2912 * completion of resilvering, the first disk (the one being replaced) 2913 * is automatically detached. 2914 */ 2915 int 2916 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2917 { 2918 uint64_t txg, open_txg; 2919 vdev_t *rvd = spa->spa_root_vdev; 2920 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2921 vdev_ops_t *pvops; 2922 dmu_tx_t *tx; 2923 char *oldvdpath, *newvdpath; 2924 int newvd_isspare; 2925 int error; 2926 2927 txg = spa_vdev_enter(spa); 2928 2929 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 2930 2931 if (oldvd == NULL) 2932 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2933 2934 if (!oldvd->vdev_ops->vdev_op_leaf) 2935 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2936 2937 pvd = oldvd->vdev_parent; 2938 2939 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2940 VDEV_ALLOC_ADD)) != 0) 2941 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2942 2943 if (newrootvd->vdev_children != 1) 2944 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2945 2946 newvd = newrootvd->vdev_child[0]; 2947 2948 if (!newvd->vdev_ops->vdev_op_leaf) 2949 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2950 2951 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2952 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2953 2954 /* 2955 * Spares can't replace logs 2956 */ 2957 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 2958 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2959 2960 if (!replacing) { 2961 /* 2962 * For attach, the only allowable parent is a mirror or the root 2963 * vdev. 2964 */ 2965 if (pvd->vdev_ops != &vdev_mirror_ops && 2966 pvd->vdev_ops != &vdev_root_ops) 2967 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2968 2969 pvops = &vdev_mirror_ops; 2970 } else { 2971 /* 2972 * Active hot spares can only be replaced by inactive hot 2973 * spares. 2974 */ 2975 if (pvd->vdev_ops == &vdev_spare_ops && 2976 pvd->vdev_child[1] == oldvd && 2977 !spa_has_spare(spa, newvd->vdev_guid)) 2978 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2979 2980 /* 2981 * If the source is a hot spare, and the parent isn't already a 2982 * spare, then we want to create a new hot spare. Otherwise, we 2983 * want to create a replacing vdev. The user is not allowed to 2984 * attach to a spared vdev child unless the 'isspare' state is 2985 * the same (spare replaces spare, non-spare replaces 2986 * non-spare). 2987 */ 2988 if (pvd->vdev_ops == &vdev_replacing_ops) 2989 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2990 else if (pvd->vdev_ops == &vdev_spare_ops && 2991 newvd->vdev_isspare != oldvd->vdev_isspare) 2992 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2993 else if (pvd->vdev_ops != &vdev_spare_ops && 2994 newvd->vdev_isspare) 2995 pvops = &vdev_spare_ops; 2996 else 2997 pvops = &vdev_replacing_ops; 2998 } 2999 3000 /* 3001 * Compare the new device size with the replaceable/attachable 3002 * device size. 
3003 */ 3004 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 3005 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3006 3007 /* 3008 * The new device cannot have a higher alignment requirement 3009 * than the top-level vdev. 3010 */ 3011 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3012 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3013 3014 /* 3015 * If this is an in-place replacement, update oldvd's path and devid 3016 * to make it distinguishable from newvd, and unopenable from now on. 3017 */ 3018 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3019 spa_strfree(oldvd->vdev_path); 3020 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3021 KM_SLEEP); 3022 (void) sprintf(oldvd->vdev_path, "%s/%s", 3023 newvd->vdev_path, "old"); 3024 if (oldvd->vdev_devid != NULL) { 3025 spa_strfree(oldvd->vdev_devid); 3026 oldvd->vdev_devid = NULL; 3027 } 3028 } 3029 3030 /* 3031 * If the parent is not a mirror, or if we're replacing, insert the new 3032 * mirror/replacing/spare vdev above oldvd. 3033 */ 3034 if (pvd->vdev_ops != pvops) 3035 pvd = vdev_add_parent(oldvd, pvops); 3036 3037 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3038 ASSERT(pvd->vdev_ops == pvops); 3039 ASSERT(oldvd->vdev_parent == pvd); 3040 3041 /* 3042 * Extract the new device from its root and add it to pvd. 3043 */ 3044 vdev_remove_child(newrootvd, newvd); 3045 newvd->vdev_id = pvd->vdev_children; 3046 vdev_add_child(pvd, newvd); 3047 3048 /* 3049 * If newvd is smaller than oldvd, but larger than its rsize, 3050 * the addition of newvd may have decreased our parent's asize. 3051 */ 3052 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 3053 3054 tvd = newvd->vdev_top; 3055 ASSERT(pvd->vdev_top == tvd); 3056 ASSERT(tvd->vdev_parent == rvd); 3057 3058 vdev_config_dirty(tvd); 3059 3060 /* 3061 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3062 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3063 */ 3064 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3065 3066 vdev_dtl_dirty(newvd, DTL_MISSING, 3067 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3068 3069 if (newvd->vdev_isspare) { 3070 spa_spare_activate(newvd); 3071 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3072 } 3073 3074 oldvdpath = spa_strdup(oldvd->vdev_path); 3075 newvdpath = spa_strdup(newvd->vdev_path); 3076 newvd_isspare = newvd->vdev_isspare; 3077 3078 /* 3079 * Mark newvd's DTL dirty in this txg. 3080 */ 3081 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3082 3083 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3084 3085 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3086 if (dmu_tx_assign(tx, TXG_WAIT) == 0) { 3087 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, 3088 CRED(), "%s vdev=%s %s vdev=%s", 3089 replacing && newvd_isspare ? "spare in" : 3090 replacing ? "replace" : "attach", newvdpath, 3091 replacing ? "for" : "to", oldvdpath); 3092 dmu_tx_commit(tx); 3093 } else { 3094 dmu_tx_abort(tx); 3095 } 3096 3097 spa_strfree(oldvdpath); 3098 spa_strfree(newvdpath); 3099 3100 /* 3101 * Kick off a resilver to update newvd. 3102 */ 3103 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3104 3105 return (0); 3106 } 3107 3108 /* 3109 * Detach a device from a mirror or replacing vdev. 3110 * If 'replace_done' is specified, only detach if the parent 3111 * is a replacing vdev. 
3112 */ 3113 int 3114 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3115 { 3116 uint64_t txg; 3117 int error; 3118 vdev_t *rvd = spa->spa_root_vdev; 3119 vdev_t *vd, *pvd, *cvd, *tvd; 3120 boolean_t unspare = B_FALSE; 3121 uint64_t unspare_guid; 3122 size_t len; 3123 3124 txg = spa_vdev_enter(spa); 3125 3126 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3127 3128 if (vd == NULL) 3129 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3130 3131 if (!vd->vdev_ops->vdev_op_leaf) 3132 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3133 3134 pvd = vd->vdev_parent; 3135 3136 /* 3137 * If the parent/child relationship is not as expected, don't do it. 3138 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3139 * vdev that's replacing B with C. The user's intent in replacing 3140 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3141 * the replace by detaching C, the expected behavior is to end up 3142 * M(A,B). But suppose that right after deciding to detach C, 3143 * the replacement of B completes. We would have M(A,C), and then 3144 * ask to detach C, which would leave us with just A -- not what 3145 * the user wanted. To prevent this, we make sure that the 3146 * parent/child relationship hasn't changed -- in this example, 3147 * that C's parent is still the replacing vdev R. 3148 */ 3149 if (pvd->vdev_guid != pguid && pguid != 0) 3150 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3151 3152 /* 3153 * If replace_done is specified, only remove this device if it's 3154 * the first child of a replacing vdev. For the 'spare' vdev, either 3155 * disk can be removed. 3156 */ 3157 if (replace_done) { 3158 if (pvd->vdev_ops == &vdev_replacing_ops) { 3159 if (vd->vdev_id != 0) 3160 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3161 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3162 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3163 } 3164 } 3165 3166 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3167 spa_version(spa) >= SPA_VERSION_SPARES); 3168 3169 /* 3170 * Only mirror, replacing, and spare vdevs support detach. 3171 */ 3172 if (pvd->vdev_ops != &vdev_replacing_ops && 3173 pvd->vdev_ops != &vdev_mirror_ops && 3174 pvd->vdev_ops != &vdev_spare_ops) 3175 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3176 3177 /* 3178 * If this device has the only valid copy of some data, 3179 * we cannot safely detach it. 3180 */ 3181 if (vdev_dtl_required(vd)) 3182 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3183 3184 ASSERT(pvd->vdev_children >= 2); 3185 3186 /* 3187 * If we are detaching the second disk from a replacing vdev, then 3188 * check to see if we changed the original vdev's path to have "/old" 3189 * at the end in spa_vdev_attach(). If so, undo that change now. 3190 */ 3191 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3192 pvd->vdev_child[0]->vdev_path != NULL && 3193 pvd->vdev_child[1]->vdev_path != NULL) { 3194 ASSERT(pvd->vdev_child[1] == vd); 3195 cvd = pvd->vdev_child[0]; 3196 len = strlen(vd->vdev_path); 3197 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3198 strcmp(cvd->vdev_path + len, "/old") == 0) { 3199 spa_strfree(cvd->vdev_path); 3200 cvd->vdev_path = spa_strdup(vd->vdev_path); 3201 } 3202 } 3203 3204 /* 3205 * If we are detaching the original disk from a spare, then it implies 3206 * that the spare should become a real disk, and be removed from the 3207 * active spare list for the pool. 
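	 *
	 * [Editor's note -- not in the original source: in a spare vdev the
	 * original disk is child 0 and the hot spare is child 1, which is
	 * exactly what the vdev_id == 0 test below relies on.]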
3208 */ 3209 if (pvd->vdev_ops == &vdev_spare_ops && 3210 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3211 unspare = B_TRUE; 3212 3213 /* 3214 * Erase the disk labels so the disk can be used for other things. 3215 * This must be done after all other error cases are handled, 3216 * but before we disembowel vd (so we can still do I/O to it). 3217 * But if we can't do it, don't treat the error as fatal -- 3218 * it may be that the unwritability of the disk is the reason 3219 * it's being detached! 3220 */ 3221 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3222 3223 /* 3224 * Remove vd from its parent and compact the parent's children. 3225 */ 3226 vdev_remove_child(pvd, vd); 3227 vdev_compact_children(pvd); 3228 3229 /* 3230 * Remember one of the remaining children so we can get tvd below. 3231 */ 3232 cvd = pvd->vdev_child[0]; 3233 3234 /* 3235 * If we need to remove the remaining child from the list of hot spares, 3236 * do it now, marking the vdev as no longer a spare in the process. 3237 * We must do this before vdev_remove_parent(), because that can 3238 * change the GUID if it creates a new toplevel GUID. For a similar 3239 * reason, we must remove the spare now, in the same txg as the detach; 3240 * otherwise someone could attach a new sibling, change the GUID, and 3241 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3242 */ 3243 if (unspare) { 3244 ASSERT(cvd->vdev_isspare); 3245 spa_spare_remove(cvd); 3246 unspare_guid = cvd->vdev_guid; 3247 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3248 } 3249 3250 /* 3251 * If the parent mirror/replacing vdev only has one child, 3252 * the parent is no longer needed. Remove it from the tree. 3253 */ 3254 if (pvd->vdev_children == 1) 3255 vdev_remove_parent(cvd); 3256 3257 /* 3258 * We don't set tvd until now because the parent we just removed 3259 * may have been the previous top-level vdev. 3260 */ 3261 tvd = cvd->vdev_top; 3262 ASSERT(tvd->vdev_parent == rvd); 3263 3264 /* 3265 * Reevaluate the parent vdev state. 3266 */ 3267 vdev_propagate_state(cvd); 3268 3269 /* 3270 * If the device we just detached was smaller than the others, it may be 3271 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3272 * can't fail because the existing metaslabs are already in core, so 3273 * there's nothing to read from disk. 3274 */ 3275 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3276 3277 vdev_config_dirty(tvd); 3278 3279 /* 3280 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3281 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3282 * But first make sure we're not on any *other* txg's DTL list, to 3283 * prevent vd from being accessed after it's freed. 3284 */ 3285 for (int t = 0; t < TXG_SIZE; t++) 3286 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3287 vd->vdev_detached = B_TRUE; 3288 vdev_dirty(tvd, VDD_DTL, vd, txg); 3289 3290 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3291 3292 error = spa_vdev_exit(spa, vd, txg, 0); 3293 3294 /* 3295 * If this was the removal of the original device in a hot spare vdev, 3296 * then we want to go through and remove the device from the hot spare 3297 * list of every other pool. 
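	 *
	 * [Editor's note -- not in the original source: a hot spare may be
	 * shared by several pools at once, which is why the loop below
	 * walks every active pool with spa_next() instead of stopping at
	 * the current one.]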
3298 */ 3299 if (unspare) { 3300 spa_t *myspa = spa; 3301 spa = NULL; 3302 mutex_enter(&spa_namespace_lock); 3303 while ((spa = spa_next(spa)) != NULL) { 3304 if (spa->spa_state != POOL_STATE_ACTIVE) 3305 continue; 3306 if (spa == myspa) 3307 continue; 3308 spa_open_ref(spa, FTAG); 3309 mutex_exit(&spa_namespace_lock); 3310 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3311 mutex_enter(&spa_namespace_lock); 3312 spa_close(spa, FTAG); 3313 } 3314 mutex_exit(&spa_namespace_lock); 3315 } 3316 3317 return (error); 3318 } 3319 3320 static nvlist_t * 3321 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3322 { 3323 for (int i = 0; i < count; i++) { 3324 uint64_t guid; 3325 3326 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3327 &guid) == 0); 3328 3329 if (guid == target_guid) 3330 return (nvpp[i]); 3331 } 3332 3333 return (NULL); 3334 } 3335 3336 static void 3337 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3338 nvlist_t *dev_to_remove) 3339 { 3340 nvlist_t **newdev = NULL; 3341 3342 if (count > 1) 3343 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3344 3345 for (int i = 0, j = 0; i < count; i++) { 3346 if (dev[i] == dev_to_remove) 3347 continue; 3348 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3349 } 3350 3351 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3352 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3353 3354 for (int i = 0; i < count - 1; i++) 3355 nvlist_free(newdev[i]); 3356 3357 if (count > 1) 3358 kmem_free(newdev, (count - 1) * sizeof (void *)); 3359 } 3360 3361 /* 3362 * Remove a device from the pool. Currently, this supports removing only hot 3363 * spares and level 2 ARC devices. 3364 */ 3365 int 3366 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3367 { 3368 vdev_t *vd; 3369 nvlist_t **spares, **l2cache, *nv; 3370 uint_t nspares, nl2cache; 3371 uint64_t txg = 0; 3372 int error = 0; 3373 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 3374 3375 if (!locked) 3376 txg = spa_vdev_enter(spa); 3377 3378 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3379 3380 if (spa->spa_spares.sav_vdevs != NULL && 3381 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3382 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 3383 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 3384 /* 3385 * Only remove the hot spare if it's not currently in use 3386 * in this pool. 3387 */ 3388 if (vd == NULL || unspare) { 3389 spa_vdev_remove_aux(spa->spa_spares.sav_config, 3390 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 3391 spa_load_spares(spa); 3392 spa->spa_spares.sav_sync = B_TRUE; 3393 } else { 3394 error = EBUSY; 3395 } 3396 } else if (spa->spa_l2cache.sav_vdevs != NULL && 3397 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3398 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 3399 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 3400 /* 3401 * Cache devices can always be removed. 3402 */ 3403 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 3404 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 3405 spa_load_l2cache(spa); 3406 spa->spa_l2cache.sav_sync = B_TRUE; 3407 } else if (vd != NULL) { 3408 /* 3409 * Normal vdevs cannot be removed (yet). 3410 */ 3411 error = ENOTSUP; 3412 } else { 3413 /* 3414 * There is no vdev of any kind with the specified guid. 
3415 		 */
3416 		error = ENOENT;
3417 	}
3418 
3419 	if (!locked)
3420 		return (spa_vdev_exit(spa, NULL, txg, error));
3421 
3422 	return (error);
3423 }
3424 
3425 /*
3426  * Find any device that's done replacing, or a vdev marked 'unspare' that's
3427  * currently spared, so we can detach it.
3428  */
3429 static vdev_t *
3430 spa_vdev_resilver_done_hunt(vdev_t *vd)
3431 {
3432 	vdev_t *newvd, *oldvd;
3433 	int c;
3434 
3435 	for (c = 0; c < vd->vdev_children; c++) {
3436 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3437 		if (oldvd != NULL)
3438 			return (oldvd);
3439 	}
3440 
3441 	/*
3442 	 * Check for a completed replacement.
3443 	 */
3444 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3445 		oldvd = vd->vdev_child[0];
3446 		newvd = vd->vdev_child[1];
3447 
3448 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
3449 		    !vdev_dtl_required(oldvd))
3450 			return (oldvd);
3451 	}
3452 
3453 	/*
3454 	 * Check for a completed resilver with the 'unspare' flag set.
3455 	 */
3456 	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3457 		newvd = vd->vdev_child[0];
3458 		oldvd = vd->vdev_child[1];
3459 
3460 		if (newvd->vdev_unspare &&
3461 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
3462 		    !vdev_dtl_required(oldvd)) {
3463 			newvd->vdev_unspare = 0;
3464 			return (oldvd);
3465 		}
3466 	}
3467 
3468 	return (NULL);
3469 }
3470 
3471 static void
3472 spa_vdev_resilver_done(spa_t *spa)
3473 {
3474 	vdev_t *vd, *pvd, *ppvd;
3475 	uint64_t guid, sguid, pguid, ppguid;
3476 
3477 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3478 
3479 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3480 		pvd = vd->vdev_parent;
3481 		ppvd = pvd->vdev_parent;
3482 		guid = vd->vdev_guid;
3483 		pguid = pvd->vdev_guid;
3484 		ppguid = ppvd->vdev_guid;
3485 		sguid = 0;
3486 		/*
3487 		 * If we have just finished replacing a hot spared device, then
3488 		 * we need to detach the parent's first child (the original hot
3489 		 * spare) as well.
3490 		 */
3491 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
3492 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3493 			ASSERT(ppvd->vdev_children == 2);
3494 			sguid = ppvd->vdev_child[1]->vdev_guid;
3495 		}
3496 		spa_config_exit(spa, SCL_ALL, FTAG);
3497 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
3498 			return;
3499 		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
3500 			return;
3501 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3502 	}
3503 
3504 	spa_config_exit(spa, SCL_ALL, FTAG);
3505 }
3506 
3507 /*
3508  * Update the stored path or FRU for this vdev. Dirty the vdev configuration,
3509  * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
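 *
 * [Editor's sketch -- not in the original source: typical use is through
 * the two thin wrappers below, e.g.
 *
 *	(void) spa_vdev_setpath(spa, guid, "/dev/dsk/c2t0d0s0");
 *
 * where the path string is purely illustrative.]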
3510 */ 3511 int 3512 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 3513 boolean_t ispath) 3514 { 3515 vdev_t *vd; 3516 uint64_t txg; 3517 3518 txg = spa_vdev_enter(spa); 3519 3520 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3521 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3522 3523 if (!vd->vdev_ops->vdev_op_leaf) 3524 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3525 3526 if (ispath) { 3527 spa_strfree(vd->vdev_path); 3528 vd->vdev_path = spa_strdup(value); 3529 } else { 3530 if (vd->vdev_fru != NULL) 3531 spa_strfree(vd->vdev_fru); 3532 vd->vdev_fru = spa_strdup(value); 3533 } 3534 3535 vdev_config_dirty(vd->vdev_top); 3536 3537 return (spa_vdev_exit(spa, NULL, txg, 0)); 3538 } 3539 3540 int 3541 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3542 { 3543 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 3544 } 3545 3546 int 3547 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 3548 { 3549 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 3550 } 3551 3552 /* 3553 * ========================================================================== 3554 * SPA Scrubbing 3555 * ========================================================================== 3556 */ 3557 3558 int 3559 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3560 { 3561 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3562 3563 if ((uint_t)type >= POOL_SCRUB_TYPES) 3564 return (ENOTSUP); 3565 3566 /* 3567 * If a resilver was requested, but there is no DTL on a 3568 * writeable leaf device, we have nothing to do. 3569 */ 3570 if (type == POOL_SCRUB_RESILVER && 3571 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3572 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3573 return (0); 3574 } 3575 3576 if (type == POOL_SCRUB_EVERYTHING && 3577 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3578 spa->spa_dsl_pool->dp_scrub_isresilver) 3579 return (EBUSY); 3580 3581 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3582 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3583 } else if (type == POOL_SCRUB_NONE) { 3584 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3585 } else { 3586 return (EINVAL); 3587 } 3588 } 3589 3590 /* 3591 * ========================================================================== 3592 * SPA async task processing 3593 * ========================================================================== 3594 */ 3595 3596 static void 3597 spa_async_remove(spa_t *spa, vdev_t *vd) 3598 { 3599 if (vd->vdev_remove_wanted) { 3600 vd->vdev_remove_wanted = 0; 3601 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3602 vdev_clear(spa, vd); 3603 vdev_state_dirty(vd->vdev_top); 3604 } 3605 3606 for (int c = 0; c < vd->vdev_children; c++) 3607 spa_async_remove(spa, vd->vdev_child[c]); 3608 } 3609 3610 static void 3611 spa_async_probe(spa_t *spa, vdev_t *vd) 3612 { 3613 if (vd->vdev_probe_wanted) { 3614 vd->vdev_probe_wanted = 0; 3615 vdev_reopen(vd); /* vdev_open() does the actual probe */ 3616 } 3617 3618 for (int c = 0; c < vd->vdev_children; c++) 3619 spa_async_probe(spa, vd->vdev_child[c]); 3620 } 3621 3622 static void 3623 spa_async_thread(spa_t *spa) 3624 { 3625 int tasks; 3626 3627 ASSERT(spa->spa_sync_on); 3628 3629 mutex_enter(&spa->spa_async_lock); 3630 tasks = spa->spa_async_tasks; 3631 spa->spa_async_tasks = 0; 3632 mutex_exit(&spa->spa_async_lock); 3633 3634 /* 3635 * See if the config needs to be updated. 
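	 *
	 * [Editor's note -- not in the original source: 'tasks' is a
	 * snapshot of the SPA_ASYNC_* request bits, taken and cleared
	 * above under spa_async_lock, so requests that arrive while we
	 * run are simply picked up by a later dispatch.]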
3636 */ 3637 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3638 mutex_enter(&spa_namespace_lock); 3639 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3640 mutex_exit(&spa_namespace_lock); 3641 } 3642 3643 /* 3644 * See if any devices need to be marked REMOVED. 3645 */ 3646 if (tasks & SPA_ASYNC_REMOVE) { 3647 spa_vdev_state_enter(spa); 3648 spa_async_remove(spa, spa->spa_root_vdev); 3649 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 3650 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3651 for (int i = 0; i < spa->spa_spares.sav_count; i++) 3652 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3653 (void) spa_vdev_state_exit(spa, NULL, 0); 3654 } 3655 3656 /* 3657 * See if any devices need to be probed. 3658 */ 3659 if (tasks & SPA_ASYNC_PROBE) { 3660 spa_vdev_state_enter(spa); 3661 spa_async_probe(spa, spa->spa_root_vdev); 3662 (void) spa_vdev_state_exit(spa, NULL, 0); 3663 } 3664 3665 /* 3666 * If any devices are done replacing, detach them. 3667 */ 3668 if (tasks & SPA_ASYNC_RESILVER_DONE) 3669 spa_vdev_resilver_done(spa); 3670 3671 /* 3672 * Kick off a resilver. 3673 */ 3674 if (tasks & SPA_ASYNC_RESILVER) 3675 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3676 3677 /* 3678 * Let the world know that we're done. 3679 */ 3680 mutex_enter(&spa->spa_async_lock); 3681 spa->spa_async_thread = NULL; 3682 cv_broadcast(&spa->spa_async_cv); 3683 mutex_exit(&spa->spa_async_lock); 3684 thread_exit(); 3685 } 3686 3687 void 3688 spa_async_suspend(spa_t *spa) 3689 { 3690 mutex_enter(&spa->spa_async_lock); 3691 spa->spa_async_suspended++; 3692 while (spa->spa_async_thread != NULL) 3693 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3694 mutex_exit(&spa->spa_async_lock); 3695 } 3696 3697 void 3698 spa_async_resume(spa_t *spa) 3699 { 3700 mutex_enter(&spa->spa_async_lock); 3701 ASSERT(spa->spa_async_suspended != 0); 3702 spa->spa_async_suspended--; 3703 mutex_exit(&spa->spa_async_lock); 3704 } 3705 3706 static void 3707 spa_async_dispatch(spa_t *spa) 3708 { 3709 mutex_enter(&spa->spa_async_lock); 3710 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3711 spa->spa_async_thread == NULL && 3712 rootdir != NULL && !vn_is_readonly(rootdir)) 3713 spa->spa_async_thread = thread_create(NULL, 0, 3714 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3715 mutex_exit(&spa->spa_async_lock); 3716 } 3717 3718 void 3719 spa_async_request(spa_t *spa, int task) 3720 { 3721 mutex_enter(&spa->spa_async_lock); 3722 spa->spa_async_tasks |= task; 3723 mutex_exit(&spa->spa_async_lock); 3724 } 3725 3726 /* 3727 * ========================================================================== 3728 * SPA syncing routines 3729 * ========================================================================== 3730 */ 3731 3732 static void 3733 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3734 { 3735 bplist_t *bpl = &spa->spa_sync_bplist; 3736 dmu_tx_t *tx; 3737 blkptr_t blk; 3738 uint64_t itor = 0; 3739 zio_t *zio; 3740 int error; 3741 uint8_t c = 1; 3742 3743 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 3744 3745 while (bplist_iterate(bpl, &itor, &blk) == 0) { 3746 ASSERT(blk.blk_birth < txg); 3747 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, 3748 ZIO_FLAG_MUSTSUCCEED)); 3749 } 3750 3751 error = zio_wait(zio); 3752 ASSERT3U(error, ==, 0); 3753 3754 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3755 bplist_vacate(bpl, tx); 3756 3757 /* 3758 * Pre-dirty the first block so we sync to convergence faster. 3759 * (Usually only the first block is needed.) 
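	 *
	 * [Editor's note -- not in the original source: the one-byte
	 * dmu_write() below dirties block 0 of the bplist object in this
	 * txg, so the final sync passes do not discover a brand-new dirty
	 * block late and force yet another pass.]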
3760 */ 3761 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3762 dmu_tx_commit(tx); 3763 } 3764 3765 static void 3766 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3767 { 3768 char *packed = NULL; 3769 size_t bufsize; 3770 size_t nvsize = 0; 3771 dmu_buf_t *db; 3772 3773 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3774 3775 /* 3776 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 3777 * information. This avoids the dbuf_will_dirty() path and 3778 * saves us a pre-read to get data we don't actually care about. 3779 */ 3780 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 3781 packed = kmem_alloc(bufsize, KM_SLEEP); 3782 3783 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3784 KM_SLEEP) == 0); 3785 bzero(packed + nvsize, bufsize - nvsize); 3786 3787 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 3788 3789 kmem_free(packed, bufsize); 3790 3791 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3792 dmu_buf_will_dirty(db, tx); 3793 *(uint64_t *)db->db_data = nvsize; 3794 dmu_buf_rele(db, FTAG); 3795 } 3796 3797 static void 3798 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3799 const char *config, const char *entry) 3800 { 3801 nvlist_t *nvroot; 3802 nvlist_t **list; 3803 int i; 3804 3805 if (!sav->sav_sync) 3806 return; 3807 3808 /* 3809 * Update the MOS nvlist describing the list of available devices. 3810 * spa_validate_aux() will have already made sure this nvlist is 3811 * valid and the vdevs are labeled appropriately. 3812 */ 3813 if (sav->sav_object == 0) { 3814 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3815 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3816 sizeof (uint64_t), tx); 3817 VERIFY(zap_update(spa->spa_meta_objset, 3818 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3819 &sav->sav_object, tx) == 0); 3820 } 3821 3822 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3823 if (sav->sav_count == 0) { 3824 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3825 } else { 3826 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3827 for (i = 0; i < sav->sav_count; i++) 3828 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3829 B_FALSE, B_FALSE, B_TRUE); 3830 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3831 sav->sav_count) == 0); 3832 for (i = 0; i < sav->sav_count; i++) 3833 nvlist_free(list[i]); 3834 kmem_free(list, sav->sav_count * sizeof (void *)); 3835 } 3836 3837 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3838 nvlist_free(nvroot); 3839 3840 sav->sav_sync = B_FALSE; 3841 } 3842 3843 static void 3844 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3845 { 3846 nvlist_t *config; 3847 3848 if (list_is_empty(&spa->spa_config_dirty_list)) 3849 return; 3850 3851 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 3852 3853 config = spa_config_generate(spa, spa->spa_root_vdev, 3854 dmu_tx_get_txg(tx), B_FALSE); 3855 3856 spa_config_exit(spa, SCL_STATE, FTAG); 3857 3858 if (spa->spa_config_syncing) 3859 nvlist_free(spa->spa_config_syncing); 3860 spa->spa_config_syncing = config; 3861 3862 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3863 } 3864 3865 /* 3866 * Set zpool properties. 
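 *
 * [Editor's sketch -- not in the original source: arg2 is a flat nvlist
 * of property name/value pairs, conceptually something like
 *
 *	{ "delegation" : 1, "failmode" : <index value> }
 *
 * and each pair is dispatched by the switch statement below.]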
3867  */
3868 static void
3869 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3870 {
3871 	spa_t *spa = arg1;
3872 	objset_t *mos = spa->spa_meta_objset;
3873 	nvlist_t *nvp = arg2;
3874 	nvpair_t *elem;
3875 	uint64_t intval;
3876 	char *strval;
3877 	zpool_prop_t prop;
3878 	const char *propname;
3879 	zprop_type_t proptype;
3880 
3881 	mutex_enter(&spa->spa_props_lock);
3882 
3883 	elem = NULL;
3884 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
3885 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
3886 		case ZPOOL_PROP_VERSION:
3887 			/*
3888 			 * Only set version for non-zpool-creation cases
3889 			 * (set/import). spa_create() needs special care
3890 			 * for version setting.
3891 			 */
3892 			if (tx->tx_txg != TXG_INITIAL) {
3893 				VERIFY(nvpair_value_uint64(elem,
3894 				    &intval) == 0);
3895 				ASSERT(intval <= SPA_VERSION);
3896 				ASSERT(intval >= spa_version(spa));
3897 				spa->spa_uberblock.ub_version = intval;
3898 				vdev_config_dirty(spa->spa_root_vdev);
3899 			}
3900 			break;
3901 
3902 		case ZPOOL_PROP_ALTROOT:
3903 			/*
3904 			 * 'altroot' is a non-persistent property. It should
3905 			 * have been set temporarily at creation or import time.
3906 			 */
3907 			ASSERT(spa->spa_root != NULL);
3908 			break;
3909 
3910 		case ZPOOL_PROP_CACHEFILE:
3911 			/*
3912 			 * 'cachefile' is also a non-persistent property.
3913 			 */
3914 			break;
3915 		default:
3916 			/*
3917 			 * Set pool property values in the poolprops mos object.
3918 			 */
3919 			if (spa->spa_pool_props_object == 0) {
3920 				objset_t *mos = spa->spa_meta_objset;
3921 
3922 				VERIFY((spa->spa_pool_props_object =
3923 				    zap_create(mos, DMU_OT_POOL_PROPS,
3924 				    DMU_OT_NONE, 0, tx)) > 0);
3925 
3926 				VERIFY(zap_update(mos,
3927 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
3928 				    8, 1, &spa->spa_pool_props_object, tx)
3929 				    == 0);
3930 			}
3931 
3932 			/* normalize the property name */
3933 			propname = zpool_prop_to_name(prop);
3934 			proptype = zpool_prop_get_type(prop);
3935 
3936 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
3937 				ASSERT(proptype == PROP_TYPE_STRING);
3938 				VERIFY(nvpair_value_string(elem, &strval) == 0);
3939 				VERIFY(zap_update(mos,
3940 				    spa->spa_pool_props_object, propname,
3941 				    1, strlen(strval) + 1, strval, tx) == 0);
3942 
3943 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
3944 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
3945 
3946 				if (proptype == PROP_TYPE_INDEX) {
3947 					const char *unused;
3948 					VERIFY(zpool_prop_index_to_string(
3949 					    prop, intval, &unused) == 0);
3950 				}
3951 				VERIFY(zap_update(mos,
3952 				    spa->spa_pool_props_object, propname,
3953 				    8, 1, &intval, tx) == 0);
3954 			} else {
3955 				ASSERT(0); /* not allowed */
3956 			}
3957 
3958 			switch (prop) {
3959 			case ZPOOL_PROP_DELEGATION:
3960 				spa->spa_delegation = intval;
3961 				break;
3962 			case ZPOOL_PROP_BOOTFS:
3963 				spa->spa_bootfs = intval;
3964 				break;
3965 			case ZPOOL_PROP_FAILUREMODE:
3966 				spa->spa_failmode = intval;
3967 				break;
3968 			default:
3969 				break;
3970 			}
3971 		}
3972 
3973 		/* log internal history if this is not a zpool create */
3974 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
3975 		    tx->tx_txg != TXG_INITIAL) {
3976 			spa_history_internal_log(LOG_POOL_PROPSET,
3977 			    spa, tx, cr, "%s %lld %s",
3978 			    nvpair_name(elem), intval, spa_name(spa));
3979 		}
3980 	}
3981 
3982 	mutex_exit(&spa->spa_props_lock);
3983 }
3984 
3985 /*
3986  * Sync the specified transaction group. New blocks may be dirtied as
3987  * part of the process, so we iterate until it converges.
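 *
 * [Editor's note -- not in the original source: each pass of the
 * convergence loop increments spa_sync_pass, and the dprintf() near the
 * end of this function reports how many passes the txg needed.]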
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;
	int error;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);
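
	/*
	 * Illustrative note (not part of the original source): each of
	 * the version checks above fires exactly once, in the txg during
	 * which the pool crosses the given SPA_VERSION_* threshold,
	 * because spa_ubsync still holds the previously synced uberblock
	 * while spa_uberblock already carries the new version.  A
	 * hypothetical future upgrade hook would follow the same pattern
	 * ('SPA_VERSION_EXAMPLE' and 'dsl_pool_upgrade_example' are
	 * made-up names):
	 *
	 *	if (spa->spa_ubsync.ub_version < SPA_VERSION_EXAMPLE &&
	 *	    spa->spa_uberblock.ub_version >= SPA_VERSION_EXAMPLE)
	 *		dsl_pool_upgrade_example(dp, tx);
	 */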
	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
		    != NULL) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);
			int c;

			for (c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
	    != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
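
/*
 * Illustrative note (not part of the original source): the label-sync
 * loop in spa_sync() scans the top-level vdevs starting at a random
 * child and wrapping around; with children == 4 and c0 == 2, for
 * example, the visit order is 2, 3, 0, 1.  Skipping vdevs whose
 * vdev_ms_array is 0 (not yet initialized) or which are log devices
 * keeps the uberblock on ordinary allocatable top-level vdevs, and
 * capping svdcount at SPA_DVAS_PER_BP bounds the work per txg.  If
 * vdev_config_sync() fails, the pool is suspended until I/O can resume,
 * then the label write is retried until it succeeds.
 */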
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}
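
/*
 * Illustrative sketch (not part of the original source): both
 * spa_sync_allpools() and spa_evict_all() above use the same idiom for
 * calling into code that may itself need spa_namespace_lock: pin the
 * spa_t with a reference, drop the lock for the blocking call, then
 * reacquire the lock before releasing the hold.  The skeleton, with
 * 'blocking_operation' standing in for the real call:
 *
 *	spa_open_ref(spa, FTAG);
 *	mutex_exit(&spa_namespace_lock);
 *	blocking_operation(spa);
 *	mutex_enter(&spa_namespace_lock);
 *	spa_close(spa, FTAG);
 */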
/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2 (one hold as a spare
 * and one as a replacement), so a count above 2 means the spare is also
 * in use by another pool.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
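
/*
 * Illustrative sketch (not part of the original source): a caller posts
 * an event by passing one of the EC_ZFS subclass names from
 * sys/sysevent/eventdefs.h, for example to announce the start of a
 * resilver (assuming ESC_ZFS_RESILVER_START is among the defined
 * subclasses):
 *
 *	spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
 *
 * Passing a non-NULL vdev additionally attaches ZFS_EV_VDEV_GUID and,
 * when available, ZFS_EV_VDEV_PATH to the payload.
 */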