/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017 Datto Inc.
 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/bpobj.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>
#include <sys/abd.h>

#ifdef	_KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,		/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,		/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,		/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads
 * per-taskq and the number of taskqs; when dispatching an event in this case,
 * the particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};
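
/*
 * To make the table concrete (an illustrative reading, assuming zio.c's
 * zio_type_name[] entries such as "zio_read"): the READ row's ZTI_P(12, 8)
 * for INTR means spa_taskqs_init() creates eight discrete taskqs,
 * "zio_read_intr_0" through "zio_read_intr_7", each with 12 threads, and
 * spa_taskq_dispatch_ent() picks one of the eight at random per dispatch.
 * ZTI_N(8) for ISSUE is shorthand for ZTI_P(8, 1): a single "zio_read_issue"
 * taskq with 8 threads. ZTI_NULL creates no taskq at all, so dispatching to
 * READ/ISSUE_HIGH would be a programming error.
 */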

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
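
/*
 * A minimal usage sketch (assumed, not copied from a caller below): a
 * non-NULL strval takes precedence over intval, so
 *
 *	spa_prop_add_list(nvp, ZPOOL_PROP_CACHEFILE, "none", 0,
 *	    ZPROP_SRC_LOCAL);
 *	spa_prop_add_list(nvp, ZPOOL_PROP_SIZE, NULL, size, ZPROP_SRC_NONE);
 *
 * produces nested nvlists of the form
 *
 *	"cachefile" -> { ZPROP_SOURCE = ZPROP_SRC_LOCAL, ZPROP_VALUE = "none" }
 *	"size"      -> { ZPROP_SOURCE = ZPROP_SRC_NONE,  ZPROP_VALUE = <size> }
 */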

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools created before this version, dp_free_dir
		 * will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
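
/*
 * Callers own the returned nvlist.  A typical (hypothetical) consumer looks
 * like:
 *
 *	nvlist_t *nvp;
 *	if (spa_prop_get(spa, &nvp) == 0) {
 *		... read properties out of nvp ...
 *		nvlist_free(nvp);
 *	}
 *
 * On failure the nvlist has already been freed and *nvp set to NULL above,
 * so no cleanup is required in the error path.
 */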

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPOOL_PROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}
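
/*
 * For instance (a hypothetical caller, sketched for illustration): enabling
 * a feature arrives as "feature@<name>" with a uint64 payload of 0, which
 * lands in the ZPOOL_PROP_INVAL arm above:
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_uint64(props, "feature@async_destroy", 0);
 *	error = spa_prop_set(spa, props);	// validates, then syncs
 *	fnvlist_free(props);
 *
 * Note also the bootfs side effect: on success, the dataset name the caller
 * supplied has been replaced in 'props' by the dataset's object number.
 */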

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_write_cachefile(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}
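
/*
 * The check/sync pair above runs as a single dsl_sync_task, illustrating
 * the usual pattern: the _check function can veto the operation from open
 * context with an error (here, ENXIO unless every vdev is healthy), and
 * the _sync function then applies the change in syncing context.  This
 * entry point is what "zpool reguid" ultimately invokes via the
 * pool-reguid ioctl.
 */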

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
		}

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}
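
/*
 * A representative dispatch, sketched for illustration (zio.c's
 * zio_taskq_dispatch() is the real caller): hand a completed read's
 * interrupt-side work to one of the zio_read_intr_* taskqs:
 *
 *	spa_taskq_dispatch_ent(spa, ZIO_TYPE_READ, ZIO_TASKQ_INTERRUPT,
 *	    (task_func_t *)zio_execute, zio, 0, &zio->io_tqent);
 *
 * Because the taskq_ent_t is pre-allocated (embedded in the zio), the
 * dispatch itself cannot fail for lack of memory.
 */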

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#ifdef _KERNEL
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif
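
/*
 * Taken together, spa_activate(), spa_thread() (kernel only), and
 * spa_deactivate() drive spa_proc_state through a small state machine
 * (summarized here for reference; the functions themselves are the
 * authoritative logic):
 *
 *	SPA_PROC_NONE       -> SPA_PROC_CREATED     (spa_activate)
 *	SPA_PROC_CREATED    -> SPA_PROC_ACTIVE      (spa_thread)
 *	SPA_PROC_ACTIVE     -> SPA_PROC_DEACTIVATE  (spa_deactivate)
 *	SPA_PROC_DEACTIVATE -> SPA_PROC_GONE        (spa_thread)
 *	SPA_PROC_GONE       -> SPA_PROC_NONE        (spa_deactivate)
 *
 * Every transition happens under spa_proc_lock and is announced with
 * cv_broadcast(&spa->spa_proc_cv), which is how the creating thread and
 * spa_thread() hand off to each other in lockstep.
 */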

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	for (size_t i = 0; i < TXG_SIZE; i++)
		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	for (size_t i = 0; i < TXG_SIZE; i++) {
		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
		spa->spa_txg_zio[i] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
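
/*
 * The recursion above mirrors the shape of the config nvlist.  For a pool
 * with a single two-way mirror, the tree handed in looks roughly like
 * (sketched; a real config carries many more pairs per vdev):
 *
 *	vdev_tree: { type = "root",
 *	    children[0] = { type = "mirror",
 *		children[0] = { type = "disk", path = "/dev/dsk/..." },
 *		children[1] = { type = "disk", path = "/dev/dsk/..." } } }
 *
 * spa_config_parse() allocates one vdev_t per nvlist node, wiring each child
 * to its parent; leaf vdevs ("disk", "file") terminate the recursion.
 */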

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_load_note(spa, "UNLOADING");

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Even though vdev_free() also calls vdev_metaslab_fini, we need
	 * to call it earlier, before we wait for async i/o to complete.
	 * This ensures that there is no async metaslab prefetching, by
	 * calling taskq_wait(mg_taskq).
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	if (spa->spa_vdev_removal != NULL) {
		spa_vdev_removal_destroy(spa->spa_vdev_removal);
		spa->spa_vdev_removal = NULL;
	}

	spa_condense_fini(spa);

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa->spa_indirect_vdevs_loaded = B_FALSE;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different
	 * vdev_t structures associated with it: one in the list of spares
	 * (used only for basic validation purposes) and one in the active
	 * vdev configuration (if it's spared in).  During this phase we open
	 * and validate each vdev on the spare list.  If the vdev also exists
	 * in the active configuration, then we also mark this vdev as an
	 * active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool.  When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error != 0)
		return (error);

	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
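
/*
 * Packed-nvlist objects in the MOS follow a simple layout, which is the
 * on-disk convention load_nvlist() relies on: the object's bonus buffer
 * holds a single uint64_t giving the packed size, and the object's data
 * blocks hold that many bytes of packed nvlist.  Reading one is therefore
 * a bonus read for the size followed by a dmu_read() and nvlist_unpack(),
 * exactly as above.
 */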

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    vdev_is_concrete(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
	}
}

static void
spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
{
	ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);

	vd->vdev_top_zap = mvd->vdev_top_zap;
	vd->vdev_leaf_zap = mvd->vdev_leaf_zap;

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd). Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * Also trust the MOS config about any "indirect" vdevs.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if ((tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) ||
		    (mtvd->vdev_ops == &vdev_indirect_ops &&
		    tvd->vdev_ops != &vdev_indirect_ops)) {

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				if (!(spa->spa_import_flags &
				    ZFS_IMPORT_MISSING_LOG)) {
					continue;
				}

				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else if (mtvd->vdev_ops != &vdev_indirect_ops) {
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			vdev_reopen(rvd);
		} else {
			if (mtvd->vdev_islog) {
				/*
				 * Load the slog device's state from the MOS
				 * config since it's possible that the label
				 * does not contain the most up-to-date
				 * information.
				 */
				vdev_load_log_state(tvd, mtvd);
				vdev_reopen(tvd);
			}

			/*
			 * Per-vdev ZAP info is stored exclusively in the MOS.
			 */
			spa_config_valid_zaps(tvd, mtvd);
		}

		/*
		 * Never trust this info from userland; always use what's
		 * in the MOS.  This prevents it from getting out of sync
		 * with the rest of the info in the MOS.
		 */
		tvd->vdev_removing = mtvd->vdev_removing;
		tvd->vdev_indirect_config = mtvd->vdev_indirect_config;
	}

	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;
	dsl_pool_t *dp = spa_get_dsl(spa);

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_reset_logs(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_reset,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	for (int i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;
	spa_t *spa = zio->io_spa;

	abd_free(zio->io_abd);
	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_inc_64(&sle->sle_meta_count);
		else
			atomic_inc_64(&sle->sle_data_count);
	}

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Maximum number of concurrent scrub i/os to create while verifying
 * a pool while importing it.
 */
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
		return (0);
	/*
	 * Note: normally this routine will not be called if
	 * spa_load_verify_metadata is not set.  However, it may be useful
	 * to manually set the flag after the traversal has begun.
	 */
	if (!spa_load_verify_metadata)
		return (0);
	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
		return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	return (0);
}

/* ARGSUSED */
int
verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));

	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error = 0;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
	error = dmu_objset_find_dp(spa->spa_dsl_pool,
	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
	    DS_FIND_CHILDREN);
	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
	if (error != 0)
		return (error);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	if (spa_load_verify_metadata) {
		if (spa->spa_extreme_rewind) {
			spa_load_note(spa, "performing a complete scan of the "
			    "pool since extreme rewind is on. This may take "
			    "a very long time.\n  (spa_load_verify_data=%u, "
			    "spa_load_verify_metadata=%u)",
			    spa_load_verify_data, spa_load_verify_metadata);
		}
		error = traverse_pool(spa, spa->spa_verify_min_txg,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    spa_load_verify_cb, rio);
	}

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
{
	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, sizeof (uint64_t), 1, val);

	if (error != 0 && (error != ENOENT || log_enoent)) {
		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
		    "[error=%d]", name, error);
	}

	return (error);
}
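
/*
 * A representative lookup (a sketch, not copied from a caller below; the
 * names come from the DMU_POOL_* set in dmu.h):
 *
 *	uint64_t props_obj;
 *	if (spa_dir_prop(spa, DMU_POOL_PROPS, &props_obj, B_FALSE) == 0)
 *		spa->spa_pool_props_object = props_obj;
 *
 * Passing log_enoent == B_FALSE keeps a merely-missing entry, which is
 * common for older pools, from being logged as a load failure.
 */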
2047 */ 2048 static void 2049 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2050 { 2051 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2052 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2053 } 2054 2055 /* 2056 * Find a value in the pool directory object. 2057 */ 2058 static int 2059 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2060 { 2061 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2062 name, sizeof (uint64_t), 1, val); 2063 2064 if (error != 0 && (error != ENOENT || log_enoent)) { 2065 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2066 "[error=%d]", name, error); 2067 } 2068 2069 return (error); 2070 } 2071 2072 static int 2073 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2074 { 2075 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2076 return (SET_ERROR(err)); 2077 } 2078 2079 /* 2080 * Fix up config after a partly-completed split. This is done with the 2081 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2082 * pool have that entry in their config, but only the splitting one contains 2083 * a list of all the guids of the vdevs that are being split off. 2084 * 2085 * This function determines what to do with that list: either rejoin 2086 * all the disks to the pool, or complete the splitting process. To attempt 2087 * the rejoin, each disk that is offlined is marked online again, and 2088 * we do a reopen() call. If the vdev label for every disk that was 2089 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2090 * then we call vdev_split() on each disk, and complete the split. 2091 * 2092 * Otherwise we leave the config alone, with all the vdevs in place in 2093 * the original pool. 2094 */ 2095 static void 2096 spa_try_repair(spa_t *spa, nvlist_t *config) 2097 { 2098 uint_t extracted; 2099 uint64_t *glist; 2100 uint_t i, gcount; 2101 nvlist_t *nvl; 2102 vdev_t **vd; 2103 boolean_t attempt_reopen; 2104 2105 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2106 return; 2107 2108 /* check that the config is complete */ 2109 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2110 &glist, &gcount) != 0) 2111 return; 2112 2113 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2114 2115 /* attempt to online all the vdevs & validate */ 2116 attempt_reopen = B_TRUE; 2117 for (i = 0; i < gcount; i++) { 2118 if (glist[i] == 0) /* vdev is hole */ 2119 continue; 2120 2121 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2122 if (vd[i] == NULL) { 2123 /* 2124 * Don't bother attempting to reopen the disks; 2125 * just do the split. 2126 */ 2127 attempt_reopen = B_FALSE; 2128 } else { 2129 /* attempt to re-online it */ 2130 vd[i]->vdev_offline = B_FALSE; 2131 } 2132 } 2133 2134 if (attempt_reopen) { 2135 vdev_reopen(spa->spa_root_vdev); 2136 2137 /* check each device to see what state it's in */ 2138 for (extracted = 0, i = 0; i < gcount; i++) { 2139 if (vd[i] != NULL && 2140 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2141 break; 2142 ++extracted; 2143 } 2144 } 2145 2146 /* 2147 * If every disk has been moved to the new pool, or if we never 2148 * even attempted to look at them, then we split them off for 2149 * good. 
2150 */ 2151 if (!attempt_reopen || gcount == extracted) { 2152 for (i = 0; i < gcount; i++) 2153 if (vd[i] != NULL) 2154 vdev_split(vd[i]); 2155 vdev_reopen(spa->spa_root_vdev); 2156 } 2157 2158 kmem_free(vd, gcount * sizeof (vdev_t *)); 2159 } 2160 2161 static int 2162 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2163 boolean_t trust_config) 2164 { 2165 nvlist_t *config = spa->spa_config; 2166 char *ereport = FM_EREPORT_ZFS_POOL; 2167 char *comment; 2168 int error; 2169 uint64_t pool_guid; 2170 nvlist_t *nvl; 2171 2172 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2173 return (SET_ERROR(EINVAL)); 2174 2175 ASSERT(spa->spa_comment == NULL); 2176 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2177 spa->spa_comment = spa_strdup(comment); 2178 2179 /* 2180 * Versioning wasn't explicitly added to the label until later, so if 2181 * it's not present treat it as the initial version. 2182 */ 2183 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2184 &spa->spa_ubsync.ub_version) != 0) 2185 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2186 2187 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2188 &spa->spa_config_txg); 2189 2190 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2191 spa_guid_exists(pool_guid, 0)) { 2192 error = SET_ERROR(EEXIST); 2193 } else { 2194 spa->spa_config_guid = pool_guid; 2195 2196 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2197 &nvl) == 0) { 2198 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2199 KM_SLEEP) == 0); 2200 } 2201 2202 nvlist_free(spa->spa_load_info); 2203 spa->spa_load_info = fnvlist_alloc(); 2204 2205 gethrestime(&spa->spa_loaded_ts); 2206 error = spa_load_impl(spa, pool_guid, config, state, type, 2207 trust_config, &ereport); 2208 } 2209 2210 /* 2211 * Don't count references from objsets that are already closed 2212 * and are making their way through the eviction process. 2213 */ 2214 spa_evicting_os_wait(spa); 2215 spa->spa_minref = refcount_count(&spa->spa_refcount); 2216 if (error) { 2217 if (error != EEXIST) { 2218 spa->spa_loaded_ts.tv_sec = 0; 2219 spa->spa_loaded_ts.tv_nsec = 0; 2220 } 2221 if (error != EBADF) { 2222 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2223 } 2224 } 2225 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2226 spa->spa_ena = 0; 2227 2228 return (error); 2229 } 2230 2231 /* 2232 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2233 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2234 * spa's per-vdev ZAP list. 
2235 */ 2236 static uint64_t 2237 vdev_count_verify_zaps(vdev_t *vd) 2238 { 2239 spa_t *spa = vd->vdev_spa; 2240 uint64_t total = 0; 2241 if (vd->vdev_top_zap != 0) { 2242 total++; 2243 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2244 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2245 } 2246 if (vd->vdev_leaf_zap != 0) { 2247 total++; 2248 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2249 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2250 } 2251 2252 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2253 total += vdev_count_verify_zaps(vd->vdev_child[i]); 2254 } 2255 2256 return (total); 2257 } 2258 2259 static int 2260 spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2261 spa_import_type_t type) 2262 { 2263 int error = 0; 2264 nvlist_t *nvtree = NULL; 2265 int parse; 2266 vdev_t *rvd; 2267 2268 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 2269 spa_load_failed(spa, "invalid config provided: '%s' missing", 2270 ZPOOL_CONFIG_VDEV_TREE); 2271 return (SET_ERROR(EINVAL)); 2272 } 2273 2274 parse = (type == SPA_IMPORT_EXISTING ? 2275 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2276 2277 /* 2278 * Create "The Godfather" zio to hold all async IOs 2279 */ 2280 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2281 KM_SLEEP); 2282 for (int i = 0; i < max_ncpus; i++) { 2283 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2284 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2285 ZIO_FLAG_GODFATHER); 2286 } 2287 2288 /* 2289 * Parse the configuration into a vdev tree. We explicitly set the 2290 * value that will be returned by spa_version() since parsing the 2291 * configuration requires knowing the version number. 2292 */ 2293 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2294 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 2295 spa_config_exit(spa, SCL_ALL, FTAG); 2296 2297 if (error != 0) { 2298 spa_load_failed(spa, "unable to parse config [error=%d]", 2299 error); 2300 return (error); 2301 } 2302 2303 ASSERT(spa->spa_root_vdev == rvd); 2304 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2305 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2306 2307 if (type != SPA_IMPORT_ASSEMBLE) { 2308 ASSERT(spa_guid(spa) == pool_guid); 2309 } 2310 2311 return (0); 2312 } 2313 2314 static int 2315 spa_ld_open_vdevs(spa_t *spa) 2316 { 2317 int error = 0; 2318 2319 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2320 error = vdev_open(spa->spa_root_vdev); 2321 spa_config_exit(spa, SCL_ALL, FTAG); 2322 if (error != 0) { 2323 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 2324 error); 2325 } 2326 2327 return (error); 2328 } 2329 2330 static int 2331 spa_ld_validate_vdevs(spa_t *spa, spa_import_type_t type, 2332 boolean_t trust_config) 2333 { 2334 int error = 0; 2335 vdev_t *rvd = spa->spa_root_vdev; 2336 2337 /* 2338 * We need to validate the vdev labels against the configuration that 2339 * we have in hand, which is dependent on the setting of trust_config. 2340 * If trust_config is true then we're validating the vdev labels based 2341 * on that config. Otherwise, we're validating against the cached 2342 * config (zpool.cache) that was read when we loaded the zfs module, and 2343 * then later we will recursively call spa_load() and validate against 2344 * the vdev config. 2345 * 2346 * If we're assembling a new pool that's been split off from an 2347 * existing pool, the labels haven't yet been updated so we skip 2348 * validation for now. 
2349 */ 2350 if (type != SPA_IMPORT_ASSEMBLE) { 2351 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2352 error = vdev_validate(rvd, trust_config); 2353 spa_config_exit(spa, SCL_ALL, FTAG); 2354 2355 if (error != 0) { 2356 spa_load_failed(spa, "vdev_validate failed [error=%d]", 2357 error); 2358 return (error); 2359 } 2360 2361 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 2362 spa_load_failed(spa, "cannot open vdev tree after " 2363 "invalidating some vdevs"); 2364 return (SET_ERROR(ENXIO)); 2365 } 2366 } 2367 2368 return (0); 2369 } 2370 2371 static int 2372 spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type, 2373 boolean_t trust_config) 2374 { 2375 vdev_t *rvd = spa->spa_root_vdev; 2376 nvlist_t *label; 2377 uberblock_t *ub = &spa->spa_uberblock; 2378 uint64_t children; 2379 2380 /* 2381 * Find the best uberblock. 2382 */ 2383 vdev_uberblock_load(rvd, ub, &label); 2384 2385 /* 2386 * If we weren't able to find a single valid uberblock, return failure. 2387 */ 2388 if (ub->ub_txg == 0) { 2389 nvlist_free(label); 2390 spa_load_failed(spa, "no valid uberblock found"); 2391 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2392 } 2393 2394 spa_load_note(spa, "using uberblock with txg=%llu", 2395 (u_longlong_t)ub->ub_txg); 2396 2397 /* 2398 * If the pool has an unsupported version we can't open it. 2399 */ 2400 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2401 nvlist_free(label); 2402 spa_load_failed(spa, "version %llu is not supported", 2403 (u_longlong_t)ub->ub_version); 2404 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2405 } 2406 2407 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2408 nvlist_t *features; 2409 2410 /* 2411 * If we weren't able to find what's necessary for reading the 2412 * MOS in the label, return failure. 2413 */ 2414 if (label == NULL) { 2415 spa_load_failed(spa, "label config unavailable"); 2416 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2417 ENXIO)); 2418 } 2419 2420 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 2421 &features) != 0) { 2422 nvlist_free(label); 2423 spa_load_failed(spa, "invalid label: '%s' missing", 2424 ZPOOL_CONFIG_FEATURES_FOR_READ); 2425 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2426 ENXIO)); 2427 } 2428 2429 /* 2430 * Update our in-core representation with the definitive values 2431 * from the label. 2432 */ 2433 nvlist_free(spa->spa_label_features); 2434 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2435 } 2436 2437 nvlist_free(label); 2438 2439 /* 2440 * Look through entries in the label nvlist's features_for_read. If 2441 * there is a feature listed there which we don't understand then we 2442 * cannot open a pool. 
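 * For reference, features_for_read is a flat nvlist keyed by feature
 * guid, e.g. (illustrative contents only):
 *
 *	features_for_read:
 *		com.delphix:hole_birth
 *		com.delphix:embedded_data
 *
 * Each name is tested with zfeature_is_supported(); any unrecognized
 * names are collected into ZPOOL_CONFIG_UNSUP_FEAT for userland to
 * report.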
2443 */ 2444 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2445 nvlist_t *unsup_feat; 2446 2447 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2448 0); 2449 2450 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2451 NULL); nvp != NULL; 2452 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2453 if (!zfeature_is_supported(nvpair_name(nvp))) { 2454 VERIFY(nvlist_add_string(unsup_feat, 2455 nvpair_name(nvp), "") == 0); 2456 } 2457 } 2458 2459 if (!nvlist_empty(unsup_feat)) { 2460 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2461 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2462 nvlist_free(unsup_feat); 2463 spa_load_failed(spa, "some features are unsupported"); 2464 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2465 ENOTSUP)); 2466 } 2467 2468 nvlist_free(unsup_feat); 2469 } 2470 2471 /* 2472 * If the vdev guid sum doesn't match the uberblock, we have an 2473 * incomplete configuration. We first check to see if the pool 2474 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN). 2475 * If it is, defer the vdev_guid_sum check till later so we 2476 * can handle missing vdevs. 2477 */ 2478 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2479 &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE && 2480 rvd->vdev_guid_sum != ub->ub_guid_sum) { 2481 spa_load_failed(spa, "guid sum in config doesn't match guid " 2482 "sum in uberblock (%llu != %llu)", 2483 (u_longlong_t)rvd->vdev_guid_sum, 2484 (u_longlong_t)ub->ub_guid_sum); 2485 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2486 } 2487 2488 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2489 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2490 spa_try_repair(spa, config); 2491 spa_config_exit(spa, SCL_ALL, FTAG); 2492 nvlist_free(spa->spa_config_splitting); 2493 spa->spa_config_splitting = NULL; 2494 } 2495 2496 /* 2497 * Initialize internal SPA structures. 2498 */ 2499 spa->spa_state = POOL_STATE_ACTIVE; 2500 spa->spa_ubsync = spa->spa_uberblock; 2501 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2502 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2503 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 2504 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2505 spa->spa_claim_max_txg = spa->spa_first_txg; 2506 spa->spa_prev_software_version = ub->ub_software_version; 2507 2508 return (0); 2509 } 2510 2511 static int 2512 spa_ld_open_rootbp(spa_t *spa) 2513 { 2514 int error = 0; 2515 vdev_t *rvd = spa->spa_root_vdev; 2516 2517 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2518 if (error != 0) { 2519 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 2520 "[error=%d]", error); 2521 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2522 } 2523 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2524 2525 return (0); 2526 } 2527 2528 static int 2529 spa_ld_validate_config(spa_t *spa, spa_import_type_t type) 2530 { 2531 vdev_t *rvd = spa->spa_root_vdev; 2532 2533 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 2534 != 0) 2535 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2536 2537 /* 2538 * Validate the config, using the MOS config to fill in any 2539 * information which might be missing. If we fail to validate 2540 * the config then declare the pool unfit for use. If we're 2541 * assembling a pool from a split, the log is not transferred 2542 * over.
2543 */ 2544 if (type != SPA_IMPORT_ASSEMBLE) { 2545 nvlist_t *mos_config; 2546 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 2547 != 0) { 2548 spa_load_failed(spa, "unable to retrieve MOS config"); 2549 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2550 } 2551 2552 if (!spa_config_valid(spa, mos_config)) { 2553 nvlist_free(mos_config); 2554 spa_load_failed(spa, "mismatch between config provided " 2555 "and config stored in MOS"); 2556 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2557 ENXIO)); 2558 } 2559 nvlist_free(mos_config); 2560 2561 /* 2562 * Now that we've validated the config, check the state of the 2563 * root vdev. If it can't be opened, it indicates one or 2564 * more toplevel vdevs are faulted. 2565 */ 2566 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 2567 spa_load_failed(spa, "some top vdevs are unavailable"); 2568 return (SET_ERROR(ENXIO)); 2569 } 2570 } 2571 2572 return (0); 2573 } 2574 2575 static int 2576 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 2577 { 2578 int error = 0; 2579 vdev_t *rvd = spa->spa_root_vdev; 2580 2581 /* 2582 * Everything that we read before spa_remove_init() must be stored 2583 * on concrete vdevs. Therefore we do this as early as possible. 2584 */ 2585 error = spa_remove_init(spa); 2586 if (error != 0) { 2587 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 2588 error); 2589 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2590 } 2591 2592 /* 2593 * Retrieve information needed to condense indirect vdev mappings. 2594 */ 2595 error = spa_condense_init(spa); 2596 if (error != 0) { 2597 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 2598 error); 2599 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 2600 } 2601 2602 return (0); 2603 } 2604 2605 static int 2606 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 2607 { 2608 int error = 0; 2609 vdev_t *rvd = spa->spa_root_vdev; 2610 2611 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2612 boolean_t missing_feat_read = B_FALSE; 2613 nvlist_t *unsup_feat, *enabled_feat; 2614 2615 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2616 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 2617 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2618 } 2619 2620 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2621 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 2622 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2623 } 2624 2625 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2626 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 2627 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2628 } 2629 2630 enabled_feat = fnvlist_alloc(); 2631 unsup_feat = fnvlist_alloc(); 2632 2633 if (!spa_features_check(spa, B_FALSE, 2634 unsup_feat, enabled_feat)) 2635 missing_feat_read = B_TRUE; 2636 2637 if (spa_writeable(spa) || 2638 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 2639 if (!spa_features_check(spa, B_TRUE, 2640 unsup_feat, enabled_feat)) { 2641 *missing_feat_writep = B_TRUE; 2642 } 2643 } 2644 2645 fnvlist_add_nvlist(spa->spa_load_info, 2646 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2647 2648 if (!nvlist_empty(unsup_feat)) { 2649 fnvlist_add_nvlist(spa->spa_load_info, 2650 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2651 } 2652 2653 fnvlist_free(enabled_feat); 2654 fnvlist_free(unsup_feat); 2655 2656 if (!missing_feat_read) { 2657 fnvlist_add_boolean(spa->spa_load_info, 2658 ZPOOL_CONFIG_CAN_RDONLY); 2659 } 2660 2661 /* 2662 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2663 * twofold: to determine whether the pool is
available for 2664 * import in read-write mode and (if it is not) whether the 2665 * pool is available for import in read-only mode. If the pool 2666 * is available for import in read-write mode, it is displayed 2667 * as available in userland; if it is not available for import 2668 * in read-only mode, it is displayed as unavailable in 2669 * userland. If the pool is available for import in read-only 2670 * mode but not read-write mode, it is displayed as unavailable 2671 * in userland with a special note that the pool is actually 2672 * available for open in read-only mode. 2673 * 2674 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2675 * missing a feature for write, we must first determine whether 2676 * the pool can be opened read-only before returning to 2677 * userland in order to know whether to display the 2678 * abovementioned note. 2679 */ 2680 if (missing_feat_read || (*missing_feat_writep && 2681 spa_writeable(spa))) { 2682 spa_load_failed(spa, "pool uses unsupported features"); 2683 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2684 ENOTSUP)); 2685 } 2686 2687 /* 2688 * Load refcounts for ZFS features from disk into an in-memory 2689 * cache during SPA initialization. 2690 */ 2691 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2692 uint64_t refcount; 2693 2694 error = feature_get_refcount_from_disk(spa, 2695 &spa_feature_table[i], &refcount); 2696 if (error == 0) { 2697 spa->spa_feat_refcount_cache[i] = refcount; 2698 } else if (error == ENOTSUP) { 2699 spa->spa_feat_refcount_cache[i] = 2700 SPA_FEATURE_DISABLED; 2701 } else { 2702 spa_load_failed(spa, "error getting refcount " 2703 "for feature %s [error=%d]", 2704 spa_feature_table[i].fi_guid, error); 2705 return (spa_vdev_err(rvd, 2706 VDEV_AUX_CORRUPT_DATA, EIO)); 2707 } 2708 } 2709 } 2710 2711 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2712 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2713 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 2714 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2715 } 2716 2717 return (0); 2718 } 2719 2720 static int 2721 spa_ld_load_special_directories(spa_t *spa) 2722 { 2723 int error = 0; 2724 vdev_t *rvd = spa->spa_root_vdev; 2725 2726 spa->spa_is_initializing = B_TRUE; 2727 error = dsl_pool_open(spa->spa_dsl_pool); 2728 spa->spa_is_initializing = B_FALSE; 2729 if (error != 0) { 2730 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 2731 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2732 } 2733 2734 return (0); 2735 } 2736 2737 static int 2738 spa_ld_prepare_for_reload(spa_t *spa, int orig_mode) 2739 { 2740 vdev_t *rvd = spa->spa_root_vdev; 2741 2742 uint64_t hostid; 2743 nvlist_t *policy = NULL; 2744 nvlist_t *mos_config; 2745 2746 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 2747 spa_load_failed(spa, "unable to retrieve MOS config"); 2748 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2749 } 2750 2751 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 2752 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2753 char *hostname; 2754 unsigned long myhostid = 0; 2755 2756 VERIFY(nvlist_lookup_string(mos_config, 2757 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2758 2759 #ifdef _KERNEL 2760 myhostid = zone_get_hostid(NULL); 2761 #else /* _KERNEL */ 2762 /* 2763 * We're emulating the system's hostid in userland, so 2764 * we can't use zone_get_hostid(). 
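 * (In the userland build, hw_serial is expected to be supplied by the
 * libzpool environment, e.g. for ztest; that is an assumption about the
 * test harness rather than something enforced here.)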
2765 */ 2766 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2767 #endif /* _KERNEL */ 2768 if (hostid != 0 && myhostid != 0 && 2769 hostid != myhostid) { 2770 nvlist_free(mos_config); 2771 cmn_err(CE_WARN, "pool '%s' could not be " 2772 "loaded as it was last accessed by " 2773 "another system (host: %s hostid: 0x%lx). " 2774 "See: http://illumos.org/msg/ZFS-8000-EY", 2775 spa_name(spa), hostname, 2776 (unsigned long)hostid); 2777 return (SET_ERROR(EBADF)); 2778 } 2779 } 2780 if (nvlist_lookup_nvlist(spa->spa_config, 2781 ZPOOL_REWIND_POLICY, &policy) == 0) 2782 VERIFY(nvlist_add_nvlist(mos_config, 2783 ZPOOL_REWIND_POLICY, policy) == 0); 2784 2785 spa_config_set(spa, mos_config); 2786 spa_unload(spa); 2787 spa_deactivate(spa); 2788 spa_activate(spa, orig_mode); 2789 2790 return (0); 2791 } 2792 2793 static int 2794 spa_ld_get_props(spa_t *spa) 2795 { 2796 int error = 0; 2797 uint64_t obj; 2798 vdev_t *rvd = spa->spa_root_vdev; 2799 2800 /* Grab the secret checksum salt from the MOS. */ 2801 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2802 DMU_POOL_CHECKSUM_SALT, 1, 2803 sizeof (spa->spa_cksum_salt.zcs_bytes), 2804 spa->spa_cksum_salt.zcs_bytes); 2805 if (error == ENOENT) { 2806 /* Generate a new salt for subsequent use */ 2807 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2808 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2809 } else if (error != 0) { 2810 spa_load_failed(spa, "unable to retrieve checksum salt from " 2811 "MOS [error=%d]", error); 2812 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2813 } 2814 2815 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 2816 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2817 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2818 if (error != 0) { 2819 spa_load_failed(spa, "error opening deferred-frees bpobj " 2820 "[error=%d]", error); 2821 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2822 } 2823 2824 /* 2825 * Load the bit that tells us to use the new accounting function 2826 * (raid-z deflation). If we have an older pool, this will not 2827 * be present. 2828 */ 2829 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 2830 if (error != 0 && error != ENOENT) 2831 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2832 2833 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2834 &spa->spa_creation_version, B_FALSE); 2835 if (error != 0 && error != ENOENT) 2836 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2837 2838 /* 2839 * Load the persistent error log. If we have an older pool, this will 2840 * not be present. 2841 */ 2842 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 2843 B_FALSE); 2844 if (error != 0 && error != ENOENT) 2845 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2846 2847 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2848 &spa->spa_errlog_scrub, B_FALSE); 2849 if (error != 0 && error != ENOENT) 2850 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2851 2852 /* 2853 * Load the history object. If we have an older pool, this 2854 * will not be present. 2855 */ 2856 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 2857 if (error != 0 && error != ENOENT) 2858 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2859 2860 /* 2861 * Load the per-vdev ZAP map. 
If we have an older pool, this will not 2862 * be present; in this case, defer its creation to a later time to 2863 * avoid dirtying the MOS this early / out of sync context. See 2864 * spa_sync_config_object. 2865 */ 2866 2867 /* The sentinel is only available in the MOS config. */ 2868 nvlist_t *mos_config; 2869 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 2870 spa_load_failed(spa, "unable to retrieve MOS config"); 2871 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2872 } 2873 2874 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 2875 &spa->spa_all_vdev_zaps, B_FALSE); 2876 2877 if (error == ENOENT) { 2878 VERIFY(!nvlist_exists(mos_config, 2879 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 2880 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 2881 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2882 } else if (error != 0) { 2883 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2884 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 2885 /* 2886 * An older version of ZFS overwrote the sentinel value, so 2887 * we have orphaned per-vdev ZAPs in the MOS. Defer their 2888 * destruction to later; see spa_sync_config_object. 2889 */ 2890 spa->spa_avz_action = AVZ_ACTION_DESTROY; 2891 /* 2892 * We're assuming that no vdevs have had their ZAPs created 2893 * before this. Better be sure of it. 2894 */ 2895 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2896 } 2897 nvlist_free(mos_config); 2898 2899 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2900 2901 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 2902 B_FALSE); 2903 if (error && error != ENOENT) 2904 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2905 2906 if (error == 0) { 2907 uint64_t autoreplace; 2908 2909 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2910 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2911 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2912 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2913 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2914 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2915 &spa->spa_dedup_ditto); 2916 2917 spa->spa_autoreplace = (autoreplace != 0); 2918 } 2919 2920 return (0); 2921 } 2922 2923 static int 2924 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 2925 { 2926 int error = 0; 2927 vdev_t *rvd = spa->spa_root_vdev; 2928 2929 /* 2930 * If we're assembling the pool from the split-off vdevs of 2931 * an existing pool, we don't want to attach the spares & cache 2932 * devices. 2933 */ 2934 2935 /* 2936 * Load any hot spares for this pool. 2937 */ 2938 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 2939 B_FALSE); 2940 if (error != 0 && error != ENOENT) 2941 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2942 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2943 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2944 if (load_nvlist(spa, spa->spa_spares.sav_object, 2945 &spa->spa_spares.sav_config) != 0) { 2946 spa_load_failed(spa, "error loading spares nvlist"); 2947 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2948 } 2949 2950 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2951 spa_load_spares(spa); 2952 spa_config_exit(spa, SCL_ALL, FTAG); 2953 } else if (error == 0) { 2954 spa->spa_spares.sav_sync = B_TRUE; 2955 } 2956 2957 /* 2958 * Load any level 2 ARC devices for this pool. 
2959 */ 2960 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2961 &spa->spa_l2cache.sav_object, B_FALSE); 2962 if (error != 0 && error != ENOENT) 2963 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2964 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2965 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2966 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2967 &spa->spa_l2cache.sav_config) != 0) { 2968 spa_load_failed(spa, "error loading l2cache nvlist"); 2969 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2970 } 2971 2972 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2973 spa_load_l2cache(spa); 2974 spa_config_exit(spa, SCL_ALL, FTAG); 2975 } else if (error == 0) { 2976 spa->spa_l2cache.sav_sync = B_TRUE; 2977 } 2978 2979 return (0); 2980 } 2981 2982 static int 2983 spa_ld_load_vdev_metadata(spa_t *spa) 2984 { 2985 int error = 0; 2986 vdev_t *rvd = spa->spa_root_vdev; 2987 2988 /* 2989 * If the 'autoreplace' property is set, then post a resource notifying 2990 * the ZFS DE that it should not issue any faults for unopenable 2991 * devices. We also iterate over the vdevs, and post a sysevent for any 2992 * unopenable vdevs so that the normal autoreplace handler can take 2993 * over. 2994 */ 2995 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 2996 spa_check_removed(spa->spa_root_vdev); 2997 /* 2998 * For the import case, this is done in spa_import(), because 2999 * at this point we're using the spare definitions from 3000 * the MOS config, not necessarily from the userland config. 3001 */ 3002 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 3003 spa_aux_check_removed(&spa->spa_spares); 3004 spa_aux_check_removed(&spa->spa_l2cache); 3005 } 3006 } 3007 3008 /* 3009 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 3010 */ 3011 error = vdev_load(rvd); 3012 if (error != 0) { 3013 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 3014 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3015 } 3016 3017 /* 3018 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 3019 */ 3020 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3021 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 3022 spa_config_exit(spa, SCL_ALL, FTAG); 3023 3024 return (0); 3025 } 3026 3027 static int 3028 spa_ld_load_dedup_tables(spa_t *spa) 3029 { 3030 int error = 0; 3031 vdev_t *rvd = spa->spa_root_vdev; 3032 3033 error = ddt_load(spa); 3034 if (error != 0) { 3035 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 3036 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3037 } 3038 3039 return (0); 3040 } 3041 3042 static int 3043 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) 3044 { 3045 vdev_t *rvd = spa->spa_root_vdev; 3046 3047 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 3048 boolean_t missing = spa_check_logs(spa); 3049 if (missing) { 3050 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 3051 spa_load_failed(spa, "spa_check_logs failed"); 3052 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 3053 } 3054 } 3055 3056 return (0); 3057 } 3058 3059 static int 3060 spa_ld_verify_pool_data(spa_t *spa) 3061 { 3062 int error = 0; 3063 vdev_t *rvd = spa->spa_root_vdev; 3064 3065 /* 3066 * We've successfully opened the pool, verify that we're ready 3067 * to start pushing transactions. 
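 * Note that spa_load_verify() (defined earlier in this file) honors the
 * rewind policy: under ZPOOL_NEVER_REWIND it returns immediately, and
 * otherwise it may traverse the pool and compare the accumulated
 * meta/data error counts against the policy's zrp_maxmeta/zrp_maxdata
 * limits.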
3068 */ 3069 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3070 error = spa_load_verify(spa); 3071 if (error != 0) { 3072 spa_load_failed(spa, "spa_load_verify failed " 3073 "[error=%d]", error); 3074 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3075 error)); 3076 } 3077 } 3078 3079 return (0); 3080 } 3081 3082 static void 3083 spa_ld_claim_log_blocks(spa_t *spa) 3084 { 3085 dmu_tx_t *tx; 3086 dsl_pool_t *dp = spa_get_dsl(spa); 3087 3088 /* 3089 * Claim log blocks that haven't been committed yet. 3090 * This must all happen in a single txg. 3091 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 3092 * invoked from zil_claim_log_block()'s i/o done callback. 3093 * Price of rollback is that we abandon the log. 3094 */ 3095 spa->spa_claiming = B_TRUE; 3096 3097 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 3098 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 3099 zil_claim, tx, DS_FIND_CHILDREN); 3100 dmu_tx_commit(tx); 3101 3102 spa->spa_claiming = B_FALSE; 3103 3104 spa_set_log_state(spa, SPA_LOG_GOOD); 3105 } 3106 3107 static void 3108 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg) 3109 { 3110 vdev_t *rvd = spa->spa_root_vdev; 3111 int need_update = B_FALSE; 3112 3113 /* 3114 * If the config cache is stale, or we have uninitialized 3115 * metaslabs (see spa_vdev_add()), then update the config. 3116 * 3117 * If this is a verbatim import, trust the current 3118 * in-core spa_config and update the disk labels. 3119 */ 3120 if (config_cache_txg != spa->spa_config_txg || 3121 spa->spa_load_state == SPA_LOAD_IMPORT || 3122 spa->spa_load_state == SPA_LOAD_RECOVER || 3123 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 3124 need_update = B_TRUE; 3125 3126 for (int c = 0; c < rvd->vdev_children; c++) 3127 if (rvd->vdev_child[c]->vdev_ms_array == 0) 3128 need_update = B_TRUE; 3129 3130 /* 3131 * Update the config cache asynchronously in case we're the 3132 * root pool, in which case the config cache isn't writable yet. 3133 */ 3134 if (need_update) 3135 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3136 } 3137 3138 /* 3139 * Load an existing storage pool, using the config provided. This config 3140 * describes which vdevs are part of the pool and is later validated against 3141 * partial configs present in each vdev's label and an entire copy of the 3142 * config stored in the MOS. 3143 */ 3144 static int 3145 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 3146 spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, 3147 char **ereport) 3148 { 3149 int error = 0; 3150 uint64_t config_cache_txg = spa->spa_config_txg; 3151 int orig_mode = spa->spa_mode; 3152 boolean_t missing_feat_write = B_FALSE; 3153 3154 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3155 3156 spa->spa_load_state = state; 3157 spa_load_note(spa, "LOADING"); 3158 3159 /* 3160 * If this is an untrusted config, first access the pool in read-only 3161 * mode. We will then retrieve a trusted copy of the config from the MOS 3162 * and use it to reopen the pool in read-write mode. 3163 */ 3164 if (!trust_config) 3165 spa->spa_mode = FREAD; 3166 3167 /* 3168 * Parse the config provided to create a vdev tree. 3169 */ 3170 error = spa_ld_parse_config(spa, pool_guid, config, type); 3171 if (error != 0) 3172 return (error); 3173 3174 /* 3175 * Now that we have the vdev tree, try to open each vdev. This involves 3176 * opening the underlying physical device, retrieving its geometry and 3177 * probing the vdev with a dummy I/O.
The state of each vdev will be set 3178 * based on the success of those operations. After this we'll be ready 3179 * to read from the vdevs. 3180 */ 3181 error = spa_ld_open_vdevs(spa); 3182 if (error != 0) 3183 return (error); 3184 3185 /* 3186 * Read the label of each vdev and make sure that the GUIDs stored 3187 * there match the GUIDs in the config provided. 3188 */ 3189 error = spa_ld_validate_vdevs(spa, type, trust_config); 3190 if (error != 0) 3191 return (error); 3192 3193 /* 3194 * Read vdev labels to find the best uberblock (i.e. latest, unless 3195 * spa_load_max_txg is set) and store it in spa_uberblock. We get the 3196 * list of features required to read blkptrs in the MOS from the vdev 3197 * label with the best uberblock and verify that our version of zfs 3198 * supports them all. 3199 */ 3200 error = spa_ld_select_uberblock(spa, config, type, trust_config); 3201 if (error != 0) 3202 return (error); 3203 3204 /* 3205 * Pass that uberblock to the dsl_pool layer which will open the root 3206 * blkptr. This blkptr points to the latest version of the MOS and will 3207 * allow us to read its contents. 3208 */ 3209 error = spa_ld_open_rootbp(spa); 3210 if (error != 0) 3211 return (error); 3212 3213 /* 3214 * Retrieve the config stored in the MOS and use it to validate the 3215 * config provided. Also extract some information from the MOS config 3216 * to update our vdev tree. 3217 */ 3218 error = spa_ld_validate_config(spa, type); 3219 if (error != 0) 3220 return (error); 3221 3222 /* 3223 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 3224 * from the pool and their contents were re-mapped to other vdevs. Note 3225 * that everything that we read before this step must have been 3226 * rewritten on concrete vdevs after the last device removal was 3227 * initiated. Otherwise we could be reading from indirect vdevs before 3228 * we have loaded their mappings. 3229 */ 3230 error = spa_ld_open_indirect_vdev_metadata(spa); 3231 if (error != 0) 3232 return (error); 3233 3234 /* 3235 * Retrieve the full list of active features from the MOS and check if 3236 * they are all supported. 3237 */ 3238 error = spa_ld_check_features(spa, &missing_feat_write); 3239 if (error != 0) 3240 return (error); 3241 3242 /* 3243 * Load several special directories from the MOS needed by the dsl_pool 3244 * layer. 3245 */ 3246 error = spa_ld_load_special_directories(spa); 3247 if (error != 0) 3248 return (error); 3249 3250 /* 3251 * If the config provided is not trusted, discard it and use the config 3252 * from the MOS to reload the pool. 3253 */ 3254 if (!trust_config) { 3255 error = spa_ld_prepare_for_reload(spa, orig_mode); 3256 if (error != 0) 3257 return (error); 3258 3259 spa_load_note(spa, "RELOADING"); 3260 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 3261 } 3262 3263 /* 3264 * Retrieve pool properties from the MOS. 3265 */ 3266 error = spa_ld_get_props(spa); 3267 if (error != 0) 3268 return (error); 3269 3270 /* 3271 * Retrieve the list of auxiliary devices - cache devices and spares - 3272 * and open them. 3273 */ 3274 error = spa_ld_open_aux_vdevs(spa, type); 3275 if (error != 0) 3276 return (error); 3277 3278 /* 3279 * Load the metadata for all vdevs. Also check if unopenable devices 3280 * should be autoreplaced. 
3281 */ 3282 error = spa_ld_load_vdev_metadata(spa); 3283 if (error != 0) 3284 return (error); 3285 3286 error = spa_ld_load_dedup_tables(spa); 3287 if (error != 0) 3288 return (error); 3289 3290 /* 3291 * Verify the logs now to make sure we don't have any unexpected errors 3292 * when we claim log blocks later. 3293 */ 3294 error = spa_ld_verify_logs(spa, type, ereport); 3295 if (error != 0) 3296 return (error); 3297 3298 if (missing_feat_write) { 3299 ASSERT(state == SPA_LOAD_TRYIMPORT); 3300 3301 /* 3302 * At this point, we know that we can open the pool in 3303 * read-only mode but not read-write mode. We now have enough 3304 * information and can return to userland. 3305 */ 3306 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 3307 ENOTSUP)); 3308 } 3309 3310 /* 3311 * Traverse the last txgs to make sure the pool was left in a safe 3312 * state. When performing an extreme rewind, we verify the whole pool, 3313 * which can take a very long time. 3314 */ 3315 error = spa_ld_verify_pool_data(spa); 3316 if (error != 0) 3317 return (error); 3318 3319 /* 3320 * Calculate the deflated space for the pool. This must be done before 3321 * we write anything to the pool because we'd need to update the space 3322 * accounting using the deflated sizes. 3323 */ 3324 spa_update_dspace(spa); 3325 3326 /* 3327 * We have now retrieved all the information we needed to open the 3328 * pool. If we are importing the pool in read-write mode, a few 3329 * additional steps must be performed to finish the import. 3330 */ 3331 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 3332 spa->spa_load_max_txg == UINT64_MAX)) { 3333 ASSERT(state != SPA_LOAD_TRYIMPORT); 3334 3335 /* 3336 * We must check this before we start the sync thread, because 3337 * we only want to start a condense thread for condense 3338 * operations that were in progress when the pool was 3339 * imported. Once we start syncing, spa_sync() could 3340 * initiate a condense (and start a thread for it). In 3341 * that case it would be wrong to start a second 3342 * condense thread. 3343 */ 3344 boolean_t condense_in_progress = 3345 (spa->spa_condensing_indirect != NULL); 3346 3347 /* 3348 * Traverse the ZIL and claim all blocks. 3349 */ 3350 spa_ld_claim_log_blocks(spa); 3351 3352 /* 3353 * Kick off the syncing thread. 3354 */ 3355 spa->spa_sync_on = B_TRUE; 3356 txg_sync_start(spa->spa_dsl_pool); 3357 3358 /* 3359 * Wait for all claims to sync. We sync up to the highest 3360 * claimed log block birth time so that claimed log blocks 3361 * don't appear to be from the future. spa_claim_max_txg 3362 * will have been set for us by ZIL traversal operations 3363 * performed above. 3364 */ 3365 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 3366 3367 /* 3368 * Check if we need to request an update of the config. On the 3369 * next sync, we would update the config stored in vdev labels 3370 * and the cachefile (by default /etc/zfs/zpool.cache). 3371 */ 3372 spa_ld_check_for_config_update(spa, config_cache_txg); 3373 3374 /* 3375 * Check all DTLs to see if anything needs resilvering. 3376 */ 3377 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 3378 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 3379 spa_async_request(spa, SPA_ASYNC_RESILVER); 3380 3381 /* 3382 * Log the fact that we booted up (so that we can detect if 3383 * we rebooted in the middle of an operation). 3384 */ 3385 spa_history_log_version(spa, "open"); 3386 3387 /* 3388 * Delete any inconsistent datasets.
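 * (e.g. datasets left mid-destroy or mid-receive when the pool last
 * went down; the dmu_objset_find() below applies
 * dsl_destroy_inconsistent() to each child dataset.)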
3389 */ 3390 (void) dmu_objset_find(spa_name(spa), 3391 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 3392 3393 /* 3394 * Clean up any stale temporary dataset userrefs. 3395 */ 3396 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 3397 3398 /* 3399 * Note: unlike condensing, we don't need an analogous 3400 * "removal_in_progress" dance because no other thread 3401 * can start a removal while we hold the spa_namespace_lock. 3402 */ 3403 spa_restart_removal(spa); 3404 3405 if (condense_in_progress) 3406 spa_condense_indirect_restart(spa); 3407 } 3408 3409 spa_load_note(spa, "LOADED"); 3410 3411 return (0); 3412 } 3413 3414 static int 3415 spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config) 3416 { 3417 int mode = spa->spa_mode; 3418 3419 spa_unload(spa); 3420 spa_deactivate(spa); 3421 3422 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 3423 3424 spa_activate(spa, mode); 3425 spa_async_suspend(spa); 3426 3427 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 3428 (u_longlong_t)spa->spa_load_max_txg); 3429 3430 return (spa_load(spa, state, SPA_IMPORT_EXISTING, trust_config)); 3431 } 3432 3433 /* 3434 * If spa_load() fails, this function will try loading prior txgs. If 3435 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds, the pool 3436 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER, this 3437 * function will not rewind the pool and will return the same error as 3438 * spa_load(). 3439 */ 3440 static int 3441 spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config, 3442 uint64_t max_request, int rewind_flags) 3443 { 3444 nvlist_t *loadinfo = NULL; 3445 nvlist_t *config = NULL; 3446 int load_error, rewind_error; 3447 uint64_t safe_rewind_txg; 3448 uint64_t min_txg; 3449 3450 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 3451 spa->spa_load_max_txg = spa->spa_load_txg; 3452 spa_set_log_state(spa, SPA_LOG_CLEAR); 3453 } else { 3454 spa->spa_load_max_txg = max_request; 3455 if (max_request != UINT64_MAX) 3456 spa->spa_extreme_rewind = B_TRUE; 3457 } 3458 3459 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 3460 trust_config); 3461 if (load_error == 0) 3462 return (0); 3463 3464 if (spa->spa_root_vdev != NULL) 3465 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3466 3467 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 3468 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 3469 3470 if (rewind_flags & ZPOOL_NEVER_REWIND) { 3471 nvlist_free(config); 3472 return (load_error); 3473 } 3474 3475 if (state == SPA_LOAD_RECOVER) { 3476 /* Price of rolling back is discarding txgs, including log */ 3477 spa_set_log_state(spa, SPA_LOG_CLEAR); 3478 } else { 3479 /* 3480 * If we aren't rolling back, save the load info from our first 3481 * import attempt so that we can restore it after attempting 3482 * to rewind. 3483 */ 3484 loadinfo = spa->spa_load_info; 3485 spa->spa_load_info = fnvlist_alloc(); 3486 } 3487 3488 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 3489 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 3490 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
3491 TXG_INITIAL : safe_rewind_txg; 3492 3493 /* 3494 * Continue as long as we're finding errors, we're still within 3495 * the acceptable rewind range, and we're still finding uberblocks. 3496 */ 3497 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 3498 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 3499 if (spa->spa_load_max_txg < safe_rewind_txg) 3500 spa->spa_extreme_rewind = B_TRUE; 3501 rewind_error = spa_load_retry(spa, state, trust_config); 3502 } 3503 3504 spa->spa_extreme_rewind = B_FALSE; 3505 spa->spa_load_max_txg = UINT64_MAX; 3506 3507 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 3508 spa_config_set(spa, config); 3509 else 3510 nvlist_free(config); 3511 3512 if (state == SPA_LOAD_RECOVER) { 3513 ASSERT3P(loadinfo, ==, NULL); 3514 return (rewind_error); 3515 } else { 3516 /* Store the rewind info as part of the initial load info */ 3517 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 3518 spa->spa_load_info); 3519 3520 /* Restore the initial load info */ 3521 fnvlist_free(spa->spa_load_info); 3522 spa->spa_load_info = loadinfo; 3523 3524 return (load_error); 3525 } 3526 } 3527 3528 /* 3529 * Pool Open/Import 3530 * 3531 * The import case is identical to an open except that the configuration is sent 3532 * down from userland, instead of grabbed from the configuration cache. For the 3533 * case of an open, the pool configuration will exist in the 3534 * POOL_STATE_UNINITIALIZED state. 3535 * 3536 * The stats information (gen/count/ustats) is used to gather vdev statistics at 3537 * the same time we open the pool, without having to keep around the spa_t in 3538 * some ambiguous state. 3539 */ 3540 static int 3541 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 3542 nvlist_t **config) 3543 { 3544 spa_t *spa; 3545 spa_load_state_t state = SPA_LOAD_OPEN; 3546 int error; 3547 int locked = B_FALSE; 3548 3549 *spapp = NULL; 3550 3551 /* 3552 * As disgusting as this is, we need to support recursive calls to this 3553 * function because dsl_dir_open() is called during spa_load(), and ends 3554 * up calling spa_open() again. The real fix is to figure out how to 3555 * avoid dsl_dir_open() calling this in the first place. 3556 */ 3557 if (mutex_owner(&spa_namespace_lock) != curthread) { 3558 mutex_enter(&spa_namespace_lock); 3559 locked = B_TRUE; 3560 } 3561 3562 if ((spa = spa_lookup(pool)) == NULL) { 3563 if (locked) 3564 mutex_exit(&spa_namespace_lock); 3565 return (SET_ERROR(ENOENT)); 3566 } 3567 3568 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3569 zpool_rewind_policy_t policy; 3570 3571 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 3572 &policy); 3573 if (policy.zrp_request & ZPOOL_DO_REWIND) 3574 state = SPA_LOAD_RECOVER; 3575 3576 spa_activate(spa, spa_mode_global); 3577 3578 if (state != SPA_LOAD_RECOVER) 3579 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3580 3581 zfs_dbgmsg("spa_open_common: opening %s", pool); 3582 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3583 policy.zrp_request); 3584 3585 if (error == EBADF) { 3586 /* 3587 * If vdev_validate() returns failure (indicated by 3588 * EBADF), it means that one of the vdevs indicates 3589 * that the pool has been exported or destroyed. If 3590 * this is the case, the config cache is out of sync and 3591 * we should remove the pool from the namespace.
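 * (The spa_write_cachefile(spa, B_TRUE, B_TRUE) call below rewrites the
 * cachefile with this pool removed, so a subsequent open fails cleanly
 * with ENOENT instead of tripping over a stale cache entry.)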
3592 */ 3593 spa_unload(spa); 3594 spa_deactivate(spa); 3595 spa_write_cachefile(spa, B_TRUE, B_TRUE); 3596 spa_remove(spa); 3597 if (locked) 3598 mutex_exit(&spa_namespace_lock); 3599 return (SET_ERROR(ENOENT)); 3600 } 3601 3602 if (error) { 3603 /* 3604 * We can't open the pool, but we still have useful 3605 * information: the state of each vdev after the 3606 * attempted vdev_open(). Return this to the user. 3607 */ 3608 if (config != NULL && spa->spa_config) { 3609 VERIFY(nvlist_dup(spa->spa_config, config, 3610 KM_SLEEP) == 0); 3611 VERIFY(nvlist_add_nvlist(*config, 3612 ZPOOL_CONFIG_LOAD_INFO, 3613 spa->spa_load_info) == 0); 3614 } 3615 spa_unload(spa); 3616 spa_deactivate(spa); 3617 spa->spa_last_open_failed = error; 3618 if (locked) 3619 mutex_exit(&spa_namespace_lock); 3620 *spapp = NULL; 3621 return (error); 3622 } 3623 } 3624 3625 spa_open_ref(spa, tag); 3626 3627 if (config != NULL) 3628 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3629 3630 /* 3631 * If we've recovered the pool, pass back any information we 3632 * gathered while doing the load. 3633 */ 3634 if (state == SPA_LOAD_RECOVER) { 3635 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3636 spa->spa_load_info) == 0); 3637 } 3638 3639 if (locked) { 3640 spa->spa_last_open_failed = 0; 3641 spa->spa_last_ubsync_txg = 0; 3642 spa->spa_load_txg = 0; 3643 mutex_exit(&spa_namespace_lock); 3644 } 3645 3646 *spapp = spa; 3647 3648 return (0); 3649 } 3650 3651 int 3652 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3653 nvlist_t **config) 3654 { 3655 return (spa_open_common(name, spapp, tag, policy, config)); 3656 } 3657 3658 int 3659 spa_open(const char *name, spa_t **spapp, void *tag) 3660 { 3661 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3662 } 3663 3664 /* 3665 * Lookup the given spa_t, incrementing the inject count in the process, 3666 * preventing it from being exported or destroyed. 3667 */ 3668 spa_t * 3669 spa_inject_addref(char *name) 3670 { 3671 spa_t *spa; 3672 3673 mutex_enter(&spa_namespace_lock); 3674 if ((spa = spa_lookup(name)) == NULL) { 3675 mutex_exit(&spa_namespace_lock); 3676 return (NULL); 3677 } 3678 spa->spa_inject_ref++; 3679 mutex_exit(&spa_namespace_lock); 3680 3681 return (spa); 3682 } 3683 3684 void 3685 spa_inject_delref(spa_t *spa) 3686 { 3687 mutex_enter(&spa_namespace_lock); 3688 spa->spa_inject_ref--; 3689 mutex_exit(&spa_namespace_lock); 3690 } 3691 3692 /* 3693 * Add spare device information to the nvlist. 3694 */ 3695 static void 3696 spa_add_spares(spa_t *spa, nvlist_t *config) 3697 { 3698 nvlist_t **spares; 3699 uint_t i, nspares; 3700 nvlist_t *nvroot; 3701 uint64_t guid; 3702 vdev_stat_t *vs; 3703 uint_t vsc; 3704 uint64_t pool; 3705 3706 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3707 3708 if (spa->spa_spares.sav_count == 0) 3709 return; 3710 3711 VERIFY(nvlist_lookup_nvlist(config, 3712 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3713 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3714 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3715 if (nspares != 0) { 3716 VERIFY(nvlist_add_nvlist_array(nvroot, 3717 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3718 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3719 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3720 3721 /* 3722 * Go through and find any spares which have since been 3723 * repurposed as active spares. If this is the case, update 3724 * their status appropriately.
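 * Concretely, an in-use spare has its stats patched below to
 * VDEV_STATE_CANT_OPEN with VDEV_AUX_SPARED, which userland presents as
 * the spare being "in use" by another pool.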
3725 */ 3726 for (i = 0; i < nspares; i++) { 3727 VERIFY(nvlist_lookup_uint64(spares[i], 3728 ZPOOL_CONFIG_GUID, &guid) == 0); 3729 if (spa_spare_exists(guid, &pool, NULL) && 3730 pool != 0ULL) { 3731 VERIFY(nvlist_lookup_uint64_array( 3732 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3733 (uint64_t **)&vs, &vsc) == 0); 3734 vs->vs_state = VDEV_STATE_CANT_OPEN; 3735 vs->vs_aux = VDEV_AUX_SPARED; 3736 } 3737 } 3738 } 3739 } 3740 3741 /* 3742 * Add l2cache device information to the nvlist, including vdev stats. 3743 */ 3744 static void 3745 spa_add_l2cache(spa_t *spa, nvlist_t *config) 3746 { 3747 nvlist_t **l2cache; 3748 uint_t i, j, nl2cache; 3749 nvlist_t *nvroot; 3750 uint64_t guid; 3751 vdev_t *vd; 3752 vdev_stat_t *vs; 3753 uint_t vsc; 3754 3755 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3756 3757 if (spa->spa_l2cache.sav_count == 0) 3758 return; 3759 3760 VERIFY(nvlist_lookup_nvlist(config, 3761 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3762 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3763 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3764 if (nl2cache != 0) { 3765 VERIFY(nvlist_add_nvlist_array(nvroot, 3766 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3767 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3768 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3769 3770 /* 3771 * Update level 2 cache device stats. 3772 */ 3773 3774 for (i = 0; i < nl2cache; i++) { 3775 VERIFY(nvlist_lookup_uint64(l2cache[i], 3776 ZPOOL_CONFIG_GUID, &guid) == 0); 3777 3778 vd = NULL; 3779 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3780 if (guid == 3781 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3782 vd = spa->spa_l2cache.sav_vdevs[j]; 3783 break; 3784 } 3785 } 3786 ASSERT(vd != NULL); 3787 3788 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3789 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3790 == 0); 3791 vdev_get_stats(vd, vs); 3792 } 3793 } 3794 } 3795 3796 static void 3797 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3798 { 3799 nvlist_t *features; 3800 zap_cursor_t zc; 3801 zap_attribute_t za; 3802 3803 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3804 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3805 3806 if (spa->spa_feat_for_read_obj != 0) { 3807 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3808 spa->spa_feat_for_read_obj); 3809 zap_cursor_retrieve(&zc, &za) == 0; 3810 zap_cursor_advance(&zc)) { 3811 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3812 za.za_num_integers == 1); 3813 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3814 za.za_first_integer)); 3815 } 3816 zap_cursor_fini(&zc); 3817 } 3818 3819 if (spa->spa_feat_for_write_obj != 0) { 3820 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3821 spa->spa_feat_for_write_obj); 3822 zap_cursor_retrieve(&zc, &za) == 0; 3823 zap_cursor_advance(&zc)) { 3824 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3825 za.za_num_integers == 1); 3826 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3827 za.za_first_integer)); 3828 } 3829 zap_cursor_fini(&zc); 3830 } 3831 3832 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3833 features) == 0); 3834 nvlist_free(features); 3835 } 3836 3837 int 3838 spa_get_stats(const char *name, nvlist_t **config, 3839 char *altroot, size_t buflen) 3840 { 3841 int error; 3842 spa_t *spa; 3843 3844 *config = NULL; 3845 error = spa_open_common(name, &spa, FTAG, NULL, config); 3846 3847 if (spa != NULL) { 3848 /* 3849 * This still leaves a window of inconsistency where the spares 3850 * or l2cache devices could change and 
the config would be 3851 * self-inconsistent. 3852 */ 3853 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3854 3855 if (*config != NULL) { 3856 uint64_t loadtimes[2]; 3857 3858 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3859 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3860 VERIFY(nvlist_add_uint64_array(*config, 3861 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3862 3863 VERIFY(nvlist_add_uint64(*config, 3864 ZPOOL_CONFIG_ERRCOUNT, 3865 spa_get_errlog_size(spa)) == 0); 3866 3867 if (spa_suspended(spa)) 3868 VERIFY(nvlist_add_uint64(*config, 3869 ZPOOL_CONFIG_SUSPENDED, 3870 spa->spa_failmode) == 0); 3871 3872 spa_add_spares(spa, *config); 3873 spa_add_l2cache(spa, *config); 3874 spa_add_feature_stats(spa, *config); 3875 } 3876 } 3877 3878 /* 3879 * We want to get the alternate root even for faulted pools, so we cheat 3880 * and call spa_lookup() directly. 3881 */ 3882 if (altroot) { 3883 if (spa == NULL) { 3884 mutex_enter(&spa_namespace_lock); 3885 spa = spa_lookup(name); 3886 if (spa) 3887 spa_altroot(spa, altroot, buflen); 3888 else 3889 altroot[0] = '\0'; 3890 spa = NULL; 3891 mutex_exit(&spa_namespace_lock); 3892 } else { 3893 spa_altroot(spa, altroot, buflen); 3894 } 3895 } 3896 3897 if (spa != NULL) { 3898 spa_config_exit(spa, SCL_CONFIG, FTAG); 3899 spa_close(spa, FTAG); 3900 } 3901 3902 return (error); 3903 } 3904 3905 /* 3906 * Validate that the auxiliary device array is well formed. We must have an 3907 * array of nvlists, each of which describes a valid leaf vdev. If this is an 3908 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3909 * specified, as long as they are well-formed. 3910 */ 3911 static int 3912 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3913 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3914 vdev_labeltype_t label) 3915 { 3916 nvlist_t **dev; 3917 uint_t i, ndev; 3918 vdev_t *vd; 3919 int error; 3920 3921 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3922 3923 /* 3924 * It's acceptable to have no devs specified. 3925 */ 3926 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3927 return (0); 3928 3929 if (ndev == 0) 3930 return (SET_ERROR(EINVAL)); 3931 3932 /* 3933 * Make sure the pool is formatted with a version that supports this 3934 * device type. 3935 */ 3936 if (spa_version(spa) < version) 3937 return (SET_ERROR(ENOTSUP)); 3938 3939 /* 3940 * Set the pending device list so we correctly handle device in-use 3941 * checking. 3942 */ 3943 sav->sav_pending = dev; 3944 sav->sav_npending = ndev; 3945 3946 for (i = 0; i < ndev; i++) { 3947 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3948 mode)) != 0) 3949 goto out; 3950 3951 if (!vd->vdev_ops->vdev_op_leaf) { 3952 vdev_free(vd); 3953 error = SET_ERROR(EINVAL); 3954 goto out; 3955 } 3956 3957 /* 3958 * The L2ARC currently only supports disk devices in 3959 * kernel context. For user-level testing, we allow it.
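 * (That is, the ENOTBLK check below is compiled only under _KERNEL, so
 * userland consumers such as ztest, which build vdevs on plain files,
 * can still exercise cache devices.)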
/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (SET_ERROR(EINVAL));

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (SET_ERROR(ENOTSUP));

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = SET_ERROR(ENOTBLK);
			vdev_free(vd);
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
	}
}
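/*
 * How the aux-vdev helpers above fit together (summary of the callers seen
 * in this file): spa_create() and spa_vdev_add() run the user-supplied
 * nvroot through spa_validate_aux(), which walks the ZPOOL_CONFIG_SPARES
 * and ZPOOL_CONFIG_L2CACHE arrays via spa_validate_aux_devs();
 * spa_vdev_add() then merges any new entries into the existing lists with
 * spa_set_aux_vdevs().
 */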
/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version, obj;
	boolean_t has_features;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EEXIST));
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	has_features = B_FALSE;
	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
		if (zpool_prop_feature(nvpair_name(elem)))
			has_features = B_TRUE;
	}

	if (has_features || nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
		version = SPA_VERSION;
	}
	ASSERT(SPA_VERSION_IS_SUPPORTED(version));

	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_load_state = SPA_LOAD_CREATE;
	spa->spa_removing_phys.sr_state = DSS_NONE;
	spa->spa_removing_phys.sr_removing_vdev = -1;
	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
	    KM_SLEEP);
	for (int i = 0; i < max_ncpus; i++) {
		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);
	}

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = SET_ERROR(EINVAL);

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_metaslab_set_size(rvd->vdev_child[c]);
			vdev_expand(rvd->vdev_child[c], txg);
		}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_is_initializing = B_TRUE;
	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;
	spa->spa_is_initializing = B_FALSE;

	/*
	 * Create DDTs (dedup tables).
	 */
	ddt_create(spa);

	spa_update_dspace(spa);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	if (spa_version(spa) >= SPA_VERSION_FEATURES)
		spa_feature_create_zap_objects(spa, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	    sizeof (uint64_t), 1, &version, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bpobj.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, obj,
	    ZIO_COMPRESS_OFF, tx);
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
	    sizeof (uint64_t), 1, &obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bpobj");
	}
	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
	    spa->spa_meta_objset, obj));

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Generate some random noise for salted checksums to operate on.
	 */
	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
	    sizeof (spa->spa_cksum_salt.zcs_bytes));

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(props, tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_write_cachefile(spa, B_FALSE, B_TRUE);
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);

	spa_history_log_version(spa, "create");

	/*
	 * Don't count references from objsets that are already closed
	 * and are making their way through the eviction process.
	 */
	spa_evicting_os_wait(spa);
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	spa->spa_load_state = SPA_LOAD_NONE;

	mutex_exit(&spa_namespace_lock);

	return (0);
}
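/*
 * Illustrative sketch of pool creation (hypothetical caller; the nvlists
 * would normally be assembled from userland input):
 *
 *	nvlist_t *nvroot;	(vdev tree, e.g. a two-way mirror)
 *	nvlist_t *props;	(optional pool properties, may be NULL)
 *	nvlist_t *zplprops;	(properties for the root dataset)
 *	int error;
 *
 *	error = spa_create("tank", nvroot, props, zplprops);
 *
 * On success the pool exists, its first txg has synced, and the config
 * cache file has been written.
 */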
#ifdef _KERNEL
/*
 * Get the root pool information from the root disk, then import the root pool
 * during the system boot up time.
 */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}

/*
 * Walk the vdev tree and see if we can find a device with "better"
 * configuration.  A configuration is "better" if the label on that
 * device has a more recent txg.
 */
static void
spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

	if (vd->vdev_ops->vdev_op_leaf) {
		nvlist_t *label;
		uint64_t label_txg;

		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
		    &label) != 0)
			return;

		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &label_txg) == 0);

		/*
		 * Do we have a better boot device?
		 */
		if (label_txg > *txg) {
			*txg = label_txg;
			*avd = vd;
		}
		nvlist_free(label);
	}
}

/*
 * Import a root pool.
 *
 * For x86, devpath_list will consist of devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For SPARC, devpath_list consists of the physpath name of the booting
 * device, no matter whether the root pool is a single-device pool or a
 * mirrored pool, e.g.:
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
	if (config == NULL) {
		if (strstr(devpath, "/iscsi/ssd") != NULL) {
			/* iscsi boot */
			get_iscsi_bootpath_phy(devpath);
			config = spa_generate_rootconf(devpath, devid, &guid);
		}
	}
#endif
	if (config == NULL) {
		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
		    devpath);
		return (SET_ERROR(EIO));
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	spa = spa_add(pname, config, NULL);
	spa->spa_is_root = B_TRUE;
	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = SET_ERROR(ENOENT);
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * If the boot device is part of a spare vdev then ensure that
	 * we're booting off the active spare.
	 */
	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    !bvd->vdev_isspare) {
		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
		    "try booting from '%s'",
		    bvd->vdev_parent->
		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
		error = SET_ERROR(EINVAL);
		goto out;
	}

	error = 0;
out:
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_free(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	mutex_exit(&spa_namespace_lock);

	nvlist_free(config);
	return (error);
}

#endif
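/*
 * Boot-time flow in brief (as implemented above): read the label off the
 * boot device, wrap it into a one-vdev config with spa_generate_rootconf(),
 * then insert that config verbatim (ZFS_IMPORT_VERBATIM), after checking
 * that no other device carries a more recent label and that we are not
 * booting from an inactive spare.
 */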
Please " 4519 "try booting from '%s'", 4520 bvd->vdev_parent-> 4521 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 4522 error = SET_ERROR(EINVAL); 4523 goto out; 4524 } 4525 4526 error = 0; 4527 out: 4528 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4529 vdev_free(rvd); 4530 spa_config_exit(spa, SCL_ALL, FTAG); 4531 mutex_exit(&spa_namespace_lock); 4532 4533 nvlist_free(config); 4534 return (error); 4535 } 4536 4537 #endif 4538 4539 /* 4540 * Import a non-root pool into the system. 4541 */ 4542 int 4543 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4544 { 4545 spa_t *spa; 4546 char *altroot = NULL; 4547 spa_load_state_t state = SPA_LOAD_IMPORT; 4548 zpool_rewind_policy_t policy; 4549 uint64_t mode = spa_mode_global; 4550 uint64_t readonly = B_FALSE; 4551 int error; 4552 nvlist_t *nvroot; 4553 nvlist_t **spares, **l2cache; 4554 uint_t nspares, nl2cache; 4555 4556 /* 4557 * If a pool with this name exists, return failure. 4558 */ 4559 mutex_enter(&spa_namespace_lock); 4560 if (spa_lookup(pool) != NULL) { 4561 mutex_exit(&spa_namespace_lock); 4562 return (SET_ERROR(EEXIST)); 4563 } 4564 4565 /* 4566 * Create and initialize the spa structure. 4567 */ 4568 (void) nvlist_lookup_string(props, 4569 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4570 (void) nvlist_lookup_uint64(props, 4571 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4572 if (readonly) 4573 mode = FREAD; 4574 spa = spa_add(pool, config, altroot); 4575 spa->spa_import_flags = flags; 4576 4577 /* 4578 * Verbatim import - Take a pool and insert it into the namespace 4579 * as if it had been loaded at boot. 4580 */ 4581 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4582 if (props != NULL) 4583 spa_configfile_set(spa, props, B_FALSE); 4584 4585 spa_write_cachefile(spa, B_FALSE, B_TRUE); 4586 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 4587 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 4588 mutex_exit(&spa_namespace_lock); 4589 return (0); 4590 } 4591 4592 spa_activate(spa, mode); 4593 4594 /* 4595 * Don't start async tasks until we know everything is healthy. 4596 */ 4597 spa_async_suspend(spa); 4598 4599 zpool_get_rewind_policy(config, &policy); 4600 if (policy.zrp_request & ZPOOL_DO_REWIND) 4601 state = SPA_LOAD_RECOVER; 4602 4603 /* 4604 * Pass off the heavy lifting to spa_load(). Pass TRUE for trust_config 4605 * because the user-supplied config is actually the one to trust when 4606 * doing an import. 4607 */ 4608 if (state != SPA_LOAD_RECOVER) 4609 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4610 4611 zfs_dbgmsg("spa_import: importing %s%s", pool, 4612 (state == SPA_LOAD_RECOVER) ? " (RECOVERY MODE)" : ""); 4613 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4614 policy.zrp_request); 4615 4616 /* 4617 * Propagate anything learned while loading the pool and pass it 4618 * back to caller (i.e. rewind info, missing devices, etc). 4619 */ 4620 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4621 spa->spa_load_info) == 0); 4622 4623 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4624 /* 4625 * Toss any existing sparelist, as it doesn't have any validity 4626 * anymore, and conflicts with spa_has_spare(). 
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	zfs_dbgmsg("spa_tryimport: importing %s", poolname);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for trust_config because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);
		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}
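/*
 * Note for callers (summarizing the code above): spa_tryimport() never
 * leaves a pool imported; the temporary TRYIMPORT_NAME spa is unloaded
 * and removed before returning.  The returned config, if any, must be
 * freed by the caller with nvlist_free().
 */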
/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.  If the 'hardforce' flag is
 * set, then we don't sync the labels or remove the configuration cache.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (SET_ERROR(EROFS));

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENOENT));
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
		spa_evicting_os_wait(spa);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EBUSY));
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool.  The user may still forcibly export
		 * such a pool.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EXDEV));
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_write_cachefile(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}
/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
	    B_FALSE, B_FALSE));
}
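/*
 * The three entry points above differ only in the new_state they hand to
 * spa_export_common():
 *
 *	spa_destroy()	POOL_STATE_DESTROYED
 *	spa_export()	POOL_STATE_EXPORTED
 *	spa_reset()	POOL_STATE_UNINITIALIZED (unload only; the pool
 *			stays in the namespace)
 */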
/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * If we are in the middle of a device removal, we can only add
	 * devices which match the existing devices in the pool.
	 * If we are in the middle of a removal, or have some indirect
	 * vdevs, we cannot add raidz toplevels.
	 */
	if (spa->spa_vdev_removal != NULL ||
	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
		for (int c = 0; c < vd->vdev_children; c++) {
			tvd = vd->vdev_child[c];
			if (spa->spa_vdev_removal != NULL &&
			    tvd->vdev_ashift !=
			    spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
				return (spa_vdev_exit(spa, vd, txg, EINVAL));
			}
			/* Fail if top level vdev is raidz */
			if (tvd->vdev_ops == &vdev_raidz_ops) {
				return (spa_vdev_exit(spa, vd, txg, EINVAL));
			}
			/*
			 * Need the top level mirror to be
			 * a mirror of leaf vdevs only
			 */
			if (tvd->vdev_ops == &vdev_mirror_ops) {
				for (uint64_t cid = 0;
				    cid < tvd->vdev_children; cid++) {
					vdev_t *cvd = tvd->vdev_child[cid];
					if (!cvd->vdev_ops->vdev_op_leaf) {
						return (spa_vdev_exit(spa, vd,
						    txg, EINVAL));
					}
				}
			}
		}
	}

	for (int c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists.
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
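/*
 * Illustrative sketch (hypothetical caller): to grow a pool, build an
 * nvroot describing only the new top-level vdevs and call:
 *
 *	error = spa_vdev_add(spa, nvroot);
 *
 * The nvroot may also, or instead, carry ZPOOL_CONFIG_SPARES and
 * ZPOOL_CONFIG_L2CACHE arrays, which are merged in as shown above.
 */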
/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
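/*
 * For example (illustrative only, using the M()/R() notation from the
 * spa_vdev_detach() comment below): attaching device B to a plain disk A
 * yields the mirror M(A,B); the same call with 'replacing' set yields
 * R(A,B), and A is detached automatically once B finishes resilvering.
 */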
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, dtl_max_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_vdev_removal != NULL ||
	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
	}

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ATTACH)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the
		 * root vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    oldvd->vdev_isspare &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise,
		 * we want to create a replacing vdev.  The user is not allowed
		 * to attach to a spared vdev child unless the 'isspare' state
		 * is the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops &&
		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		} else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		}

		if (newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Make sure the new device is big enough.
	 */
	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/* mark the device being resilvered */
	newvd->vdev_resilver_txg = txg;
	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	newvd->vdev_crtxg = oldvd->vdev_crtxg;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
	 * for any dmu_sync-ed blocks.  It will propagate upward when
	 * spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	dtl_max_txg = txg + TXG_CONCURRENT_STATES;

	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	    dtl_max_txg - TXG_INITIAL);

	if (newvd->vdev_isspare) {
		spa_spare_activate(newvd);
		spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
	}

	oldvdpath = spa_strdup(oldvd->vdev_path);
	newvdpath = spa_strdup(newvd->vdev_path);
	newvd_isspare = newvd->vdev_isspare;

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	/*
	 * Schedule the resilver to restart in the future.  We do this to
	 * ensure that dmu_sync-ed blocks have been stitched into the
	 * respective datasets.
	 */
	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

	if (spa->spa_bootfs)
		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);

	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);

	/*
	 * Commit the config
	 */
	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

	spa_history_log_internal(spa, "vdev attach", NULL,
	    "%s vdev=%s %s vdev=%s",
	    replacing && newvd_isspare ? "spare in" :
	    replacing ? "replace" : "attach", newvdpath,
	    replacing ? "for" : "to", oldvdpath);

	spa_strfree(oldvdpath);
	spa_strfree(newvdpath);

	return (0);
}
/*
 * Detach a device from a mirror or replacing vdev.
 *
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
	uint64_t txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid = 0;
	char *vdpath;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If the parent/child relationship is not as expected, don't do it.
	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	 * vdev that's replacing B with C.  The user's intent in replacing
	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
	 * the replace by detaching C, the expected behavior is to end up
	 * M(A,B).  But suppose that right after deciding to detach C,
	 * the replacement of B completes.  We would have M(A,C), and then
	 * ask to detach C, which would leave us with just A -- not what
	 * the user wanted.  To prevent this, we make sure that the
	 * parent/child relationship hasn't changed -- in this example,
	 * that C's parent is still the replacing vdev R.
	 */
	if (pvd->vdev_guid != pguid && pguid != 0)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Only 'replacing' or 'spare' vdevs can be replaced.
	 */
	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If this device has the only valid copy of some data,
	 * we cannot safely detach it.
	 */
	if (vdev_dtl_required(vd))
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	 * If we are detaching the second disk from a replacing vdev, then
	 * check to see if we changed the original vdev's path to have "/old"
	 * at the end in spa_vdev_attach().  If so, undo that change now.
	 */
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
	    vd->vdev_path != NULL) {
		size_t len = strlen(vd->vdev_path);

		for (int c = 0; c < pvd->vdev_children; c++) {
			cvd = pvd->vdev_child[c];

			if (cvd == vd || cvd->vdev_path == NULL)
				continue;

			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
			    strcmp(cvd->vdev_path + len, "/old") == 0) {
				spa_strfree(cvd->vdev_path);
				cvd->vdev_path = spa_strdup(vd->vdev_path);
				break;
			}
		}
	}

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0 &&
	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[pvd->vdev_children - 1];

	/*
	 * If we need to remove the remaining child from the list of hot
	 * spares, do it now, marking the vdev as no longer a spare in the
	 * process.  We must do this before vdev_remove_parent(), because that
	 * can change the GUID if it creates a new toplevel GUID.  For a
	 * similar reason, we must remove the spare now, in the same txg as
	 * the detach; otherwise someone could attach a new sibling, change
	 * the GUID, and the subsequent attempt to
	 * spa_vdev_remove(unspare_guid) would fail.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		cvd->vdev_unspare = B_TRUE;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1) {
		if (pvd->vdev_ops == &vdev_spare_ops)
			cvd->vdev_unspare = B_FALSE;
		vdev_remove_parent(cvd);
	}

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the 'autoexpand' property is set on the pool then automatically
	 * try to expand the size of the pool.  For example if the device we
	 * just detached was smaller than the others, it may be possible to
	 * add metaslabs (i.e. grow the pool).  We need to reopen the vdev
	 * first so that we can obtain the updated sizes of the leaf vdevs.
	 */
	if (spa->spa_autoexpand) {
		vdev_reopen(tvd);
		vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing
	 * context.  But first make sure we're not on any *other* txg's DTL
	 * list, to prevent vd from being accessed after it's freed.
	 */
	vdpath = spa_strdup(vd->vdev_path);
	for (int t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);

	/* hang on to the spa before we release the lock */
	spa_open_ref(spa, FTAG);

	error = spa_vdev_exit(spa, vd, txg, 0);

	spa_history_log_internal(spa, "detach", NULL,
	    "vdev=%s", vdpath);
	spa_strfree(vdpath);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa_t *altspa = NULL;

		mutex_enter(&spa_namespace_lock);
		while ((altspa = spa_next(altspa)) != NULL) {
			if (altspa->spa_state != POOL_STATE_ACTIVE ||
			    altspa == spa)
				continue;

			spa_open_ref(altspa, FTAG);
			mutex_exit(&spa_namespace_lock);
			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
			mutex_enter(&spa_namespace_lock);
			spa_close(altspa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);

		/* search the rest of the vdevs for spares to remove */
		spa_vdev_resilver_done(spa);
	}

	/* all done with the spa; OK to release */
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);
	mutex_exit(&spa_namespace_lock);

	return (error);
}
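/*
 * Detach usage note (summarizing the checks above): callers normally pass
 * the parent guid they last observed so that the race described in the
 * M(A,R(B,C)) comment is detected:
 *
 *	error = spa_vdev_detach(spa, guid, pguid, B_FALSE);
 *
 * Passing pguid == 0 skips the parent check, and a nonzero replace_done
 * restricts the detach to replacing/spare parents.
 */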
/*
 * Split a set of devices from their mirrors, and create a new pool from them.
 */
int
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
    nvlist_t *props, boolean_t exp)
{
	int error = 0;
	uint64_t txg, *glist;
	spa_t *newspa;
	uint_t c, children, lastlog;
	nvlist_t **child, *nvl, *tmp;
	dmu_tx_t *tx;
	char *altroot = NULL;
	vdev_t *rvd, **vml = NULL;	/* vdev modify list */
	boolean_t activate_slog;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	/* clear the log and flush everything up to now */
	activate_slog = spa_passivate_log(spa);
	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
	error = spa_reset_logs(spa);
	txg = spa_vdev_config_enter(spa);

	if (activate_slog)
		spa_activate_log(spa);

	if (error != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	/* check new spa name before going any further */
	if (spa_lookup(newname) != NULL)
		return (spa_vdev_exit(spa, NULL, txg, EEXIST));

	/*
	 * scan through all the children to ensure they're all mirrors
	 */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* first, check to ensure we've got the right child count */
	rvd = spa->spa_root_vdev;
	lastlog = 0;
	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		/* don't count the holes & logs as children */
		if (vd->vdev_islog || !vdev_is_concrete(vd)) {
			if (lastlog == 0)
				lastlog = c;
			continue;
		}

		lastlog = 0;
	}
	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* next, ensure no spare or cache devices are part of the split */
	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

	/* then, loop over each vdev and validate it */
	for (c = 0; c < children; c++) {
		uint64_t is_hole = 0;

		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
		    &is_hole);

		if (is_hole != 0) {
			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
				continue;
			} else {
				error = SET_ERROR(EINVAL);
				break;
			}
		}

		/* which disk is going to be split? */
		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
		    &glist[c]) != 0) {
			error = SET_ERROR(EINVAL);
			break;
		}

		/* look it up in the spa */
		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
		if (vml[c] == NULL) {
			error = SET_ERROR(ENODEV);
			break;
		}

		/* make sure there's nothing stopping the split */
		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
		    vml[c]->vdev_islog ||
		    !vdev_is_concrete(vml[c]) ||
		    vml[c]->vdev_isspare ||
		    vml[c]->vdev_isl2cache ||
		    !vdev_writeable(vml[c]) ||
		    vml[c]->vdev_children != 0 ||
		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
			error = SET_ERROR(EINVAL);
			break;
		}

		if (vdev_dtl_required(vml[c])) {
			error = SET_ERROR(EBUSY);
			break;
		}

		/* we need certain info from the top level */
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
		    vml[c]->vdev_top->vdev_ms_array) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
		    vml[c]->vdev_top->vdev_ms_shift) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
		    vml[c]->vdev_top->vdev_asize) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
		    vml[c]->vdev_top->vdev_ashift) == 0);

		/* transfer per-vdev ZAPs */
		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
		VERIFY0(nvlist_add_uint64(child[c],
		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));

		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
		VERIFY0(nvlist_add_uint64(child[c],
		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
		    vml[c]->vdev_parent->vdev_top_zap));
	}

	if (error != 0) {
		kmem_free(vml, children * sizeof (vdev_t *));
		kmem_free(glist, children * sizeof (uint64_t));
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	/* stop writers from using the disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_TRUE;
	}
	vdev_reopen(spa->spa_root_vdev);
	/*
	 * Temporarily record the splitting vdevs in the spa config.  This
	 * will disappear once the config is regenerated.
	 */
	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    glist, children) == 0);
	kmem_free(glist, children * sizeof (uint64_t));

	mutex_enter(&spa->spa_props_lock);
	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
	    nvl) == 0);
	mutex_exit(&spa->spa_props_lock);
	spa->spa_config_splitting = nvl;
	vdev_config_dirty(spa->spa_root_vdev);

	/* configure and create the new pool */
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
	    spa_version(spa)) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    spa->spa_config_txg) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    spa_generate_guid(NULL)) == 0);
	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

	/* add the new pool to the namespace */
	newspa = spa_add(newname, config, altroot);
	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
	newspa->spa_config_txg = spa->spa_config_txg;
	spa_set_log_state(newspa, SPA_LOG_CLEAR);

	/* release the spa config lock, retaining the namespace lock */
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 1);

	spa_activate(newspa, spa_mode_global);
	spa_async_suspend(newspa);

	/* create the new pool from the disks of the original pool */
	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
	if (error)
		goto out;

	/* if that worked, generate a real config for the new pool */
	if (newspa->spa_root_vdev != NULL) {
		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
		    B_TRUE));
	}

	/* set the props */
	if (props != NULL) {
		spa_configfile_set(newspa, props, B_FALSE);
		error = spa_prop_set(newspa, props);
		if (error)
			goto out;
	}

	/* flush everything */
	txg = spa_vdev_config_enter(newspa);
	vdev_config_dirty(newspa->spa_root_vdev);
	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 2);
5820 zio_handle_panic_injection(spa, FTAG, 3); 5821 5822 /* split is complete; log a history record */ 5823 spa_history_log_internal(newspa, "split", NULL, 5824 "from pool %s", spa_name(spa)); 5825 5826 kmem_free(vml, children * sizeof (vdev_t *)); 5827 5828 /* if we're not going to mount the filesystems in userland, export */ 5829 if (exp) 5830 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5831 B_FALSE, B_FALSE); 5832 5833 return (error); 5834 5835 out: 5836 spa_unload(newspa); 5837 spa_deactivate(newspa); 5838 spa_remove(newspa); 5839 5840 txg = spa_vdev_config_enter(spa); 5841 5842 /* re-online all offlined disks */ 5843 for (c = 0; c < children; c++) { 5844 if (vml[c] != NULL) 5845 vml[c]->vdev_offline = B_FALSE; 5846 } 5847 vdev_reopen(spa->spa_root_vdev); 5848 5849 nvlist_free(spa->spa_config_splitting); 5850 spa->spa_config_splitting = NULL; 5851 (void) spa_vdev_exit(spa, NULL, txg, error); 5852 5853 kmem_free(vml, children * sizeof (vdev_t *)); 5854 return (error); 5855 } 5856 5857 /* 5858 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5859 * currently spared, so we can detach it. 5860 */ 5861 static vdev_t * 5862 spa_vdev_resilver_done_hunt(vdev_t *vd) 5863 { 5864 vdev_t *newvd, *oldvd; 5865 5866 for (int c = 0; c < vd->vdev_children; c++) { 5867 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5868 if (oldvd != NULL) 5869 return (oldvd); 5870 } 5871 5872 /* 5873 * Check for a completed replacement. We always consider the first 5874 * vdev in the list to be the oldest vdev, and the last one to be 5875 * the newest (see spa_vdev_attach() for how that works). In 5876 * the case where the newest vdev is faulted, we will not automatically 5877 * remove it after a resilver completes. This is OK as it will require 5878 * user intervention to determine which disk the admin wishes to keep. 5879 */ 5880 if (vd->vdev_ops == &vdev_replacing_ops) { 5881 ASSERT(vd->vdev_children > 1); 5882 5883 newvd = vd->vdev_child[vd->vdev_children - 1]; 5884 oldvd = vd->vdev_child[0]; 5885 5886 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5887 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5888 !vdev_dtl_required(oldvd)) 5889 return (oldvd); 5890 } 5891 5892 /* 5893 * Check for a completed resilver with the 'unspare' flag set. 5894 */ 5895 if (vd->vdev_ops == &vdev_spare_ops) { 5896 vdev_t *first = vd->vdev_child[0]; 5897 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5898 5899 if (last->vdev_unspare) { 5900 oldvd = first; 5901 newvd = last; 5902 } else if (first->vdev_unspare) { 5903 oldvd = last; 5904 newvd = first; 5905 } else { 5906 oldvd = NULL; 5907 } 5908 5909 if (oldvd != NULL && 5910 vdev_dtl_empty(newvd, DTL_MISSING) && 5911 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5912 !vdev_dtl_required(oldvd)) 5913 return (oldvd); 5914 5915 /* 5916 * If there are more than two spares attached to a disk, 5917 * and those spares are not required, then we want to 5918 * attempt to free them up now so that they can be used 5919 * by other pools. Once we're back down to a single 5920 * disk+spare, we stop removing them. 
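 * (This is why the check below applies only when vdev_children > 2.)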
5921 */ 5922 if (vd->vdev_children > 2) { 5923 newvd = vd->vdev_child[1]; 5924 5925 if (newvd->vdev_isspare && last->vdev_isspare && 5926 vdev_dtl_empty(last, DTL_MISSING) && 5927 vdev_dtl_empty(last, DTL_OUTAGE) && 5928 !vdev_dtl_required(newvd)) 5929 return (newvd); 5930 } 5931 } 5932 5933 return (NULL); 5934 } 5935 5936 static void 5937 spa_vdev_resilver_done(spa_t *spa) 5938 { 5939 vdev_t *vd, *pvd, *ppvd; 5940 uint64_t guid, sguid, pguid, ppguid; 5941 5942 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5943 5944 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5945 pvd = vd->vdev_parent; 5946 ppvd = pvd->vdev_parent; 5947 guid = vd->vdev_guid; 5948 pguid = pvd->vdev_guid; 5949 ppguid = ppvd->vdev_guid; 5950 sguid = 0; 5951 /* 5952 * If we have just finished replacing a hot spared device, then 5953 * we need to detach the parent's first child (the original hot 5954 * spare) as well. 5955 */ 5956 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5957 ppvd->vdev_children == 2) { 5958 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5959 sguid = ppvd->vdev_child[1]->vdev_guid; 5960 } 5961 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5962 5963 spa_config_exit(spa, SCL_ALL, FTAG); 5964 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5965 return; 5966 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5967 return; 5968 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5969 } 5970 5971 spa_config_exit(spa, SCL_ALL, FTAG); 5972 } 5973 5974 /* 5975 * Update the stored path or FRU for this vdev. 5976 */ 5977 int 5978 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5979 boolean_t ispath) 5980 { 5981 vdev_t *vd; 5982 boolean_t sync = B_FALSE; 5983 5984 ASSERT(spa_writeable(spa)); 5985 5986 spa_vdev_state_enter(spa, SCL_ALL); 5987 5988 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5989 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5990 5991 if (!vd->vdev_ops->vdev_op_leaf) 5992 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5993 5994 if (ispath) { 5995 if (strcmp(value, vd->vdev_path) != 0) { 5996 spa_strfree(vd->vdev_path); 5997 vd->vdev_path = spa_strdup(value); 5998 sync = B_TRUE; 5999 } 6000 } else { 6001 if (vd->vdev_fru == NULL) { 6002 vd->vdev_fru = spa_strdup(value); 6003 sync = B_TRUE; 6004 } else if (strcmp(value, vd->vdev_fru) != 0) { 6005 spa_strfree(vd->vdev_fru); 6006 vd->vdev_fru = spa_strdup(value); 6007 sync = B_TRUE; 6008 } 6009 } 6010 6011 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 6012 } 6013 6014 int 6015 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 6016 { 6017 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 6018 } 6019 6020 int 6021 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 6022 { 6023 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 6024 } 6025 6026 /* 6027 * ========================================================================== 6028 * SPA Scanning 6029 * ========================================================================== 6030 */ 6031 int 6032 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 6033 { 6034 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6035 6036 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6037 return (SET_ERROR(EBUSY)); 6038 6039 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 6040 } 6041 6042 int 6043 spa_scan_stop(spa_t *spa) 6044 { 6045 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6046 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6047 return (SET_ERROR(EBUSY)); 6048 return (dsl_scan_cancel(spa->spa_dsl_pool)); 6049 } 6050 6051 int 6052 spa_scan(spa_t *spa, pool_scan_func_t func) 6053 { 6054 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6055 6056 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 6057 return (SET_ERROR(ENOTSUP)); 6058 6059 /* 6060 * If a resilver was requested, but there is no DTL on a 6061 * writeable leaf device, we have nothing to do. 6062 */ 6063 if (func == POOL_SCAN_RESILVER && 6064 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 6065 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 6066 return (0); 6067 } 6068 6069 return (dsl_scan(spa->spa_dsl_pool, func)); 6070 } 6071 6072 /* 6073 * ========================================================================== 6074 * SPA async task processing 6075 * ========================================================================== 6076 */ 6077 6078 static void 6079 spa_async_remove(spa_t *spa, vdev_t *vd) 6080 { 6081 if (vd->vdev_remove_wanted) { 6082 vd->vdev_remove_wanted = B_FALSE; 6083 vd->vdev_delayed_close = B_FALSE; 6084 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 6085 6086 /* 6087 * We want to clear the stats, but we don't want to do a full 6088 * vdev_clear() as that will cause us to throw away 6089 * degraded/faulted state as well as attempt to reopen the 6090 * device, all of which is a waste. 
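 * Instead we just clear the error counts by hand.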
6091 */ 6092 vd->vdev_stat.vs_read_errors = 0; 6093 vd->vdev_stat.vs_write_errors = 0; 6094 vd->vdev_stat.vs_checksum_errors = 0; 6095 6096 vdev_state_dirty(vd->vdev_top); 6097 } 6098 6099 for (int c = 0; c < vd->vdev_children; c++) 6100 spa_async_remove(spa, vd->vdev_child[c]); 6101 } 6102 6103 static void 6104 spa_async_probe(spa_t *spa, vdev_t *vd) 6105 { 6106 if (vd->vdev_probe_wanted) { 6107 vd->vdev_probe_wanted = B_FALSE; 6108 vdev_reopen(vd); /* vdev_open() does the actual probe */ 6109 } 6110 6111 for (int c = 0; c < vd->vdev_children; c++) 6112 spa_async_probe(spa, vd->vdev_child[c]); 6113 } 6114 6115 static void 6116 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 6117 { 6118 sysevent_id_t eid; 6119 nvlist_t *attr; 6120 char *physpath; 6121 6122 if (!spa->spa_autoexpand) 6123 return; 6124 6125 for (int c = 0; c < vd->vdev_children; c++) { 6126 vdev_t *cvd = vd->vdev_child[c]; 6127 spa_async_autoexpand(spa, cvd); 6128 } 6129 6130 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 6131 return; 6132 6133 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 6134 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 6135 6136 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6137 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 6138 6139 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 6140 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 6141 6142 nvlist_free(attr); 6143 kmem_free(physpath, MAXPATHLEN); 6144 } 6145 6146 static void 6147 spa_async_thread(void *arg) 6148 { 6149 spa_t *spa = (spa_t *)arg; 6150 int tasks; 6151 6152 ASSERT(spa->spa_sync_on); 6153 6154 mutex_enter(&spa->spa_async_lock); 6155 tasks = spa->spa_async_tasks; 6156 spa->spa_async_tasks = 0; 6157 mutex_exit(&spa->spa_async_lock); 6158 6159 /* 6160 * See if the config needs to be updated. 6161 */ 6162 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 6163 uint64_t old_space, new_space; 6164 6165 mutex_enter(&spa_namespace_lock); 6166 old_space = metaslab_class_get_space(spa_normal_class(spa)); 6167 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6168 new_space = metaslab_class_get_space(spa_normal_class(spa)); 6169 mutex_exit(&spa_namespace_lock); 6170 6171 /* 6172 * If the pool grew as a result of the config update, 6173 * then log an internal history event. 6174 */ 6175 if (new_space != old_space) { 6176 spa_history_log_internal(spa, "vdev online", NULL, 6177 "pool '%s' size: %llu(+%llu)", 6178 spa_name(spa), new_space, new_space - old_space); 6179 } 6180 } 6181 6182 /* 6183 * See if any devices need to be marked REMOVED. 6184 */ 6185 if (tasks & SPA_ASYNC_REMOVE) { 6186 spa_vdev_state_enter(spa, SCL_NONE); 6187 spa_async_remove(spa, spa->spa_root_vdev); 6188 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6189 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6190 for (int i = 0; i < spa->spa_spares.sav_count; i++) 6191 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6192 (void) spa_vdev_state_exit(spa, NULL, 0); 6193 } 6194 6195 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 6196 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6197 spa_async_autoexpand(spa, spa->spa_root_vdev); 6198 spa_config_exit(spa, SCL_CONFIG, FTAG); 6199 } 6200 6201 /* 6202 * See if any devices need to be probed. 6203 */ 6204 if (tasks & SPA_ASYNC_PROBE) { 6205 spa_vdev_state_enter(spa, SCL_NONE); 6206 spa_async_probe(spa, spa->spa_root_vdev); 6207 (void) spa_vdev_state_exit(spa, NULL, 0); 6208 } 6209 6210 /* 6211 * If any devices are done replacing, detach them. 
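 * (spa_vdev_resilver_done(), above, walks the vdev tree and performs
 * the actual detach.)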
6212 */ 6213 if (tasks & SPA_ASYNC_RESILVER_DONE) 6214 spa_vdev_resilver_done(spa); 6215 6216 /* 6217 * Kick off a resilver. 6218 */ 6219 if (tasks & SPA_ASYNC_RESILVER) 6220 dsl_resilver_restart(spa->spa_dsl_pool, 0); 6221 6222 /* 6223 * Let the world know that we're done. 6224 */ 6225 mutex_enter(&spa->spa_async_lock); 6226 spa->spa_async_thread = NULL; 6227 cv_broadcast(&spa->spa_async_cv); 6228 mutex_exit(&spa->spa_async_lock); 6229 thread_exit(); 6230 } 6231 6232 void 6233 spa_async_suspend(spa_t *spa) 6234 { 6235 mutex_enter(&spa->spa_async_lock); 6236 spa->spa_async_suspended++; 6237 while (spa->spa_async_thread != NULL || 6238 spa->spa_condense_thread != NULL) 6239 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 6240 mutex_exit(&spa->spa_async_lock); 6241 6242 spa_vdev_remove_suspend(spa); 6243 } 6244 6245 void 6246 spa_async_resume(spa_t *spa) 6247 { 6248 mutex_enter(&spa->spa_async_lock); 6249 ASSERT(spa->spa_async_suspended != 0); 6250 spa->spa_async_suspended--; 6251 mutex_exit(&spa->spa_async_lock); 6252 spa_restart_removal(spa); 6253 } 6254 6255 static boolean_t 6256 spa_async_tasks_pending(spa_t *spa) 6257 { 6258 uint_t non_config_tasks; 6259 uint_t config_task; 6260 boolean_t config_task_suspended; 6261 6262 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 6263 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6264 if (spa->spa_ccw_fail_time == 0) { 6265 config_task_suspended = B_FALSE; 6266 } else { 6267 config_task_suspended = 6268 (gethrtime() - spa->spa_ccw_fail_time) < 6269 (zfs_ccw_retry_interval * NANOSEC); 6270 } 6271 6272 return (non_config_tasks || (config_task && !config_task_suspended)); 6273 } 6274 6275 static void 6276 spa_async_dispatch(spa_t *spa) 6277 { 6278 mutex_enter(&spa->spa_async_lock); 6279 if (spa_async_tasks_pending(spa) && 6280 !spa->spa_async_suspended && 6281 spa->spa_async_thread == NULL && 6282 rootdir != NULL) 6283 spa->spa_async_thread = thread_create(NULL, 0, 6284 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6285 mutex_exit(&spa->spa_async_lock); 6286 } 6287 6288 void 6289 spa_async_request(spa_t *spa, int task) 6290 { 6291 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6292 mutex_enter(&spa->spa_async_lock); 6293 spa->spa_async_tasks |= task; 6294 mutex_exit(&spa->spa_async_lock); 6295 } 6296 6297 /* 6298 * ========================================================================== 6299 * SPA syncing routines 6300 * ========================================================================== 6301 */ 6302 6303 static int 6304 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6305 { 6306 bpobj_t *bpo = arg; 6307 bpobj_enqueue(bpo, bp, tx); 6308 return (0); 6309 } 6310 6311 static int 6312 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6313 { 6314 zio_t *zio = arg; 6315 6316 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6317 zio->io_flags)); 6318 return (0); 6319 } 6320 6321 /* 6322 * Note: this simple function is not inlined to make it easier to dtrace the 6323 * amount of time spent syncing frees. 6324 */ 6325 static void 6326 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6327 { 6328 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6329 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6330 VERIFY(zio_wait(zio) == 0); 6331 } 6332 6333 /* 6334 * Note: this simple function is not inlined to make it easier to dtrace the 6335 * amount of time spent syncing deferred frees. 
6336 */ 6337 static void 6338 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6339 { 6340 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6341 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6342 spa_free_sync_cb, zio, tx), ==, 0); 6343 VERIFY0(zio_wait(zio)); 6344 } 6345 6346 6347 static void 6348 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6349 { 6350 char *packed = NULL; 6351 size_t bufsize; 6352 size_t nvsize = 0; 6353 dmu_buf_t *db; 6354 6355 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6356 6357 /* 6358 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6359 * information. This avoids the dmu_buf_will_dirty() path and 6360 * saves us a pre-read to get data we don't actually care about. 6361 */ 6362 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6363 packed = kmem_alloc(bufsize, KM_SLEEP); 6364 6365 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6366 KM_SLEEP) == 0); 6367 bzero(packed + nvsize, bufsize - nvsize); 6368 6369 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6370 6371 kmem_free(packed, bufsize); 6372 6373 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6374 dmu_buf_will_dirty(db, tx); 6375 *(uint64_t *)db->db_data = nvsize; 6376 dmu_buf_rele(db, FTAG); 6377 } 6378 6379 static void 6380 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6381 const char *config, const char *entry) 6382 { 6383 nvlist_t *nvroot; 6384 nvlist_t **list; 6385 int i; 6386 6387 if (!sav->sav_sync) 6388 return; 6389 6390 /* 6391 * Update the MOS nvlist describing the list of available devices. 6392 * spa_validate_aux() will have already made sure this nvlist is 6393 * valid and the vdevs are labeled appropriately. 6394 */ 6395 if (sav->sav_object == 0) { 6396 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6397 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6398 sizeof (uint64_t), tx); 6399 VERIFY(zap_update(spa->spa_meta_objset, 6400 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6401 &sav->sav_object, tx) == 0); 6402 } 6403 6404 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6405 if (sav->sav_count == 0) { 6406 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6407 } else { 6408 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6409 for (i = 0; i < sav->sav_count; i++) 6410 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6411 B_FALSE, VDEV_CONFIG_L2CACHE); 6412 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6413 sav->sav_count) == 0); 6414 for (i = 0; i < sav->sav_count; i++) 6415 nvlist_free(list[i]); 6416 kmem_free(list, sav->sav_count * sizeof (void *)); 6417 } 6418 6419 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6420 nvlist_free(nvroot); 6421 6422 sav->sav_sync = B_FALSE; 6423 } 6424 6425 /* 6426 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 6427 * The all-vdev ZAP must be empty. 
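 * (spa_sync_config_object() satisfies this by always passing a freshly
 * created ZAP.)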
6428 */ 6429 static void 6430 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 6431 { 6432 spa_t *spa = vd->vdev_spa; 6433 if (vd->vdev_top_zap != 0) { 6434 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6435 vd->vdev_top_zap, tx)); 6436 } 6437 if (vd->vdev_leaf_zap != 0) { 6438 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6439 vd->vdev_leaf_zap, tx)); 6440 } 6441 for (uint64_t i = 0; i < vd->vdev_children; i++) { 6442 spa_avz_build(vd->vdev_child[i], avz, tx); 6443 } 6444 } 6445 6446 static void 6447 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6448 { 6449 nvlist_t *config; 6450 6451 /* 6452 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 6453 * its config may not be dirty but we still need to build per-vdev ZAPs. 6454 * Similarly, if the pool is being assembled (e.g. after a split), we 6455 * need to rebuild the AVZ although the config may not be dirty. 6456 */ 6457 if (list_is_empty(&spa->spa_config_dirty_list) && 6458 spa->spa_avz_action == AVZ_ACTION_NONE) 6459 return; 6460 6461 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6462 6463 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 6464 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 6465 spa->spa_all_vdev_zaps != 0); 6466 6467 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 6468 /* Make and build the new AVZ */ 6469 uint64_t new_avz = zap_create(spa->spa_meta_objset, 6470 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 6471 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 6472 6473 /* Diff old AVZ with new one */ 6474 zap_cursor_t zc; 6475 zap_attribute_t za; 6476 6477 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6478 spa->spa_all_vdev_zaps); 6479 zap_cursor_retrieve(&zc, &za) == 0; 6480 zap_cursor_advance(&zc)) { 6481 uint64_t vdzap = za.za_first_integer; 6482 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 6483 vdzap) == ENOENT) { 6484 /* 6485 * ZAP is listed in old AVZ but not in new one; 6486 * destroy it 6487 */ 6488 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 6489 tx)); 6490 } 6491 } 6492 6493 zap_cursor_fini(&zc); 6494 6495 /* Destroy the old AVZ */ 6496 VERIFY0(zap_destroy(spa->spa_meta_objset, 6497 spa->spa_all_vdev_zaps, tx)); 6498 6499 /* Replace the old AVZ in the dir obj with the new one */ 6500 VERIFY0(zap_update(spa->spa_meta_objset, 6501 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 6502 sizeof (new_avz), 1, &new_avz, tx)); 6503 6504 spa->spa_all_vdev_zaps = new_avz; 6505 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 6506 zap_cursor_t zc; 6507 zap_attribute_t za; 6508 6509 /* Walk through the AVZ and destroy all listed ZAPs */ 6510 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6511 spa->spa_all_vdev_zaps); 6512 zap_cursor_retrieve(&zc, &za) == 0; 6513 zap_cursor_advance(&zc)) { 6514 uint64_t zap = za.za_first_integer; 6515 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 6516 } 6517 6518 zap_cursor_fini(&zc); 6519 6520 /* Destroy and unlink the AVZ itself */ 6521 VERIFY0(zap_destroy(spa->spa_meta_objset, 6522 spa->spa_all_vdev_zaps, tx)); 6523 VERIFY0(zap_remove(spa->spa_meta_objset, 6524 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 6525 spa->spa_all_vdev_zaps = 0; 6526 } 6527 6528 if (spa->spa_all_vdev_zaps == 0) { 6529 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 6530 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 6531 DMU_POOL_VDEV_ZAP_MAP, tx); 6532 } 6533 spa->spa_avz_action = AVZ_ACTION_NONE; 6534 6535 /* Create ZAPs for vdevs that don't have them. 
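 * (e.g. after importing a pool from a pre-per-vdev-ZAP version of ZFS)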
*/ 6536 vdev_construct_zaps(spa->spa_root_vdev, tx); 6537 6538 config = spa_config_generate(spa, spa->spa_root_vdev, 6539 dmu_tx_get_txg(tx), B_FALSE); 6540 6541 /* 6542 * If we're upgrading the spa version then make sure that 6543 * the config object gets updated with the correct version. 6544 */ 6545 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6546 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6547 spa->spa_uberblock.ub_version); 6548 6549 spa_config_exit(spa, SCL_STATE, FTAG); 6550 6551 nvlist_free(spa->spa_config_syncing); 6552 spa->spa_config_syncing = config; 6553 6554 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6555 } 6556 6557 static void 6558 spa_sync_version(void *arg, dmu_tx_t *tx) 6559 { 6560 uint64_t *versionp = arg; 6561 uint64_t version = *versionp; 6562 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6563 6564 /* 6565 * Setting the version is special cased when first creating the pool. 6566 */ 6567 ASSERT(tx->tx_txg != TXG_INITIAL); 6568 6569 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6570 ASSERT(version >= spa_version(spa)); 6571 6572 spa->spa_uberblock.ub_version = version; 6573 vdev_config_dirty(spa->spa_root_vdev); 6574 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6575 } 6576 6577 /* 6578 * Set zpool properties. 6579 */ 6580 static void 6581 spa_sync_props(void *arg, dmu_tx_t *tx) 6582 { 6583 nvlist_t *nvp = arg; 6584 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6585 objset_t *mos = spa->spa_meta_objset; 6586 nvpair_t *elem = NULL; 6587 6588 mutex_enter(&spa->spa_props_lock); 6589 6590 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6591 uint64_t intval; 6592 char *strval, *fname; 6593 zpool_prop_t prop; 6594 const char *propname; 6595 zprop_type_t proptype; 6596 spa_feature_t fid; 6597 6598 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6599 case ZPOOL_PROP_INVAL: 6600 /* 6601 * We checked this earlier in spa_prop_validate(). 6602 */ 6603 ASSERT(zpool_prop_feature(nvpair_name(elem))); 6604 6605 fname = strchr(nvpair_name(elem), '@') + 1; 6606 VERIFY0(zfeature_lookup_name(fname, &fid)); 6607 6608 spa_feature_enable(spa, fid, tx); 6609 spa_history_log_internal(spa, "set", tx, 6610 "%s=enabled", nvpair_name(elem)); 6611 break; 6612 6613 case ZPOOL_PROP_VERSION: 6614 intval = fnvpair_value_uint64(elem); 6615 /* 6616 * The version is synced separately before other 6617 * properties and should be correct by now. 6618 */ 6619 ASSERT3U(spa_version(spa), >=, intval); 6620 break; 6621 6622 case ZPOOL_PROP_ALTROOT: 6623 /* 6624 * 'altroot' is a non-persistent property. It should 6625 * have been set temporarily at creation or import time. 6626 */ 6627 ASSERT(spa->spa_root != NULL); 6628 break; 6629 6630 case ZPOOL_PROP_READONLY: 6631 case ZPOOL_PROP_CACHEFILE: 6632 /* 6633 * 'readonly' and 'cachefile' are also non-persistent 6634 * properties. 6635 */ 6636 break; 6637 case ZPOOL_PROP_COMMENT: 6638 strval = fnvpair_value_string(elem); 6639 if (spa->spa_comment != NULL) 6640 spa_strfree(spa->spa_comment); 6641 spa->spa_comment = spa_strdup(strval); 6642 /* 6643 * We need to dirty the configuration on all the vdevs 6644 * so that their labels get updated. It's unnecessary 6645 * to do this for pool creation since the vdev's 6646 * configuration has already been dirtied.
6647 */ 6648 if (tx->tx_txg != TXG_INITIAL) 6649 vdev_config_dirty(spa->spa_root_vdev); 6650 spa_history_log_internal(spa, "set", tx, 6651 "%s=%s", nvpair_name(elem), strval); 6652 break; 6653 default: 6654 /* 6655 * Set pool property values in the poolprops mos object. 6656 */ 6657 if (spa->spa_pool_props_object == 0) { 6658 spa->spa_pool_props_object = 6659 zap_create_link(mos, DMU_OT_POOL_PROPS, 6660 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6661 tx); 6662 } 6663 6664 /* normalize the property name */ 6665 propname = zpool_prop_to_name(prop); 6666 proptype = zpool_prop_get_type(prop); 6667 6668 if (nvpair_type(elem) == DATA_TYPE_STRING) { 6669 ASSERT(proptype == PROP_TYPE_STRING); 6670 strval = fnvpair_value_string(elem); 6671 VERIFY0(zap_update(mos, 6672 spa->spa_pool_props_object, propname, 6673 1, strlen(strval) + 1, strval, tx)); 6674 spa_history_log_internal(spa, "set", tx, 6675 "%s=%s", nvpair_name(elem), strval); 6676 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6677 intval = fnvpair_value_uint64(elem); 6678 6679 if (proptype == PROP_TYPE_INDEX) { 6680 const char *unused; 6681 VERIFY0(zpool_prop_index_to_string( 6682 prop, intval, &unused)); 6683 } 6684 VERIFY0(zap_update(mos, 6685 spa->spa_pool_props_object, propname, 6686 8, 1, &intval, tx)); 6687 spa_history_log_internal(spa, "set", tx, 6688 "%s=%lld", nvpair_name(elem), intval); 6689 } else { 6690 ASSERT(0); /* not allowed */ 6691 } 6692 6693 switch (prop) { 6694 case ZPOOL_PROP_DELEGATION: 6695 spa->spa_delegation = intval; 6696 break; 6697 case ZPOOL_PROP_BOOTFS: 6698 spa->spa_bootfs = intval; 6699 break; 6700 case ZPOOL_PROP_FAILUREMODE: 6701 spa->spa_failmode = intval; 6702 break; 6703 case ZPOOL_PROP_AUTOEXPAND: 6704 spa->spa_autoexpand = intval; 6705 if (tx->tx_txg != TXG_INITIAL) 6706 spa_async_request(spa, 6707 SPA_ASYNC_AUTOEXPAND); 6708 break; 6709 case ZPOOL_PROP_DEDUPDITTO: 6710 spa->spa_dedup_ditto = intval; 6711 break; 6712 default: 6713 break; 6714 } 6715 } 6716 6717 } 6718 6719 mutex_exit(&spa->spa_props_lock); 6720 } 6721 6722 /* 6723 * Perform one-time upgrade on-disk changes. spa_version() does not 6724 * reflect the new version this txg, so there must be no changes this 6725 * txg to anything that the upgrade code depends on after it executes. 6726 * Therefore this must be called after dsl_pool_sync() does the sync 6727 * tasks. 
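 * (For example, the spa_sync_version() sync task above advances
 * spa_uberblock.ub_version when the pool version property is set.)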
6728 */ 6729 static void 6730 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6731 { 6732 dsl_pool_t *dp = spa->spa_dsl_pool; 6733 6734 ASSERT(spa->spa_sync_pass == 1); 6735 6736 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6737 6738 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6739 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6740 dsl_pool_create_origin(dp, tx); 6741 6742 /* Keeping the origin open increases spa_minref */ 6743 spa->spa_minref += 3; 6744 } 6745 6746 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6747 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6748 dsl_pool_upgrade_clones(dp, tx); 6749 } 6750 6751 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6752 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6753 dsl_pool_upgrade_dir_clones(dp, tx); 6754 6755 /* Keeping the freedir open increases spa_minref */ 6756 spa->spa_minref += 3; 6757 } 6758 6759 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6760 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6761 spa_feature_create_zap_objects(spa, tx); 6762 } 6763 6764 /* 6765 * The LZ4_COMPRESS feature's behavior was changed to activate_on_enable 6766 * when the ability to use lz4 compression for metadata was added. 6767 * Old pools that have this feature enabled must be upgraded to have 6768 * this feature active. 6769 */ 6770 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6771 boolean_t lz4_en = spa_feature_is_enabled(spa, 6772 SPA_FEATURE_LZ4_COMPRESS); 6773 boolean_t lz4_ac = spa_feature_is_active(spa, 6774 SPA_FEATURE_LZ4_COMPRESS); 6775 6776 if (lz4_en && !lz4_ac) 6777 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 6778 } 6779 6780 /* 6781 * If we haven't written the salt, do so now. Note that the 6782 * feature may not be activated yet, but that's fine since 6783 * the presence of this ZAP entry is backwards compatible. 6784 */ 6785 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 6786 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 6787 VERIFY0(zap_add(spa->spa_meta_objset, 6788 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 6789 sizeof (spa->spa_cksum_salt.zcs_bytes), 6790 spa->spa_cksum_salt.zcs_bytes, tx)); 6791 } 6792 6793 rrw_exit(&dp->dp_config_rwlock, FTAG); 6794 } 6795 6796 static void 6797 vdev_indirect_state_sync_verify(vdev_t *vd) 6798 { 6799 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 6800 vdev_indirect_births_t *vib = vd->vdev_indirect_births; 6801 6802 if (vd->vdev_ops == &vdev_indirect_ops) { 6803 ASSERT(vim != NULL); 6804 ASSERT(vib != NULL); 6805 } 6806 6807 if (vdev_obsolete_sm_object(vd) != 0) { 6808 ASSERT(vd->vdev_obsolete_sm != NULL); 6809 ASSERT(vd->vdev_removing || 6810 vd->vdev_ops == &vdev_indirect_ops); 6811 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 6812 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 6813 6814 ASSERT3U(vdev_obsolete_sm_object(vd), ==, 6815 space_map_object(vd->vdev_obsolete_sm)); 6816 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 6817 space_map_allocated(vd->vdev_obsolete_sm)); 6818 } 6819 ASSERT(vd->vdev_obsolete_segments != NULL); 6820 6821 /* 6822 * Since frees / remaps to an indirect vdev can only 6823 * happen in syncing context, the obsolete segments 6824 * tree must be empty when we start syncing. 6825 */ 6826 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 6827 } 6828 6829 /* 6830 * Sync the specified transaction group. New blocks may be dirtied as 6831 * part of the process, so we iterate until it converges.
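 *
 * Each pass syncs the pool config, the spare and l2cache configs, the
 * error log, the DSL pool, frees (or, in later passes, defers them),
 * the DDT, the scan state, and any dirty vdevs. Pass 1 additionally
 * runs one-time upgrades and syncs the previously deferred frees.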
6832 */ 6833 void 6834 spa_sync(spa_t *spa, uint64_t txg) 6835 { 6836 dsl_pool_t *dp = spa->spa_dsl_pool; 6837 objset_t *mos = spa->spa_meta_objset; 6838 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6839 vdev_t *rvd = spa->spa_root_vdev; 6840 vdev_t *vd; 6841 dmu_tx_t *tx; 6842 int error; 6843 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 6844 zfs_vdev_queue_depth_pct / 100; 6845 6846 VERIFY(spa_writeable(spa)); 6847 6848 /* 6849 * Wait for i/os issued in open context that need to complete 6850 * before this txg syncs. 6851 */ 6852 VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); 6853 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); 6854 6855 /* 6856 * Lock out configuration changes. 6857 */ 6858 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6859 6860 spa->spa_syncing_txg = txg; 6861 spa->spa_sync_pass = 0; 6862 6863 mutex_enter(&spa->spa_alloc_lock); 6864 VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 6865 mutex_exit(&spa->spa_alloc_lock); 6866 6867 /* 6868 * If there are any pending vdev state changes, convert them 6869 * into config changes that go out with this transaction group. 6870 */ 6871 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6872 while (list_head(&spa->spa_state_dirty_list) != NULL) { 6873 /* 6874 * We need the write lock here because, for aux vdevs, 6875 * calling vdev_config_dirty() modifies sav_config. 6876 * This is ugly and will become unnecessary when we 6877 * eliminate the aux vdev wart by integrating all vdevs 6878 * into the root vdev tree. 6879 */ 6880 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6881 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6882 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6883 vdev_state_clean(vd); 6884 vdev_config_dirty(vd); 6885 } 6886 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6887 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6888 } 6889 spa_config_exit(spa, SCL_STATE, FTAG); 6890 6891 tx = dmu_tx_create_assigned(dp, txg); 6892 6893 spa->spa_sync_starttime = gethrtime(); 6894 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6895 spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6896 6897 /* 6898 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6899 * set spa_deflate if we have no raid-z vdevs. 6900 */ 6901 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6902 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6903 int i; 6904 6905 for (i = 0; i < rvd->vdev_children; i++) { 6906 vd = rvd->vdev_child[i]; 6907 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6908 break; 6909 } 6910 if (i == rvd->vdev_children) { 6911 spa->spa_deflate = TRUE; 6912 VERIFY(0 == zap_add(spa->spa_meta_objset, 6913 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6914 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6915 } 6916 } 6917 6918 /* 6919 * Set the top-level vdev's max queue depth. Evaluate each 6920 * top-level's async write queue depth in case it changed. 6921 * The max queue depth will not change in the middle of syncing 6922 * out this txg. 
6923 */ 6924 uint64_t queue_depth_total = 0; 6925 for (int c = 0; c < rvd->vdev_children; c++) { 6926 vdev_t *tvd = rvd->vdev_child[c]; 6927 metaslab_group_t *mg = tvd->vdev_mg; 6928 6929 if (mg == NULL || mg->mg_class != spa_normal_class(spa) || 6930 !metaslab_group_initialized(mg)) 6931 continue; 6932 6933 /* 6934 * It is safe to do a lock-free check here because only async 6935 * allocations look at mg_max_alloc_queue_depth, and async 6936 * allocations all happen from spa_sync(). 6937 */ 6938 ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); 6939 mg->mg_max_alloc_queue_depth = max_queue_depth; 6940 queue_depth_total += mg->mg_max_alloc_queue_depth; 6941 } 6942 metaslab_class_t *mc = spa_normal_class(spa); 6943 ASSERT0(refcount_count(&mc->mc_alloc_slots)); 6944 mc->mc_alloc_max_slots = queue_depth_total; 6945 mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 6946 6947 ASSERT3U(mc->mc_alloc_max_slots, <=, 6948 max_queue_depth * rvd->vdev_children); 6949 6950 for (int c = 0; c < rvd->vdev_children; c++) { 6951 vdev_t *vd = rvd->vdev_child[c]; 6952 vdev_indirect_state_sync_verify(vd); 6953 6954 if (vdev_indirect_should_condense(vd)) { 6955 spa_condense_indirect_start_sync(vd, tx); 6956 break; 6957 } 6958 } 6959 6960 /* 6961 * Iterate to convergence. 6962 */ 6963 do { 6964 int pass = ++spa->spa_sync_pass; 6965 6966 spa_sync_config_object(spa, tx); 6967 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6968 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6969 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6970 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6971 spa_errlog_sync(spa, txg); 6972 dsl_pool_sync(dp, txg); 6973 6974 if (pass < zfs_sync_pass_deferred_free) { 6975 spa_sync_frees(spa, free_bpl, tx); 6976 } else { 6977 /* 6978 * We can not defer frees in pass 1, because 6979 * we sync the deferred frees later in pass 1. 6980 */ 6981 ASSERT3U(pass, >, 1); 6982 bplist_iterate(free_bpl, bpobj_enqueue_cb, 6983 &spa->spa_deferred_bpobj, tx); 6984 } 6985 6986 ddt_sync(spa, txg); 6987 dsl_scan_sync(dp, tx); 6988 6989 if (spa->spa_vdev_removal != NULL) 6990 svr_sync(spa, tx); 6991 6992 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6993 != NULL) 6994 vdev_sync(vd, txg); 6995 6996 if (pass == 1) { 6997 spa_sync_upgrades(spa, tx); 6998 ASSERT3U(txg, >=, 6999 spa->spa_uberblock.ub_rootbp.blk_birth); 7000 /* 7001 * Note: We need to check if the MOS is dirty 7002 * because we could have marked the MOS dirty 7003 * without updating the uberblock (e.g. if we 7004 * have sync tasks but no dirty user data). We 7005 * need to check the uberblock's rootbp because 7006 * it is updated if we have synced out dirty 7007 * data (though in this case the MOS will most 7008 * likely also be dirty due to second order 7009 * effects, we don't want to rely on that here). 7010 */ 7011 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 7012 !dmu_objset_is_dirty(mos, txg)) { 7013 /* 7014 * Nothing changed on the first pass, 7015 * therefore this TXG is a no-op. Avoid 7016 * syncing deferred frees, so that we 7017 * can keep this TXG as a no-op. 
7018 */ 7019 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 7020 txg)); 7021 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7022 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 7023 break; 7024 } 7025 spa_sync_deferred_frees(spa, tx); 7026 } 7027 7028 } while (dmu_objset_is_dirty(mos, txg)); 7029 7030 if (!list_is_empty(&spa->spa_config_dirty_list)) { 7031 /* 7032 * Make sure that the number of ZAPs for all the vdevs matches 7033 * the number of ZAPs in the per-vdev ZAP list. This only gets 7034 * called if the config is dirty; otherwise there may be 7035 * outstanding AVZ operations that weren't completed in 7036 * spa_sync_config_object. 7037 */ 7038 uint64_t all_vdev_zap_entry_count; 7039 ASSERT0(zap_count(spa->spa_meta_objset, 7040 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 7041 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 7042 all_vdev_zap_entry_count); 7043 } 7044 7045 if (spa->spa_vdev_removal != NULL) { 7046 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 7047 } 7048 7049 /* 7050 * Rewrite the vdev configuration (which includes the uberblock) 7051 * to commit the transaction group. 7052 * 7053 * If there are no dirty vdevs, we sync the uberblock to a few 7054 * random top-level vdevs that are known to be visible in the 7055 * config cache (see spa_vdev_add() for a complete description). 7056 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 7057 */ 7058 for (;;) { 7059 /* 7060 * We hold SCL_STATE to prevent vdev open/close/etc. 7061 * while we're attempting to write the vdev labels. 7062 */ 7063 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7064 7065 if (list_is_empty(&spa->spa_config_dirty_list)) { 7066 vdev_t *svd[SPA_DVAS_PER_BP]; 7067 int svdcount = 0; 7068 int children = rvd->vdev_children; 7069 int c0 = spa_get_random(children); 7070 7071 for (int c = 0; c < children; c++) { 7072 vd = rvd->vdev_child[(c0 + c) % children]; 7073 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 7074 !vdev_is_concrete(vd)) 7075 continue; 7076 svd[svdcount++] = vd; 7077 if (svdcount == SPA_DVAS_PER_BP) 7078 break; 7079 } 7080 error = vdev_config_sync(svd, svdcount, txg); 7081 } else { 7082 error = vdev_config_sync(rvd->vdev_child, 7083 rvd->vdev_children, txg); 7084 } 7085 7086 if (error == 0) 7087 spa->spa_last_synced_guid = rvd->vdev_guid; 7088 7089 spa_config_exit(spa, SCL_STATE, FTAG); 7090 7091 if (error == 0) 7092 break; 7093 zio_suspend(spa, NULL); 7094 zio_resume_wait(spa); 7095 } 7096 dmu_tx_commit(tx); 7097 7098 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 7099 7100 /* 7101 * Clear the dirty config list. 7102 */ 7103 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 7104 vdev_config_clean(vd); 7105 7106 /* 7107 * Now that the new config has synced transactionally, 7108 * let it become visible to the config cache. 7109 */ 7110 if (spa->spa_config_syncing != NULL) { 7111 spa_config_set(spa, spa->spa_config_syncing); 7112 spa->spa_config_txg = txg; 7113 spa->spa_config_syncing = NULL; 7114 } 7115 7116 dsl_pool_sync_done(dp, txg); 7117 7118 mutex_enter(&spa->spa_alloc_lock); 7119 VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 7120 mutex_exit(&spa->spa_alloc_lock); 7121 7122 /* 7123 * Update usable space statistics. 7124 */ 7125 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, 7126 TXG_CLEAN(txg))) != NULL) 7127 vdev_sync_done(vd, txg); 7128 7129 spa_update_dspace(spa); 7130 7131 /* 7132 * It had better be the case that we didn't dirty anything 7133 * since vdev_config_sync().
7133 */ 7134 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 7135 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7136 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 7137 7138 spa->spa_sync_pass = 0; 7139 7140 /* 7141 * Update the last synced uberblock here. We want to do this at 7142 * the end of spa_sync() so that consumers of spa_last_synced_txg() 7143 * will be guaranteed that all the processing associated with 7144 * that txg has been completed. 7145 */ 7146 spa->spa_ubsync = spa->spa_uberblock; 7147 spa_config_exit(spa, SCL_CONFIG, FTAG); 7148 7149 spa_handle_ignored_writes(spa); 7150 7151 /* 7152 * If any async tasks have been requested, kick them off. 7153 */ 7154 spa_async_dispatch(spa); 7155 } 7156 7157 /* 7158 * Sync all pools. We don't want to hold the namespace lock across these 7159 * operations, so we take a reference on the spa_t and drop the lock during the 7160 * sync. 7161 */ 7162 void 7163 spa_sync_allpools(void) 7164 { 7165 spa_t *spa = NULL; 7166 mutex_enter(&spa_namespace_lock); 7167 while ((spa = spa_next(spa)) != NULL) { 7168 if (spa_state(spa) != POOL_STATE_ACTIVE || 7169 !spa_writeable(spa) || spa_suspended(spa)) 7170 continue; 7171 spa_open_ref(spa, FTAG); 7172 mutex_exit(&spa_namespace_lock); 7173 txg_wait_synced(spa_get_dsl(spa), 0); 7174 mutex_enter(&spa_namespace_lock); 7175 spa_close(spa, FTAG); 7176 } 7177 mutex_exit(&spa_namespace_lock); 7178 } 7179 7180 /* 7181 * ========================================================================== 7182 * Miscellaneous routines 7183 * ========================================================================== 7184 */ 7185 7186 /* 7187 * Remove all pools in the system. 7188 */ 7189 void 7190 spa_evict_all(void) 7191 { 7192 spa_t *spa; 7193 7194 /* 7195 * Remove all cached state. All pools should be closed now, 7196 * so every spa in the AVL tree should be unreferenced. 7197 */ 7198 mutex_enter(&spa_namespace_lock); 7199 while ((spa = spa_next(NULL)) != NULL) { 7200 /* 7201 * Stop async tasks. The async thread may need to detach 7202 * a device that's been replaced, which requires grabbing 7203 * spa_namespace_lock, so we must drop it here. 7204 */ 7205 spa_open_ref(spa, FTAG); 7206 mutex_exit(&spa_namespace_lock); 7207 spa_async_suspend(spa); 7208 mutex_enter(&spa_namespace_lock); 7209 spa_close(spa, FTAG); 7210 7211 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7212 spa_unload(spa); 7213 spa_deactivate(spa); 7214 } 7215 spa_remove(spa); 7216 } 7217 mutex_exit(&spa_namespace_lock); 7218 } 7219 7220 vdev_t * 7221 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 7222 { 7223 vdev_t *vd; 7224 int i; 7225 7226 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 7227 return (vd); 7228 7229 if (aux) { 7230 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 7231 vd = spa->spa_l2cache.sav_vdevs[i]; 7232 if (vd->vdev_guid == guid) 7233 return (vd); 7234 } 7235 7236 for (i = 0; i < spa->spa_spares.sav_count; i++) { 7237 vd = spa->spa_spares.sav_vdevs[i]; 7238 if (vd->vdev_guid == guid) 7239 return (vd); 7240 } 7241 } 7242 7243 return (NULL); 7244 } 7245 7246 void 7247 spa_upgrade(spa_t *spa, uint64_t version) 7248 { 7249 ASSERT(spa_writeable(spa)); 7250 7251 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7252 7253 /* 7254 * This should only be called for a non-faulted pool, and since a 7255 * future version would result in an unopenable pool, this shouldn't be 7256 * possible. 
7257 */ 7258 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 7259 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 7260 7261 spa->spa_uberblock.ub_version = version; 7262 vdev_config_dirty(spa->spa_root_vdev); 7263 7264 spa_config_exit(spa, SCL_ALL, FTAG); 7265 7266 txg_wait_synced(spa_get_dsl(spa), 0); 7267 } 7268 7269 boolean_t 7270 spa_has_spare(spa_t *spa, uint64_t guid) 7271 { 7272 int i; 7273 uint64_t spareguid; 7274 spa_aux_vdev_t *sav = &spa->spa_spares; 7275 7276 for (i = 0; i < sav->sav_count; i++) 7277 if (sav->sav_vdevs[i]->vdev_guid == guid) 7278 return (B_TRUE); 7279 7280 for (i = 0; i < sav->sav_npending; i++) { 7281 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 7282 &spareguid) == 0 && spareguid == guid) 7283 return (B_TRUE); 7284 } 7285 7286 return (B_FALSE); 7287 } 7288 7289 /* 7290 * Check if a pool has an active shared spare device. 7291 * Note: the reference count of an active spare is 2, once as a spare and 7292 * once as a replacement. 7293 */ 7294 static boolean_t 7295 spa_has_active_shared_spare(spa_t *spa) 7296 { 7297 int i, refcnt; 7298 uint64_t pool; 7299 spa_aux_vdev_t *sav = &spa->spa_spares; 7300 7301 for (i = 0; i < sav->sav_count; i++) { 7302 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 7303 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 7304 refcnt > 2) 7305 return (B_TRUE); 7306 } 7307 7308 return (B_FALSE); 7309 } 7310 7311 sysevent_t * 7312 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 7313 { 7314 sysevent_t *ev = NULL; 7315 #ifdef _KERNEL 7316 sysevent_attr_list_t *attr = NULL; 7317 sysevent_value_t value; 7318 7319 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 7320 SE_SLEEP); 7321 ASSERT(ev != NULL); 7322 7323 value.value_type = SE_DATA_TYPE_STRING; 7324 value.value.sv_string = spa_name(spa); 7325 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 7326 goto done; 7327 7328 value.value_type = SE_DATA_TYPE_UINT64; 7329 value.value.sv_uint64 = spa_guid(spa); 7330 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 7331 goto done; 7332 7333 if (vd) { 7334 value.value_type = SE_DATA_TYPE_UINT64; 7335 value.value.sv_uint64 = vd->vdev_guid; 7336 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 7337 SE_SLEEP) != 0) 7338 goto done; 7339 7340 if (vd->vdev_path) { 7341 value.value_type = SE_DATA_TYPE_STRING; 7342 value.value.sv_string = vd->vdev_path; 7343 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 7344 &value, SE_SLEEP) != 0) 7345 goto done; 7346 } 7347 } 7348 7349 if (hist_nvl != NULL) { 7350 fnvlist_merge((nvlist_t *)attr, hist_nvl); 7351 } 7352 7353 if (sysevent_attach_attributes(ev, attr) != 0) 7354 goto done; 7355 attr = NULL; 7356 7357 done: 7358 if (attr) 7359 sysevent_free_attr(attr); 7360 7361 #endif 7362 return (ev); 7363 } 7364 7365 void 7366 spa_event_post(sysevent_t *ev) 7367 { 7368 #ifdef _KERNEL 7369 sysevent_id_t eid; 7370 7371 (void) log_sysevent(ev, SE_SLEEP, &eid); 7372 sysevent_free(ev); 7373 #endif 7374 } 7375 7376 void 7377 spa_event_discard(sysevent_t *ev) 7378 { 7379 #ifdef _KERNEL 7380 sysevent_free(ev); 7381 #endif 7382 } 7383 7384 /* 7385 * Post a sysevent corresponding to the given event. The 'name' must be one of 7386 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 7387 * filled in from the spa and (optionally) the vdev and history nvl. This 7388 * doesn't do anything in the userland libzpool, as we don't want consumers to 7389 * misinterpret ztest or zdb as real changes.
7389 */ 7390 void 7391 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 7392 { 7393 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 7394 } 7395