/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017 Datto Inc.
 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/bpobj.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>
#include <sys/abd.h>

#ifdef _KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,		/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,		/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,		/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type.  When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq.  Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros.  Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput.  Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention.  The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL },	/* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) },	/* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },	/* IOCTL */
};
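
/*
 * As a concrete reading of the table above: ZTI_P(12, 8) in the READ
 * row's INTR column expands to { ZTI_MODE_FIXED, 12, 8 }, i.e. eight
 * separate taskqs of twelve threads each, with one of the eight picked
 * pseudo-randomly per dispatch, while ZTI_BATCH in the WRITE row's
 * ISSUE column creates a single throughput-oriented taskq whose thread
 * count is a percentage of the CPUs (zio_taskq_batch_pct, below).
 */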

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t trust_config,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

/*
 * Report any spa_load_verify errors found, but do not fail spa_load.
 * This is used by zdb to analyze non-idle pools.
 */
boolean_t	spa_load_verify_dryrun = B_FALSE;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
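
/*
 * Illustrative sketch (not part of the build): a call such as
 *
 *	spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, 42, ZPROP_SRC_NONE);
 *
 * leaves the outer nvlist with an entry of roughly the shape
 * "capacity" -> { ZPROP_SOURCE = ZPROP_SRC_NONE, ZPROP_VALUE = 42 },
 * which is the layout the property-reporting code consumes.
 */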

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools before this version, freedir will be
		 * NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
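
/*
 * Minimal caller sketch for spa_prop_get() (illustrative only):
 *
 *	nvlist_t *nvp = NULL;
 *
 *	if (spa_prop_get(spa, &nvp) == 0) {
 *		... consume the properties ...
 *		nvlist_free(nvp);
 *	}
 *
 * On success the caller owns the nvlist; on failure spa_prop_get() has
 * already freed it and cleared the pointer.
 */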

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPOOL_PROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable.
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed.  This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked).  We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;
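
		/*
		 * Examples of the cachefile rules above (illustrative):
		 * "" and "none" are accepted as-is, and "/etc/zfs/alt.cache"
		 * passes the checks, while "rel/path", "/dir/", "/dir/." and
		 * "/dir/.." are all rejected with EINVAL.
		 */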

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = SET_ERROR(E2BIG);
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}
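
/*
 * Illustrative example (not part of the build): setting the pool comment
 * from in-kernel code would look roughly like
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_string(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_COMMENT), "my pool");
 *	error = spa_prop_set(spa, props);
 *	fnvlist_free(props);
 *
 * spa_prop_validate() vets each pair first, and spa_sync_props() then
 * applies the surviving pairs in syncing context via dsl_sync_task().
 */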

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_write_cachefile(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
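
/*
 * Note that the bcopy()/avl_create() sequence above hands ownership of
 * the existing AVL nodes to the caller's tree copies and then
 * reinitializes the spa's trees to an empty state; the caller is
 * expected to drain and free the entries it received.
 */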

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
		}

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself.  In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}
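
/*
 * For example, with the zio_taskqs table above, spa_taskqs_init()
 * produces taskq names of the form "<type>_<queue>" or, for a queue
 * with several taskqs, "<type>_<queue>_<index>" -- e.g. the eight READ
 * interrupt taskqs come out along the lines of "zio_read_intr_0"
 * through "zio_read_intr_7" (the exact prefix is whatever
 * zio_type_name[] spells for the type).
 */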

#ifdef _KERNEL
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	for (size_t i = 0; i < TXG_SIZE; i++)
		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	for (size_t i = 0; i < TXG_SIZE; i++) {
		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
		spa->spa_txg_zio[i] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
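
/*
 * Sketch of the nvlist shape spa_config_parse() consumes (illustrative):
 *
 *	vdev_tree = {
 *		type = "root"
 *		children = [
 *			{ type = "mirror", children = [
 *				{ type = "disk", path = "/dev/dsk/c0t0d0s0" },
 *				{ type = "disk", path = "/dev/dsk/c0t1d0s0" } ] }
 *		]
 *	}
 *
 * Interior nvlists recurse through ZPOOL_CONFIG_CHILDREN; recursion
 * stops at vdevs whose ops have vdev_op_leaf set.
 */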

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa_load_note(spa, "UNLOADING");

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Even though vdev_free() also calls vdev_metaslab_fini, we need
	 * to call it earlier, before we wait for async i/o to complete.
	 * This ensures that there is no async metaslab prefetching, by
	 * calling taskq_wait(mg_taskq).
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	if (spa->spa_vdev_removal != NULL) {
		spa_vdev_removal_destroy(spa->spa_vdev_removal);
		spa->spa_vdev_removal = NULL;
	}

	spa_condense_fini(spa);

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache.
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa->spa_indirect_vdevs_loaded = B_FALSE;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in
	 * the active configuration, then we also mark this vdev as an active
	 * spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool.  When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev.
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped.
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;

	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error != 0)
		return (error);

	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
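
/*
 * The MOS stores such nvlists as a packed byte stream: the object's
 * bonus buffer holds the packed size as a single uint64_t, and the
 * object body holds the packed bytes that nvlist_unpack() decodes
 * above.
 */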

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    vdev_is_concrete(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
	}
}

static void
spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
{
	ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);

	vd->vdev_top_zap = mvd->vdev_top_zap;
	vd->vdev_leaf_zap = mvd->vdev_leaf_zap;

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
	}
}

/*
 * Validate the current config against the MOS config.
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd).  Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * Also trust the MOS config about any "indirect" vdevs.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev, then use that vdev instead.
		 */
		if ((tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) ||
		    (mtvd->vdev_ops == &vdev_indirect_ops &&
		    tvd->vdev_ops != &vdev_indirect_ops)) {

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				if (!(spa->spa_import_flags &
				    ZFS_IMPORT_MISSING_LOG)) {
					continue;
				}

				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else if (mtvd->vdev_ops != &vdev_indirect_ops) {
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			vdev_reopen(rvd);
		} else {
			if (mtvd->vdev_islog) {
				/*
				 * Load the slog device's state from the MOS
				 * config since it's possible that the label
				 * does not contain the most up-to-date
				 * information.
				 */
				vdev_load_log_state(tvd, mtvd);
				vdev_reopen(tvd);
			}

			/*
			 * Per-vdev ZAP info is stored exclusively in the MOS.
			 */
			spa_config_valid_zaps(tvd, mtvd);
		}

		/*
		 * Never trust this info from userland; always use what's
		 * in the MOS.  This prevents it from getting out of sync
		 * with the rest of the info in the MOS.
		 */
		tvd->vdev_removing = mtvd->vdev_removing;
		tvd->vdev_indirect_config = mtvd->vdev_indirect_config;
	}

	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices.
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;
	dsl_pool_t *dp = spa_get_dsl(spa);

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_reset_logs(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_reset,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	for (int i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;
	spa_t *spa = zio->io_spa;

	abd_free(zio->io_abd);
	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_inc_64(&sle->sle_meta_count);
		else
			atomic_inc_64(&sle->sle_data_count);
	}

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Maximum number of concurrent scrub i/os to create while verifying
 * a pool during import.
 */
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;
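
/*
 * spa_load_verify_cb() below implements a simple flow-control scheme
 * against these limits: it bumps spa_scrub_inflight before issuing each
 * read and blocks on spa_scrub_io_cv once spa_load_verify_maxinflight
 * is reached, while spa_load_verify_done() decrements the count and
 * signals the cv as each I/O completes.
 */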

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
		return (0);
	/*
	 * Note: normally this routine will not be called if
	 * spa_load_verify_metadata is not set.  However, it may be useful
	 * to manually set the flag after the traversal has begun.
	 */
	if (!spa_load_verify_metadata)
		return (0);
	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
		return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	return (0);
}

/* ARGSUSED */
int
verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));

	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error = 0;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
	error = dmu_objset_find_dp(spa->spa_dsl_pool,
	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
	    DS_FIND_CHILDREN);
	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
	if (error != 0)
		return (error);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	if (spa_load_verify_metadata) {
		if (spa->spa_extreme_rewind) {
			spa_load_note(spa, "performing a complete scan of the "
			    "pool since extreme rewind is on. This may take "
			    "a very long time.\n (spa_load_verify_data=%u, "
			    "spa_load_verify_metadata=%u)",
			    spa_load_verify_data, spa_load_verify_metadata);
		}
		error = traverse_pool(spa, spa->spa_verify_min_txg,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    spa_load_verify_cb, rio);
	}

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
		    (u_longlong_t)sle.sle_data_count);
	}

	if (spa_load_verify_dryrun ||
	    (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata)) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (spa_load_verify_dryrun)
		return (0);

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}
0 : EIO); 2059 } 2060 2061 /* 2062 * Find a value in the pool props object. 2063 */ 2064 static void 2065 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2066 { 2067 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2068 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2069 } 2070 2071 /* 2072 * Find a value in the pool directory object. 2073 */ 2074 static int 2075 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2076 { 2077 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2078 name, sizeof (uint64_t), 1, val); 2079 2080 if (error != 0 && (error != ENOENT || log_enoent)) { 2081 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2082 "[error=%d]", name, error); 2083 } 2084 2085 return (error); 2086 } 2087 2088 static int 2089 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2090 { 2091 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2092 return (SET_ERROR(err)); 2093 } 2094 2095 /* 2096 * Fix up config after a partly-completed split. This is done with the 2097 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2098 * pool have that entry in their config, but only the splitting one contains 2099 * a list of all the guids of the vdevs that are being split off. 2100 * 2101 * This function determines what to do with that list: either rejoin 2102 * all the disks to the pool, or complete the splitting process. To attempt 2103 * the rejoin, each disk that is offlined is marked online again, and 2104 * we do a reopen() call. If the vdev label for every disk that was 2105 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2106 * then we call vdev_split() on each disk, and complete the split. 2107 * 2108 * Otherwise we leave the config alone, with all the vdevs in place in 2109 * the original pool. 2110 */ 2111 static void 2112 spa_try_repair(spa_t *spa, nvlist_t *config) 2113 { 2114 uint_t extracted; 2115 uint64_t *glist; 2116 uint_t i, gcount; 2117 nvlist_t *nvl; 2118 vdev_t **vd; 2119 boolean_t attempt_reopen; 2120 2121 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2122 return; 2123 2124 /* check that the config is complete */ 2125 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2126 &glist, &gcount) != 0) 2127 return; 2128 2129 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2130 2131 /* attempt to online all the vdevs & validate */ 2132 attempt_reopen = B_TRUE; 2133 for (i = 0; i < gcount; i++) { 2134 if (glist[i] == 0) /* vdev is hole */ 2135 continue; 2136 2137 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2138 if (vd[i] == NULL) { 2139 /* 2140 * Don't bother attempting to reopen the disks; 2141 * just do the split. 2142 */ 2143 attempt_reopen = B_FALSE; 2144 } else { 2145 /* attempt to re-online it */ 2146 vd[i]->vdev_offline = B_FALSE; 2147 } 2148 } 2149 2150 if (attempt_reopen) { 2151 vdev_reopen(spa->spa_root_vdev); 2152 2153 /* check each device to see what state it's in */ 2154 for (extracted = 0, i = 0; i < gcount; i++) { 2155 if (vd[i] != NULL && 2156 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2157 break; 2158 ++extracted; 2159 } 2160 } 2161 2162 /* 2163 * If every disk has been moved to the new pool, or if we never 2164 * even attempted to look at them, then we split them off for 2165 * good. 
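 * Completing the split means calling vdev_split() on each device to
 * detach it from this pool, then reopening the root vdev so the
 * remaining configuration is re-evaluated.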
2166 */ 2167 if (!attempt_reopen || gcount == extracted) { 2168 for (i = 0; i < gcount; i++) 2169 if (vd[i] != NULL) 2170 vdev_split(vd[i]); 2171 vdev_reopen(spa->spa_root_vdev); 2172 } 2173 2174 kmem_free(vd, gcount * sizeof (vdev_t *)); 2175 } 2176 2177 static int 2178 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2179 boolean_t trust_config) 2180 { 2181 nvlist_t *config = spa->spa_config; 2182 char *ereport = FM_EREPORT_ZFS_POOL; 2183 char *comment; 2184 int error; 2185 uint64_t pool_guid; 2186 nvlist_t *nvl; 2187 2188 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2189 return (SET_ERROR(EINVAL)); 2190 2191 ASSERT(spa->spa_comment == NULL); 2192 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2193 spa->spa_comment = spa_strdup(comment); 2194 2195 /* 2196 * Versioning wasn't explicitly added to the label until later, so if 2197 * it's not present treat it as the initial version. 2198 */ 2199 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2200 &spa->spa_ubsync.ub_version) != 0) 2201 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2202 2203 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2204 &spa->spa_config_txg); 2205 2206 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2207 spa_guid_exists(pool_guid, 0)) { 2208 error = SET_ERROR(EEXIST); 2209 } else { 2210 spa->spa_config_guid = pool_guid; 2211 2212 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2213 &nvl) == 0) { 2214 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2215 KM_SLEEP) == 0); 2216 } 2217 2218 nvlist_free(spa->spa_load_info); 2219 spa->spa_load_info = fnvlist_alloc(); 2220 2221 gethrestime(&spa->spa_loaded_ts); 2222 error = spa_load_impl(spa, pool_guid, config, state, type, 2223 trust_config, &ereport); 2224 } 2225 2226 /* 2227 * Don't count references from objsets that are already closed 2228 * and are making their way through the eviction process. 2229 */ 2230 spa_evicting_os_wait(spa); 2231 spa->spa_minref = refcount_count(&spa->spa_refcount); 2232 if (error) { 2233 if (error != EEXIST) { 2234 spa->spa_loaded_ts.tv_sec = 0; 2235 spa->spa_loaded_ts.tv_nsec = 0; 2236 } 2237 if (error != EBADF) { 2238 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2239 } 2240 } 2241 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2242 spa->spa_ena = 0; 2243 2244 return (error); 2245 } 2246 2247 /* 2248 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2249 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2250 * spa's per-vdev ZAP list. 
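 * The count is computed by a recursive walk over vdev_children; the
 * ASSERT0(zap_lookup_int()) calls check AVZ membership as we count.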
2251 */ 2252 static uint64_t 2253 vdev_count_verify_zaps(vdev_t *vd) 2254 { 2255 spa_t *spa = vd->vdev_spa; 2256 uint64_t total = 0; 2257 if (vd->vdev_top_zap != 0) { 2258 total++; 2259 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2260 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2261 } 2262 if (vd->vdev_leaf_zap != 0) { 2263 total++; 2264 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2265 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2266 } 2267 2268 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2269 total += vdev_count_verify_zaps(vd->vdev_child[i]); 2270 } 2271 2272 return (total); 2273 } 2274 2275 static int 2276 spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2277 spa_import_type_t type) 2278 { 2279 int error = 0; 2280 nvlist_t *nvtree = NULL; 2281 int parse; 2282 vdev_t *rvd; 2283 2284 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 2285 spa_load_failed(spa, "invalid config provided: '%s' missing", 2286 ZPOOL_CONFIG_VDEV_TREE); 2287 return (SET_ERROR(EINVAL)); 2288 } 2289 2290 parse = (type == SPA_IMPORT_EXISTING ? 2291 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2292 2293 /* 2294 * Create "The Godfather" zio to hold all async IOs 2295 */ 2296 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2297 KM_SLEEP); 2298 for (int i = 0; i < max_ncpus; i++) { 2299 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2300 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2301 ZIO_FLAG_GODFATHER); 2302 } 2303 2304 /* 2305 * Parse the configuration into a vdev tree. We explicitly set the 2306 * value that will be returned by spa_version() since parsing the 2307 * configuration requires knowing the version number. 2308 */ 2309 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2310 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 2311 spa_config_exit(spa, SCL_ALL, FTAG); 2312 2313 if (error != 0) { 2314 spa_load_failed(spa, "unable to parse config [error=%d]", 2315 error); 2316 return (error); 2317 } 2318 2319 ASSERT(spa->spa_root_vdev == rvd); 2320 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2321 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2322 2323 if (type != SPA_IMPORT_ASSEMBLE) { 2324 ASSERT(spa_guid(spa) == pool_guid); 2325 } 2326 2327 return (0); 2328 } 2329 2330 static int 2331 spa_ld_open_vdevs(spa_t *spa) 2332 { 2333 int error = 0; 2334 2335 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2336 error = vdev_open(spa->spa_root_vdev); 2337 spa_config_exit(spa, SCL_ALL, FTAG); 2338 if (error != 0) { 2339 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 2340 error); 2341 } 2342 2343 return (error); 2344 } 2345 2346 static int 2347 spa_ld_validate_vdevs(spa_t *spa, spa_import_type_t type, 2348 boolean_t trust_config) 2349 { 2350 int error = 0; 2351 vdev_t *rvd = spa->spa_root_vdev; 2352 2353 /* 2354 * We need to validate the vdev labels against the configuration that 2355 * we have in hand, which is dependent on the setting of trust_config. 2356 * If trust_config is true then we're validating the vdev labels based 2357 * on that config. Otherwise, we're validating against the cached 2358 * config (zpool.cache) that was read when we loaded the zfs module, and 2359 * then later we will recursively call spa_load() and validate against 2360 * the vdev config. 2361 * 2362 * If we're assembling a new pool that's been split off from an 2363 * existing pool, the labels haven't yet been updated so we skip 2364 * validation for now. 
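 * (The split-off labels are expected to be rewritten once the newly
 * assembled pool syncs out its own configuration.)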
2365 */ 2366 if (type != SPA_IMPORT_ASSEMBLE) { 2367 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2368 error = vdev_validate(rvd, trust_config); 2369 spa_config_exit(spa, SCL_ALL, FTAG); 2370 2371 if (error != 0) { 2372 spa_load_failed(spa, "vdev_validate failed [error=%d]", 2373 error); 2374 return (error); 2375 } 2376 2377 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 2378 spa_load_failed(spa, "cannot open vdev tree after " 2379 "invalidating some vdevs"); 2380 return (SET_ERROR(ENXIO)); 2381 } 2382 } 2383 2384 return (0); 2385 } 2386 2387 static int 2388 spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type, 2389 boolean_t trust_config) 2390 { 2391 vdev_t *rvd = spa->spa_root_vdev; 2392 nvlist_t *label; 2393 uberblock_t *ub = &spa->spa_uberblock; 2394 uint64_t children; 2395 2396 /* 2397 * Find the best uberblock. 2398 */ 2399 vdev_uberblock_load(rvd, ub, &label); 2400 2401 /* 2402 * If we weren't able to find a single valid uberblock, return failure. 2403 */ 2404 if (ub->ub_txg == 0) { 2405 nvlist_free(label); 2406 spa_load_failed(spa, "no valid uberblock found"); 2407 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2408 } 2409 2410 spa_load_note(spa, "using uberblock with txg=%llu", 2411 (u_longlong_t)ub->ub_txg); 2412 2413 /* 2414 * If the pool has an unsupported version we can't open it. 2415 */ 2416 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2417 nvlist_free(label); 2418 spa_load_failed(spa, "version %llu is not supported", 2419 (u_longlong_t)ub->ub_version); 2420 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2421 } 2422 2423 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2424 nvlist_t *features; 2425 2426 /* 2427 * If we weren't able to find what's necessary for reading the 2428 * MOS in the label, return failure. 2429 */ 2430 if (label == NULL) { 2431 spa_load_failed(spa, "label config unavailable"); 2432 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2433 ENXIO)); 2434 } 2435 2436 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 2437 &features) != 0) { 2438 nvlist_free(label); 2439 spa_load_failed(spa, "invalid label: '%s' missing", 2440 ZPOOL_CONFIG_FEATURES_FOR_READ); 2441 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2442 ENXIO)); 2443 } 2444 2445 /* 2446 * Update our in-core representation with the definitive values 2447 * from the label. 2448 */ 2449 nvlist_free(spa->spa_label_features); 2450 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2451 } 2452 2453 nvlist_free(label); 2454 2455 /* 2456 * Look through entries in the label nvlist's features_for_read. If 2457 * there is a feature listed there which we don't understand then we 2458 * cannot open a pool. 
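 * Features needed only for write do not prevent a read-only open; they
 * are checked against the MOS later, in spa_ld_check_features().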
2459 */ 2460 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2461 nvlist_t *unsup_feat; 2462 2463 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2464 0); 2465 2466 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2467 NULL); nvp != NULL; 2468 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2469 if (!zfeature_is_supported(nvpair_name(nvp))) { 2470 VERIFY(nvlist_add_string(unsup_feat, 2471 nvpair_name(nvp), "") == 0); 2472 } 2473 } 2474 2475 if (!nvlist_empty(unsup_feat)) { 2476 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2477 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2478 nvlist_free(unsup_feat); 2479 spa_load_failed(spa, "some features are unsupported"); 2480 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2481 ENOTSUP)); 2482 } 2483 2484 nvlist_free(unsup_feat); 2485 } 2486 2487 /* 2488 * If the vdev guid sum doesn't match the uberblock, we have an 2489 * incomplete configuration. We first check to see if the pool 2490 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN). 2491 * If it is, defer the vdev_guid_sum check till later so we 2492 * can handle missing vdevs. 2493 */ 2494 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2495 &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE && 2496 rvd->vdev_guid_sum != ub->ub_guid_sum) { 2497 spa_load_failed(spa, "guid sum in config doesn't match guid " 2498 "sum in uberblock (%llu != %llu)", 2499 (u_longlong_t)rvd->vdev_guid_sum, 2500 (u_longlong_t)ub->ub_guid_sum); 2501 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2502 } 2503 2504 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2505 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2506 spa_try_repair(spa, config); 2507 spa_config_exit(spa, SCL_ALL, FTAG); 2508 nvlist_free(spa->spa_config_splitting); 2509 spa->spa_config_splitting = NULL; 2510 } 2511 2512 /* 2513 * Initialize internal SPA structures. 2514 */ 2515 spa->spa_state = POOL_STATE_ACTIVE; 2516 spa->spa_ubsync = spa->spa_uberblock; 2517 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2518 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2519 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 2520 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2521 spa->spa_claim_max_txg = spa->spa_first_txg; 2522 spa->spa_prev_software_version = ub->ub_software_version; 2523 2524 return (0); 2525 } 2526 2527 static int 2528 spa_ld_open_rootbp(spa_t *spa) 2529 { 2530 int error = 0; 2531 vdev_t *rvd = spa->spa_root_vdev; 2532 2533 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2534 if (error != 0) { 2535 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 2536 "[error=%d]", error); 2537 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2538 } 2539 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2540 2541 return (0); 2542 } 2543 2544 static int 2545 spa_ld_validate_config(spa_t *spa, spa_import_type_t type) 2546 { 2547 vdev_t *rvd = spa->spa_root_vdev; 2548 2549 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 2550 != 0) 2551 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2552 2553 /* 2554 * Validate the config, using the MOS config to fill in any 2555 * information which might be missing. If we fail to validate 2556 * the config then declare the pool unfit for use. If we're 2557 * assembling a pool from a split, the log is not transferred 2558 * over.
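 * (For that reason, the MOS cross-check below is skipped entirely for
 * SPA_IMPORT_ASSEMBLE.)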
2559 */ 2560 if (type != SPA_IMPORT_ASSEMBLE) { 2561 nvlist_t *mos_config; 2562 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 2563 != 0) { 2564 spa_load_failed(spa, "unable to retrieve MOS config"); 2565 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2566 } 2567 2568 if (!spa_config_valid(spa, mos_config)) { 2569 nvlist_free(mos_config); 2570 spa_load_failed(spa, "mismatch between config provided " 2571 "and config stored in MOS"); 2572 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2573 ENXIO)); 2574 } 2575 nvlist_free(mos_config); 2576 2577 /* 2578 * Now that we've validated the config, check the state of the 2579 * root vdev. If it can't be opened, it indicates one or 2580 * more toplevel vdevs are faulted. 2581 */ 2582 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 2583 spa_load_failed(spa, "some top vdevs are unavailable"); 2584 return (SET_ERROR(ENXIO)); 2585 } 2586 } 2587 2588 return (0); 2589 } 2590 2591 static int 2592 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 2593 { 2594 int error = 0; 2595 vdev_t *rvd = spa->spa_root_vdev; 2596 2597 /* 2598 * Everything that we read before spa_remove_init() must be stored 2599 * on concrete vdevs. Therefore we do this as early as possible. 2600 */ 2601 error = spa_remove_init(spa); 2602 if (error != 0) { 2603 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 2604 error); 2605 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2606 } 2607 2608 /* 2609 * Retrieve information needed to condense indirect vdev mappings. 2610 */ 2611 error = spa_condense_init(spa); 2612 if (error != 0) { 2613 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 2614 error); 2615 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 2616 } 2617 2618 return (0); 2619 } 2620 2621 static int 2622 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 2623 { 2624 int error = 0; 2625 vdev_t *rvd = spa->spa_root_vdev; 2626 2627 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2628 boolean_t missing_feat_read = B_FALSE; 2629 nvlist_t *unsup_feat, *enabled_feat; 2630 2631 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2632 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 2633 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2634 } 2635 2636 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2637 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 2638 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2639 } 2640 2641 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2642 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 2643 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2644 } 2645 2646 enabled_feat = fnvlist_alloc(); 2647 unsup_feat = fnvlist_alloc(); 2648 2649 if (!spa_features_check(spa, B_FALSE, 2650 unsup_feat, enabled_feat)) 2651 missing_feat_read = B_TRUE; 2652 2653 if (spa_writeable(spa) || 2654 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 2655 if (!spa_features_check(spa, B_TRUE, 2656 unsup_feat, enabled_feat)) { 2657 *missing_feat_writep = B_TRUE; 2658 } 2659 } 2660 2661 fnvlist_add_nvlist(spa->spa_load_info, 2662 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2663 2664 if (!nvlist_empty(unsup_feat)) { 2665 fnvlist_add_nvlist(spa->spa_load_info, 2666 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2667 } 2668 2669 fnvlist_free(enabled_feat); 2670 fnvlist_free(unsup_feat); 2671 2672 if (!missing_feat_read) { 2673 fnvlist_add_boolean(spa->spa_load_info, 2674 ZPOOL_CONFIG_CAN_RDONLY); 2675 } 2676 2677 /* 2678 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2679 * twofold: to determine whether the pool is
available for 2680 * import in read-write mode and (if it is not) whether the 2681 * pool is available for import in read-only mode. If the pool 2682 * is available for import in read-write mode, it is displayed 2683 * as available in userland; if it is not available for import 2684 * in read-only mode, it is displayed as unavailable in 2685 * userland. If the pool is available for import in read-only 2686 * mode but not read-write mode, it is displayed as unavailable 2687 * in userland with a special note that the pool is actually 2688 * available for open in read-only mode. 2689 * 2690 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2691 * missing a feature for write, we must first determine whether 2692 * the pool can be opened read-only before returning to 2693 * userland in order to know whether to display the 2694 * abovementioned note. 2695 */ 2696 if (missing_feat_read || (*missing_feat_writep && 2697 spa_writeable(spa))) { 2698 spa_load_failed(spa, "pool uses unsupported features"); 2699 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2700 ENOTSUP)); 2701 } 2702 2703 /* 2704 * Load refcounts for ZFS features from disk into an in-memory 2705 * cache during SPA initialization. 2706 */ 2707 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2708 uint64_t refcount; 2709 2710 error = feature_get_refcount_from_disk(spa, 2711 &spa_feature_table[i], &refcount); 2712 if (error == 0) { 2713 spa->spa_feat_refcount_cache[i] = refcount; 2714 } else if (error == ENOTSUP) { 2715 spa->spa_feat_refcount_cache[i] = 2716 SPA_FEATURE_DISABLED; 2717 } else { 2718 spa_load_failed(spa, "error getting refcount " 2719 "for feature %s [error=%d]", 2720 spa_feature_table[i].fi_guid, error); 2721 return (spa_vdev_err(rvd, 2722 VDEV_AUX_CORRUPT_DATA, EIO)); 2723 } 2724 } 2725 } 2726 2727 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2728 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2729 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 2730 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2731 } 2732 2733 return (0); 2734 } 2735 2736 static int 2737 spa_ld_load_special_directories(spa_t *spa) 2738 { 2739 int error = 0; 2740 vdev_t *rvd = spa->spa_root_vdev; 2741 2742 spa->spa_is_initializing = B_TRUE; 2743 error = dsl_pool_open(spa->spa_dsl_pool); 2744 spa->spa_is_initializing = B_FALSE; 2745 if (error != 0) { 2746 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 2747 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2748 } 2749 2750 return (0); 2751 } 2752 2753 static int 2754 spa_ld_prepare_for_reload(spa_t *spa, int orig_mode) 2755 { 2756 vdev_t *rvd = spa->spa_root_vdev; 2757 2758 uint64_t hostid; 2759 nvlist_t *policy = NULL; 2760 nvlist_t *mos_config; 2761 2762 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 2763 spa_load_failed(spa, "unable to retrieve MOS config"); 2764 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2765 } 2766 2767 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 2768 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2769 char *hostname; 2770 unsigned long myhostid = 0; 2771 2772 VERIFY(nvlist_lookup_string(mos_config, 2773 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2774 2775 #ifdef _KERNEL 2776 myhostid = zone_get_hostid(NULL); 2777 #else /* _KERNEL */ 2778 /* 2779 * We're emulating the system's hostid in userland, so 2780 * we can't use zone_get_hostid(). 
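 * Instead we parse the hostid out of the hw_serial string, which the
 * userland build (libzpool) maintains for this purpose.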
2781 */ 2782 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2783 #endif /* _KERNEL */ 2784 if (hostid != 0 && myhostid != 0 && 2785 hostid != myhostid) { 2786 nvlist_free(mos_config); 2787 cmn_err(CE_WARN, "pool '%s' could not be " 2788 "loaded as it was last accessed by " 2789 "another system (host: %s hostid: 0x%lx). " 2790 "See: http://illumos.org/msg/ZFS-8000-EY", 2791 spa_name(spa), hostname, 2792 (unsigned long)hostid); 2793 return (SET_ERROR(EBADF)); 2794 } 2795 } 2796 if (nvlist_lookup_nvlist(spa->spa_config, 2797 ZPOOL_REWIND_POLICY, &policy) == 0) 2798 VERIFY(nvlist_add_nvlist(mos_config, 2799 ZPOOL_REWIND_POLICY, policy) == 0); 2800 2801 spa_config_set(spa, mos_config); 2802 spa_unload(spa); 2803 spa_deactivate(spa); 2804 spa_activate(spa, orig_mode); 2805 2806 return (0); 2807 } 2808 2809 static int 2810 spa_ld_get_props(spa_t *spa) 2811 { 2812 int error = 0; 2813 uint64_t obj; 2814 vdev_t *rvd = spa->spa_root_vdev; 2815 2816 /* Grab the secret checksum salt from the MOS. */ 2817 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2818 DMU_POOL_CHECKSUM_SALT, 1, 2819 sizeof (spa->spa_cksum_salt.zcs_bytes), 2820 spa->spa_cksum_salt.zcs_bytes); 2821 if (error == ENOENT) { 2822 /* Generate a new salt for subsequent use */ 2823 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2824 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2825 } else if (error != 0) { 2826 spa_load_failed(spa, "unable to retrieve checksum salt from " 2827 "MOS [error=%d]", error); 2828 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2829 } 2830 2831 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 2832 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2833 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2834 if (error != 0) { 2835 spa_load_failed(spa, "error opening deferred-frees bpobj " 2836 "[error=%d]", error); 2837 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2838 } 2839 2840 /* 2841 * Load the bit that tells us to use the new accounting function 2842 * (raid-z deflation). If we have an older pool, this will not 2843 * be present. 2844 */ 2845 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 2846 if (error != 0 && error != ENOENT) 2847 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2848 2849 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2850 &spa->spa_creation_version, B_FALSE); 2851 if (error != 0 && error != ENOENT) 2852 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2853 2854 /* 2855 * Load the persistent error log. If we have an older pool, this will 2856 * not be present. 2857 */ 2858 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 2859 B_FALSE); 2860 if (error != 0 && error != ENOENT) 2861 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2862 2863 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2864 &spa->spa_errlog_scrub, B_FALSE); 2865 if (error != 0 && error != ENOENT) 2866 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2867 2868 /* 2869 * Load the history object. If we have an older pool, this 2870 * will not be present. 2871 */ 2872 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 2873 if (error != 0 && error != ENOENT) 2874 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2875 2876 /* 2877 * Load the per-vdev ZAP map. 
If we have an older pool, this will not 2878 * be present; in this case, defer its creation to a later time to 2879 * avoid dirtying the MOS this early (i.e. outside of sync context). See 2880 * spa_sync_config_object. 2881 */ 2882 2883 /* The sentinel is only available in the MOS config. */ 2884 nvlist_t *mos_config; 2885 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 2886 spa_load_failed(spa, "unable to retrieve MOS config"); 2887 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2888 } 2889 2890 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 2891 &spa->spa_all_vdev_zaps, B_FALSE); 2892 2893 if (error == ENOENT) { 2894 VERIFY(!nvlist_exists(mos_config, 2895 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 2896 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 2897 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2898 } else if (error != 0) { 2899 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2900 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 2901 /* 2902 * An older version of ZFS overwrote the sentinel value, so 2903 * we have orphaned per-vdev ZAPs in the MOS. Defer their 2904 * destruction to later; see spa_sync_config_object. 2905 */ 2906 spa->spa_avz_action = AVZ_ACTION_DESTROY; 2907 /* 2908 * We're assuming that no vdevs have had their ZAPs created 2909 * before this. Better be sure of it. 2910 */ 2911 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2912 } 2913 nvlist_free(mos_config); 2914 2915 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2916 2917 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 2918 B_FALSE); 2919 if (error && error != ENOENT) 2920 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2921 2922 if (error == 0) { 2923 uint64_t autoreplace; 2924 2925 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2926 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2927 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2928 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2929 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2930 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2931 &spa->spa_dedup_ditto); 2932 2933 spa->spa_autoreplace = (autoreplace != 0); 2934 } 2935 2936 return (0); 2937 } 2938 2939 static int 2940 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 2941 { 2942 int error = 0; 2943 vdev_t *rvd = spa->spa_root_vdev; 2944 2945 /* 2946 * If we're assembling the pool from the split-off vdevs of 2947 * an existing pool, we don't want to attach the spares & cache 2948 * devices. 2949 */ 2950 2951 /* 2952 * Load any hot spares for this pool. 2953 */ 2954 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 2955 B_FALSE); 2956 if (error != 0 && error != ENOENT) 2957 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2958 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2959 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2960 if (load_nvlist(spa, spa->spa_spares.sav_object, 2961 &spa->spa_spares.sav_config) != 0) { 2962 spa_load_failed(spa, "error loading spares nvlist"); 2963 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2964 } 2965 2966 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2967 spa_load_spares(spa); 2968 spa_config_exit(spa, SCL_ALL, FTAG); 2969 } else if (error == 0) { 2970 spa->spa_spares.sav_sync = B_TRUE; 2971 } 2972 2973 /* 2974 * Load any level 2 ARC devices for this pool.
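 * This mirrors the hot spare path above: look the object up in the MOS
 * directory, then instantiate the cache vdevs unless we are assembling
 * a pool from a split.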
2975 */ 2976 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2977 &spa->spa_l2cache.sav_object, B_FALSE); 2978 if (error != 0 && error != ENOENT) 2979 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2980 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2981 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2982 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2983 &spa->spa_l2cache.sav_config) != 0) { 2984 spa_load_failed(spa, "error loading l2cache nvlist"); 2985 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2986 } 2987 2988 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2989 spa_load_l2cache(spa); 2990 spa_config_exit(spa, SCL_ALL, FTAG); 2991 } else if (error == 0) { 2992 spa->spa_l2cache.sav_sync = B_TRUE; 2993 } 2994 2995 return (0); 2996 } 2997 2998 static int 2999 spa_ld_load_vdev_metadata(spa_t *spa) 3000 { 3001 int error = 0; 3002 vdev_t *rvd = spa->spa_root_vdev; 3003 3004 /* 3005 * If the 'autoreplace' property is set, then post a resource notifying 3006 * the ZFS DE that it should not issue any faults for unopenable 3007 * devices. We also iterate over the vdevs, and post a sysevent for any 3008 * unopenable vdevs so that the normal autoreplace handler can take 3009 * over. 3010 */ 3011 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3012 spa_check_removed(spa->spa_root_vdev); 3013 /* 3014 * For the import case, this is done in spa_import(), because 3015 * at this point we're using the spare definitions from 3016 * the MOS config, not necessarily from the userland config. 3017 */ 3018 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 3019 spa_aux_check_removed(&spa->spa_spares); 3020 spa_aux_check_removed(&spa->spa_l2cache); 3021 } 3022 } 3023 3024 /* 3025 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 3026 */ 3027 error = vdev_load(rvd); 3028 if (error != 0) { 3029 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 3030 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3031 } 3032 3033 /* 3034 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 3035 */ 3036 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3037 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 3038 spa_config_exit(spa, SCL_ALL, FTAG); 3039 3040 return (0); 3041 } 3042 3043 static int 3044 spa_ld_load_dedup_tables(spa_t *spa) 3045 { 3046 int error = 0; 3047 vdev_t *rvd = spa->spa_root_vdev; 3048 3049 error = ddt_load(spa); 3050 if (error != 0) { 3051 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 3052 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3053 } 3054 3055 return (0); 3056 } 3057 3058 static int 3059 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) 3060 { 3061 vdev_t *rvd = spa->spa_root_vdev; 3062 3063 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 3064 boolean_t missing = spa_check_logs(spa); 3065 if (missing) { 3066 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 3067 spa_load_failed(spa, "spa_check_logs failed"); 3068 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 3069 } 3070 } 3071 3072 return (0); 3073 } 3074 3075 static int 3076 spa_ld_verify_pool_data(spa_t *spa) 3077 { 3078 int error = 0; 3079 vdev_t *rvd = spa->spa_root_vdev; 3080 3081 /* 3082 * We've successfully opened the pool, verify that we're ready 3083 * to start pushing transactions. 
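 * spa_load_verify() honors the pool's rewind policy; under extreme
 * rewind it traverses and checksums the pool's metadata (and,
 * optionally, its data) before we commit to this txg.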
3084 */ 3085 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 3086 error = spa_load_verify(spa); 3087 if (error != 0) { 3088 spa_load_failed(spa, "spa_load_verify failed " 3089 "[error=%d]", error); 3090 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3091 error)); 3092 } 3093 } 3094 3095 return (0); 3096 } 3097 3098 static void 3099 spa_ld_claim_log_blocks(spa_t *spa) 3100 { 3101 dmu_tx_t *tx; 3102 dsl_pool_t *dp = spa_get_dsl(spa); 3103 3104 /* 3105 * Claim log blocks that haven't been committed yet. 3106 * This must all happen in a single txg. 3107 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 3108 * invoked from zil_claim_log_block()'s i/o done callback. 3109 * Price of rollback is that we abandon the log. 3110 */ 3111 spa->spa_claiming = B_TRUE; 3112 3113 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 3114 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 3115 zil_claim, tx, DS_FIND_CHILDREN); 3116 dmu_tx_commit(tx); 3117 3118 spa->spa_claiming = B_FALSE; 3119 3120 spa_set_log_state(spa, SPA_LOG_GOOD); 3121 } 3122 3123 static void 3124 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg) 3125 { 3126 vdev_t *rvd = spa->spa_root_vdev; 3127 int need_update = B_FALSE; 3128 3129 /* 3130 * If the config cache is stale, or we have uninitialized 3131 * metaslabs (see spa_vdev_add()), then update the config. 3132 * 3133 * If this is a verbatim import, trust the current 3134 * in-core spa_config and update the disk labels. 3135 */ 3136 if (config_cache_txg != spa->spa_config_txg || 3137 spa->spa_load_state == SPA_LOAD_IMPORT || 3138 spa->spa_load_state == SPA_LOAD_RECOVER || 3139 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 3140 need_update = B_TRUE; 3141 3142 for (int c = 0; c < rvd->vdev_children; c++) 3143 if (rvd->vdev_child[c]->vdev_ms_array == 0) 3144 need_update = B_TRUE; 3145 3146 /* 3147 * Update the config cache asynchronously in case we're the 3148 * root pool, in which case the config cache isn't writable yet. 3149 */ 3150 if (need_update) 3151 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3152 } 3153 3154 /* 3155 * Load an existing storage pool, using the config provided. This config 3156 * describes which vdevs are part of the pool and is later validated against 3157 * partial configs present in each vdev's label and an entire copy of the 3158 * config stored in the MOS. 3159 */ 3160 static int 3161 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 3162 spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, 3163 char **ereport) 3164 { 3165 int error = 0; 3166 uint64_t config_cache_txg = spa->spa_config_txg; 3167 int orig_mode = spa->spa_mode; 3168 boolean_t missing_feat_write = B_FALSE; 3169 3170 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3171 3172 spa->spa_load_state = state; 3173 spa_load_note(spa, "LOADING"); 3174 3175 /* 3176 * If this is an untrusted config, first access the pool in read-only 3177 * mode. We will then retrieve a trusted copy of the config from the MOS 3178 * and use it to reopen the pool in read-write mode. 3179 */ 3180 if (!trust_config) 3181 spa->spa_mode = FREAD; 3182 3183 /* 3184 * Parse the config provided to create a vdev tree. 3185 */ 3186 error = spa_ld_parse_config(spa, pool_guid, config, type); 3187 if (error != 0) 3188 return (error); 3189 3190 /* 3191 * Now that we have the vdev tree, try to open each vdev. This involves 3192 * opening the underlying physical device, retrieving its geometry and 3193 * probing the vdev with a dummy I/O.
The state of each vdev will be set 3194 * based on the success of those operations. After this we'll be ready 3195 * to read from the vdevs. 3196 */ 3197 error = spa_ld_open_vdevs(spa); 3198 if (error != 0) 3199 return (error); 3200 3201 /* 3202 * Read the label of each vdev and make sure that the GUIDs stored 3203 * there match the GUIDs in the config provided. 3204 */ 3205 error = spa_ld_validate_vdevs(spa, type, trust_config); 3206 if (error != 0) 3207 return (error); 3208 3209 /* 3210 * Read vdev labels to find the best uberblock (i.e. latest, unless 3211 * spa_load_max_txg is set) and store it in spa_uberblock. We get the 3212 * list of features required to read blkptrs in the MOS from the vdev 3213 * label with the best uberblock and verify that our version of zfs 3214 * supports them all. 3215 */ 3216 error = spa_ld_select_uberblock(spa, config, type, trust_config); 3217 if (error != 0) 3218 return (error); 3219 3220 /* 3221 * Pass that uberblock to the dsl_pool layer which will open the root 3222 * blkptr. This blkptr points to the latest version of the MOS and will 3223 * allow us to read its contents. 3224 */ 3225 error = spa_ld_open_rootbp(spa); 3226 if (error != 0) 3227 return (error); 3228 3229 /* 3230 * Retrieve the config stored in the MOS and use it to validate the 3231 * config provided. Also extract some information from the MOS config 3232 * to update our vdev tree. 3233 */ 3234 error = spa_ld_validate_config(spa, type); 3235 if (error != 0) 3236 return (error); 3237 3238 /* 3239 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 3240 * from the pool and their contents were re-mapped to other vdevs. Note 3241 * that everything that we read before this step must have been 3242 * rewritten on concrete vdevs after the last device removal was 3243 * initiated. Otherwise we could be reading from indirect vdevs before 3244 * we have loaded their mappings. 3245 */ 3246 error = spa_ld_open_indirect_vdev_metadata(spa); 3247 if (error != 0) 3248 return (error); 3249 3250 /* 3251 * Retrieve the full list of active features from the MOS and check if 3252 * they are all supported. 3253 */ 3254 error = spa_ld_check_features(spa, &missing_feat_write); 3255 if (error != 0) 3256 return (error); 3257 3258 /* 3259 * Load several special directories from the MOS needed by the dsl_pool 3260 * layer. 3261 */ 3262 error = spa_ld_load_special_directories(spa); 3263 if (error != 0) 3264 return (error); 3265 3266 /* 3267 * If the config provided is not trusted, discard it and use the config 3268 * from the MOS to reload the pool. 3269 */ 3270 if (!trust_config) { 3271 error = spa_ld_prepare_for_reload(spa, orig_mode); 3272 if (error != 0) 3273 return (error); 3274 3275 spa_load_note(spa, "RELOADING"); 3276 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 3277 } 3278 3279 /* 3280 * Retrieve pool properties from the MOS. 3281 */ 3282 error = spa_ld_get_props(spa); 3283 if (error != 0) 3284 return (error); 3285 3286 /* 3287 * Retrieve the list of auxiliary devices - cache devices and spares - 3288 * and open them. 3289 */ 3290 error = spa_ld_open_aux_vdevs(spa, type); 3291 if (error != 0) 3292 return (error); 3293 3294 /* 3295 * Load the metadata for all vdevs. Also check if unopenable devices 3296 * should be autoreplaced. 
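 * (Leaf DTLs read here are subsequently propagated up the vdev tree.)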
3297 */ 3298 error = spa_ld_load_vdev_metadata(spa); 3299 if (error != 0) 3300 return (error); 3301 3302 error = spa_ld_load_dedup_tables(spa); 3303 if (error != 0) 3304 return (error); 3305 3306 /* 3307 * Verify the logs now to make sure we don't have any unexpected errors 3308 * when we claim log blocks later. 3309 */ 3310 error = spa_ld_verify_logs(spa, type, ereport); 3311 if (error != 0) 3312 return (error); 3313 3314 if (missing_feat_write) { 3315 ASSERT(state == SPA_LOAD_TRYIMPORT); 3316 3317 /* 3318 * At this point, we know that we can open the pool in 3319 * read-only mode but not read-write mode. We now have enough 3320 * information and can return to userland. 3321 */ 3322 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 3323 ENOTSUP)); 3324 } 3325 3326 /* 3327 * Traverse the last txgs to make sure the pool was left off in a safe 3328 * state. When performing an extreme rewind, we verify the whole pool, 3329 * which can take a very long time. 3330 */ 3331 error = spa_ld_verify_pool_data(spa); 3332 if (error != 0) 3333 return (error); 3334 3335 /* 3336 * Calculate the deflated space for the pool. This must be done before 3337 * we write anything to the pool because we'd need to update the space 3338 * accounting using the deflated sizes. 3339 */ 3340 spa_update_dspace(spa); 3341 3342 /* 3343 * We have now retrieved all the information we needed to open the 3344 * pool. If we are importing the pool in read-write mode, a few 3345 * additional steps must be performed to finish the import. 3346 */ 3347 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 3348 spa->spa_load_max_txg == UINT64_MAX)) { 3349 ASSERT(state != SPA_LOAD_TRYIMPORT); 3350 3351 /* 3352 * We must check this before we start the sync thread, because 3353 * we only want to start a condense thread for condense 3354 * operations that were in progress when the pool was 3355 * imported. Once we start syncing, spa_sync() could 3356 * initiate a condense (and start a thread for it). In 3357 * that case it would be wrong to start a second 3358 * condense thread. 3359 */ 3360 boolean_t condense_in_progress = 3361 (spa->spa_condensing_indirect != NULL); 3362 3363 /* 3364 * Traverse the ZIL and claim all blocks. 3365 */ 3366 spa_ld_claim_log_blocks(spa); 3367 3368 /* 3369 * Kick-off the syncing thread. 3370 */ 3371 spa->spa_sync_on = B_TRUE; 3372 txg_sync_start(spa->spa_dsl_pool); 3373 3374 /* 3375 * Wait for all claims to sync. We sync up to the highest 3376 * claimed log block birth time so that claimed log blocks 3377 * don't appear to be from the future. spa_claim_max_txg 3378 * will have been set for us by ZIL traversal operations 3379 * performed above. 3380 */ 3381 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 3382 3383 /* 3384 * Check if we need to request an update of the config. On the 3385 * next sync, we would update the config stored in vdev labels 3386 * and the cachefile (by default /etc/zfs/zpool.cache). 3387 */ 3388 spa_ld_check_for_config_update(spa, config_cache_txg); 3389 3390 /* 3391 * Check all DTLs to see if anything needs resilvering. 3392 */ 3393 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 3394 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 3395 spa_async_request(spa, SPA_ASYNC_RESILVER); 3396 3397 /* 3398 * Log the fact that we booted up (so that we can detect if 3399 * we rebooted in the middle of an operation). 3400 */ 3401 spa_history_log_version(spa, "open"); 3402 3403 /* 3404 * Delete any inconsistent datasets. 
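 * (These are datasets flagged DS_FLAG_INCONSISTENT, e.g. left behind
 * by an interrupted receive or destroy.)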
3405 */ 3406 (void) dmu_objset_find(spa_name(spa), 3407 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 3408 3409 /* 3410 * Clean up any stale temporary dataset userrefs. 3411 */ 3412 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 3413 3414 /* 3415 * Note: unlike condensing, we don't need an analogous 3416 * "removal_in_progress" dance because no other thread 3417 * can start a removal while we hold the spa_namespace_lock. 3418 */ 3419 spa_restart_removal(spa); 3420 3421 if (condense_in_progress) 3422 spa_condense_indirect_restart(spa); 3423 } 3424 3425 spa_load_note(spa, "LOADED"); 3426 3427 return (0); 3428 } 3429 3430 static int 3431 spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config) 3432 { 3433 int mode = spa->spa_mode; 3434 3435 spa_unload(spa); 3436 spa_deactivate(spa); 3437 3438 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 3439 3440 spa_activate(spa, mode); 3441 spa_async_suspend(spa); 3442 3443 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 3444 (u_longlong_t)spa->spa_load_max_txg); 3445 3446 return (spa_load(spa, state, SPA_IMPORT_EXISTING, trust_config)); 3447 } 3448 3449 /* 3450 * If spa_load() fails this function will try loading prior txg's. If 3451 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 3452 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 3453 * function will not rewind the pool and will return the same error as 3454 * spa_load(). 3455 */ 3456 static int 3457 spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config, 3458 uint64_t max_request, int rewind_flags) 3459 { 3460 nvlist_t *loadinfo = NULL; 3461 nvlist_t *config = NULL; 3462 int load_error, rewind_error; 3463 uint64_t safe_rewind_txg; 3464 uint64_t min_txg; 3465 3466 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 3467 spa->spa_load_max_txg = spa->spa_load_txg; 3468 spa_set_log_state(spa, SPA_LOG_CLEAR); 3469 } else { 3470 spa->spa_load_max_txg = max_request; 3471 if (max_request != UINT64_MAX) 3472 spa->spa_extreme_rewind = B_TRUE; 3473 } 3474 3475 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 3476 trust_config); 3477 if (load_error == 0) 3478 return (0); 3479 3480 if (spa->spa_root_vdev != NULL) 3481 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3482 3483 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 3484 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 3485 3486 if (rewind_flags & ZPOOL_NEVER_REWIND) { 3487 nvlist_free(config); 3488 return (load_error); 3489 } 3490 3491 if (state == SPA_LOAD_RECOVER) { 3492 /* Price of rolling back is discarding txgs, including log */ 3493 spa_set_log_state(spa, SPA_LOG_CLEAR); 3494 } else { 3495 /* 3496 * If we aren't rolling back save the load info from our first 3497 * import attempt so that we can restore it after attempting 3498 * to rewind. 3499 */ 3500 loadinfo = spa->spa_load_info; 3501 spa->spa_load_info = fnvlist_alloc(); 3502 } 3503 3504 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 3505 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 3506 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
3507 TXG_INITIAL : safe_rewind_txg; 3508 3509 /* 3510 * Continue as long as we're finding errors, we're still within 3511 * the acceptable rewind range, and we're still finding uberblocks. 3512 */ 3513 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 3514 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 3515 if (spa->spa_load_max_txg < safe_rewind_txg) 3516 spa->spa_extreme_rewind = B_TRUE; 3517 rewind_error = spa_load_retry(spa, state, trust_config); 3518 } 3519 3520 spa->spa_extreme_rewind = B_FALSE; 3521 spa->spa_load_max_txg = UINT64_MAX; 3522 3523 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 3524 spa_config_set(spa, config); 3525 else 3526 nvlist_free(config); 3527 3528 if (state == SPA_LOAD_RECOVER) { 3529 ASSERT3P(loadinfo, ==, NULL); 3530 return (rewind_error); 3531 } else { 3532 /* Store the rewind info as part of the initial load info */ 3533 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 3534 spa->spa_load_info); 3535 3536 /* Restore the initial load info */ 3537 fnvlist_free(spa->spa_load_info); 3538 spa->spa_load_info = loadinfo; 3539 3540 return (load_error); 3541 } 3542 } 3543 3544 /* 3545 * Pool Open/Import 3546 * 3547 * The import case is identical to an open except that the configuration is sent 3548 * down from userland, instead of grabbed from the configuration cache. For the 3549 * case of an open, the pool configuration will exist in the 3550 * POOL_STATE_UNINITIALIZED state. 3551 * 3552 * The stats information (gen/count/ustats) is used to gather vdev statistics at 3553 * the same time we open the pool, without having to keep around the spa_t in 3554 * some ambiguous state. 3555 */ 3556 static int 3557 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 3558 nvlist_t **config) 3559 { 3560 spa_t *spa; 3561 spa_load_state_t state = SPA_LOAD_OPEN; 3562 int error; 3563 int locked = B_FALSE; 3564 3565 *spapp = NULL; 3566 3567 /* 3568 * As disgusting as this is, we need to support recursive calls to this 3569 * function because dsl_dir_open() is called during spa_load(), and ends 3570 * up calling spa_open() again. The real fix is to figure out how to 3571 * avoid dsl_dir_open() calling this in the first place. 3572 */ 3573 if (mutex_owner(&spa_namespace_lock) != curthread) { 3574 mutex_enter(&spa_namespace_lock); 3575 locked = B_TRUE; 3576 } 3577 3578 if ((spa = spa_lookup(pool)) == NULL) { 3579 if (locked) 3580 mutex_exit(&spa_namespace_lock); 3581 return (SET_ERROR(ENOENT)); 3582 } 3583 3584 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3585 zpool_rewind_policy_t policy; 3586 3587 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 3588 &policy); 3589 if (policy.zrp_request & ZPOOL_DO_REWIND) 3590 state = SPA_LOAD_RECOVER; 3591 3592 spa_activate(spa, spa_mode_global); 3593 3594 if (state != SPA_LOAD_RECOVER) 3595 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3596 3597 zfs_dbgmsg("spa_open_common: opening %s", pool); 3598 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3599 policy.zrp_request); 3600 3601 if (error == EBADF) { 3602 /* 3603 * If vdev_validate() returns failure (indicated by 3604 * EBADF), it means that one of the vdevs indicates 3605 * that the pool has been exported or destroyed. If 3606 * this is the case, the config cache is out of sync and 3607 * we should remove the pool from the namespace.
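 * (spa_write_cachefile() below rewrites the cache file without this
 * pool before the spa_t itself is removed.)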
3608 */ 3609 spa_unload(spa); 3610 spa_deactivate(spa); 3611 spa_write_cachefile(spa, B_TRUE, B_TRUE); 3612 spa_remove(spa); 3613 if (locked) 3614 mutex_exit(&spa_namespace_lock); 3615 return (SET_ERROR(ENOENT)); 3616 } 3617 3618 if (error) { 3619 /* 3620 * We can't open the pool, but we still have useful 3621 * information: the state of each vdev after the 3622 * attempted vdev_open(). Return this to the user. 3623 */ 3624 if (config != NULL && spa->spa_config) { 3625 VERIFY(nvlist_dup(spa->spa_config, config, 3626 KM_SLEEP) == 0); 3627 VERIFY(nvlist_add_nvlist(*config, 3628 ZPOOL_CONFIG_LOAD_INFO, 3629 spa->spa_load_info) == 0); 3630 } 3631 spa_unload(spa); 3632 spa_deactivate(spa); 3633 spa->spa_last_open_failed = error; 3634 if (locked) 3635 mutex_exit(&spa_namespace_lock); 3636 *spapp = NULL; 3637 return (error); 3638 } 3639 } 3640 3641 spa_open_ref(spa, tag); 3642 3643 if (config != NULL) 3644 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3645 3646 /* 3647 * If we've recovered the pool, pass back any information we 3648 * gathered while doing the load. 3649 */ 3650 if (state == SPA_LOAD_RECOVER) { 3651 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3652 spa->spa_load_info) == 0); 3653 } 3654 3655 if (locked) { 3656 spa->spa_last_open_failed = 0; 3657 spa->spa_last_ubsync_txg = 0; 3658 spa->spa_load_txg = 0; 3659 mutex_exit(&spa_namespace_lock); 3660 } 3661 3662 *spapp = spa; 3663 3664 return (0); 3665 } 3666 3667 int 3668 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3669 nvlist_t **config) 3670 { 3671 return (spa_open_common(name, spapp, tag, policy, config)); 3672 } 3673 3674 int 3675 spa_open(const char *name, spa_t **spapp, void *tag) 3676 { 3677 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3678 } 3679 3680 /* 3681 * Lookup the given spa_t, incrementing the inject count in the process, 3682 * preventing it from being exported or destroyed. 3683 */ 3684 spa_t * 3685 spa_inject_addref(char *name) 3686 { 3687 spa_t *spa; 3688 3689 mutex_enter(&spa_namespace_lock); 3690 if ((spa = spa_lookup(name)) == NULL) { 3691 mutex_exit(&spa_namespace_lock); 3692 return (NULL); 3693 } 3694 spa->spa_inject_ref++; 3695 mutex_exit(&spa_namespace_lock); 3696 3697 return (spa); 3698 } 3699 3700 void 3701 spa_inject_delref(spa_t *spa) 3702 { 3703 mutex_enter(&spa_namespace_lock); 3704 spa->spa_inject_ref--; 3705 mutex_exit(&spa_namespace_lock); 3706 } 3707 3708 /* 3709 * Add spares device information to the nvlist. 3710 */ 3711 static void 3712 spa_add_spares(spa_t *spa, nvlist_t *config) 3713 { 3714 nvlist_t **spares; 3715 uint_t i, nspares; 3716 nvlist_t *nvroot; 3717 uint64_t guid; 3718 vdev_stat_t *vs; 3719 uint_t vsc; 3720 uint64_t pool; 3721 3722 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3723 3724 if (spa->spa_spares.sav_count == 0) 3725 return; 3726 3727 VERIFY(nvlist_lookup_nvlist(config, 3728 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3729 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3730 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3731 if (nspares != 0) { 3732 VERIFY(nvlist_add_nvlist_array(nvroot, 3733 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3734 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3735 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3736 3737 /* 3738 * Go through and find any spares which have since been 3739 * repurposed as an active spare. If this is the case, update 3740 * their status appropriately. 
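 * "Appropriately" means reporting VDEV_STATE_CANT_OPEN with
 * VDEV_AUX_SPARED, so the device shows up as in use by another pool.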
3741 */ 3742 for (i = 0; i < nspares; i++) { 3743 VERIFY(nvlist_lookup_uint64(spares[i], 3744 ZPOOL_CONFIG_GUID, &guid) == 0); 3745 if (spa_spare_exists(guid, &pool, NULL) && 3746 pool != 0ULL) { 3747 VERIFY(nvlist_lookup_uint64_array( 3748 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3749 (uint64_t **)&vs, &vsc) == 0); 3750 vs->vs_state = VDEV_STATE_CANT_OPEN; 3751 vs->vs_aux = VDEV_AUX_SPARED; 3752 } 3753 } 3754 } 3755 } 3756 3757 /* 3758 * Add l2cache device information to the nvlist, including vdev stats. 3759 */ 3760 static void 3761 spa_add_l2cache(spa_t *spa, nvlist_t *config) 3762 { 3763 nvlist_t **l2cache; 3764 uint_t i, j, nl2cache; 3765 nvlist_t *nvroot; 3766 uint64_t guid; 3767 vdev_t *vd; 3768 vdev_stat_t *vs; 3769 uint_t vsc; 3770 3771 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3772 3773 if (spa->spa_l2cache.sav_count == 0) 3774 return; 3775 3776 VERIFY(nvlist_lookup_nvlist(config, 3777 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3778 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3779 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3780 if (nl2cache != 0) { 3781 VERIFY(nvlist_add_nvlist_array(nvroot, 3782 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3783 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3784 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3785 3786 /* 3787 * Update level 2 cache device stats. 3788 */ 3789 3790 for (i = 0; i < nl2cache; i++) { 3791 VERIFY(nvlist_lookup_uint64(l2cache[i], 3792 ZPOOL_CONFIG_GUID, &guid) == 0); 3793 3794 vd = NULL; 3795 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3796 if (guid == 3797 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3798 vd = spa->spa_l2cache.sav_vdevs[j]; 3799 break; 3800 } 3801 } 3802 ASSERT(vd != NULL); 3803 3804 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3805 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3806 == 0); 3807 vdev_get_stats(vd, vs); 3808 } 3809 } 3810 } 3811 3812 static void 3813 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3814 { 3815 nvlist_t *features; 3816 zap_cursor_t zc; 3817 zap_attribute_t za; 3818 3819 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3820 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3821 3822 if (spa->spa_feat_for_read_obj != 0) { 3823 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3824 spa->spa_feat_for_read_obj); 3825 zap_cursor_retrieve(&zc, &za) == 0; 3826 zap_cursor_advance(&zc)) { 3827 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3828 za.za_num_integers == 1); 3829 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3830 za.za_first_integer)); 3831 } 3832 zap_cursor_fini(&zc); 3833 } 3834 3835 if (spa->spa_feat_for_write_obj != 0) { 3836 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3837 spa->spa_feat_for_write_obj); 3838 zap_cursor_retrieve(&zc, &za) == 0; 3839 zap_cursor_advance(&zc)) { 3840 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3841 za.za_num_integers == 1); 3842 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3843 za.za_first_integer)); 3844 } 3845 zap_cursor_fini(&zc); 3846 } 3847 3848 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3849 features) == 0); 3850 nvlist_free(features); 3851 } 3852 3853 int 3854 spa_get_stats(const char *name, nvlist_t **config, 3855 char *altroot, size_t buflen) 3856 { 3857 int error; 3858 spa_t *spa; 3859 3860 *config = NULL; 3861 error = spa_open_common(name, &spa, FTAG, NULL, config); 3862 3863 if (spa != NULL) { 3864 /* 3865 * This still leaves a window of inconsistency where the spares 3866 * or l2cache devices could change and 
the config would be 3867 * self-inconsistent. 3868 */ 3869 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3870 3871 if (*config != NULL) { 3872 uint64_t loadtimes[2]; 3873 3874 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3875 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3876 VERIFY(nvlist_add_uint64_array(*config, 3877 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3878 3879 VERIFY(nvlist_add_uint64(*config, 3880 ZPOOL_CONFIG_ERRCOUNT, 3881 spa_get_errlog_size(spa)) == 0); 3882 3883 if (spa_suspended(spa)) 3884 VERIFY(nvlist_add_uint64(*config, 3885 ZPOOL_CONFIG_SUSPENDED, 3886 spa->spa_failmode) == 0); 3887 3888 spa_add_spares(spa, *config); 3889 spa_add_l2cache(spa, *config); 3890 spa_add_feature_stats(spa, *config); 3891 } 3892 } 3893 3894 /* 3895 * We want to get the alternate root even for faulted pools, so we cheat 3896 * and call spa_lookup() directly. 3897 */ 3898 if (altroot) { 3899 if (spa == NULL) { 3900 mutex_enter(&spa_namespace_lock); 3901 spa = spa_lookup(name); 3902 if (spa) 3903 spa_altroot(spa, altroot, buflen); 3904 else 3905 altroot[0] = '\0'; 3906 spa = NULL; 3907 mutex_exit(&spa_namespace_lock); 3908 } else { 3909 spa_altroot(spa, altroot, buflen); 3910 } 3911 } 3912 3913 if (spa != NULL) { 3914 spa_config_exit(spa, SCL_CONFIG, FTAG); 3915 spa_close(spa, FTAG); 3916 } 3917 3918 return (error); 3919 } 3920 3921 /* 3922 * Validate that the auxiliary device array is well formed. We must have an 3923 * array of nvlists, each of which describes a valid leaf vdev. If this is an 3924 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3925 * specified, as long as they are well-formed. 3926 */ 3927 static int 3928 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3929 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3930 vdev_labeltype_t label) 3931 { 3932 nvlist_t **dev; 3933 uint_t i, ndev; 3934 vdev_t *vd; 3935 int error; 3936 3937 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3938 3939 /* 3940 * It's acceptable to have no devs specified. 3941 */ 3942 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3943 return (0); 3944 3945 if (ndev == 0) 3946 return (SET_ERROR(EINVAL)); 3947 3948 /* 3949 * Make sure the pool is formatted with a version that supports this 3950 * device type. 3951 */ 3952 if (spa_version(spa) < version) 3953 return (SET_ERROR(ENOTSUP)); 3954 3955 /* 3956 * Set the pending device list so we correctly handle device in-use 3957 * checking. 3958 */ 3959 sav->sav_pending = dev; 3960 sav->sav_npending = ndev; 3961 3962 for (i = 0; i < ndev; i++) { 3963 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3964 mode)) != 0) 3965 goto out; 3966 3967 if (!vd->vdev_ops->vdev_op_leaf) { 3968 vdev_free(vd); 3969 error = SET_ERROR(EINVAL); 3970 goto out; 3971 } 3972 3973 /* 3974 * The L2ARC currently only supports disk devices in 3975 * kernel context. For user-level testing, we allow it.
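 * (ztest, for example, backs cache devices with files, which would
 * fail the VDEV_TYPE_DISK check below.)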
3976 */ 3977 #ifdef _KERNEL 3978 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3979 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3980 error = SET_ERROR(ENOTBLK); 3981 vdev_free(vd); 3982 goto out; 3983 } 3984 #endif 3985 vd->vdev_top = vd; 3986 3987 if ((error = vdev_open(vd)) == 0 && 3988 (error = vdev_label_init(vd, crtxg, label)) == 0) { 3989 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3990 vd->vdev_guid) == 0); 3991 } 3992 3993 vdev_free(vd); 3994 3995 if (error && 3996 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3997 goto out; 3998 else 3999 error = 0; 4000 } 4001 4002 out: 4003 sav->sav_pending = NULL; 4004 sav->sav_npending = 0; 4005 return (error); 4006 } 4007 4008 static int 4009 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 4010 { 4011 int error; 4012 4013 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4014 4015 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 4016 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 4017 VDEV_LABEL_SPARE)) != 0) { 4018 return (error); 4019 } 4020 4021 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 4022 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 4023 VDEV_LABEL_L2CACHE)); 4024 } 4025 4026 static void 4027 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 4028 const char *config) 4029 { 4030 int i; 4031 4032 if (sav->sav_config != NULL) { 4033 nvlist_t **olddevs; 4034 uint_t oldndevs; 4035 nvlist_t **newdevs; 4036 4037 /* 4038 * Generate new dev list by concatenating with the 4039 * current dev list. 4040 */ 4041 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 4042 &olddevs, &oldndevs) == 0); 4043 4044 newdevs = kmem_alloc(sizeof (void *) * 4045 (ndevs + oldndevs), KM_SLEEP); 4046 for (i = 0; i < oldndevs; i++) 4047 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 4048 KM_SLEEP) == 0); 4049 for (i = 0; i < ndevs; i++) 4050 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 4051 KM_SLEEP) == 0); 4052 4053 VERIFY(nvlist_remove(sav->sav_config, config, 4054 DATA_TYPE_NVLIST_ARRAY) == 0); 4055 4056 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 4057 config, newdevs, ndevs + oldndevs) == 0); 4058 for (i = 0; i < oldndevs + ndevs; i++) 4059 nvlist_free(newdevs[i]); 4060 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 4061 } else { 4062 /* 4063 * Generate a new dev list. 4064 */ 4065 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 4066 KM_SLEEP) == 0); 4067 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 4068 devs, ndevs) == 0); 4069 } 4070 } 4071 4072 /* 4073 * Stop and drop level 2 ARC devices 4074 */ 4075 void 4076 spa_l2cache_drop(spa_t *spa) 4077 { 4078 vdev_t *vd; 4079 int i; 4080 spa_aux_vdev_t *sav = &spa->spa_l2cache; 4081 4082 for (i = 0; i < sav->sav_count; i++) { 4083 uint64_t pool; 4084 4085 vd = sav->sav_vdevs[i]; 4086 ASSERT(vd != NULL); 4087 4088 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 4089 pool != 0ULL && l2arc_vdev_present(vd)) 4090 l2arc_remove_vdev(vd); 4091 } 4092 } 4093 4094 /* 4095 * Pool Creation 4096 */ 4097 int 4098 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 4099 nvlist_t *zplprops) 4100 { 4101 spa_t *spa; 4102 char *altroot = NULL; 4103 vdev_t *rvd; 4104 dsl_pool_t *dp; 4105 dmu_tx_t *tx; 4106 int error = 0; 4107 uint64_t txg = TXG_INITIAL; 4108 nvlist_t **spares, **l2cache; 4109 uint_t nspares, nl2cache; 4110 uint64_t version, obj; 4111 boolean_t has_features; 4112 4113 /* 4114 * If this pool already exists, return failure.
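* The lookup is done with spa_namespace_lock held, so a concurrent create or import of the same name cannot slip past this check.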
4115 */ 4116 mutex_enter(&spa_namespace_lock); 4117 if (spa_lookup(pool) != NULL) { 4118 mutex_exit(&spa_namespace_lock); 4119 return (SET_ERROR(EEXIST)); 4120 } 4121 4122 /* 4123 * Allocate a new spa_t structure. 4124 */ 4125 (void) nvlist_lookup_string(props, 4126 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4127 spa = spa_add(pool, NULL, altroot); 4128 spa_activate(spa, spa_mode_global); 4129 4130 if (props && (error = spa_prop_validate(spa, props))) { 4131 spa_deactivate(spa); 4132 spa_remove(spa); 4133 mutex_exit(&spa_namespace_lock); 4134 return (error); 4135 } 4136 4137 has_features = B_FALSE; 4138 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 4139 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 4140 if (zpool_prop_feature(nvpair_name(elem))) 4141 has_features = B_TRUE; 4142 } 4143 4144 if (has_features || nvlist_lookup_uint64(props, 4145 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 4146 version = SPA_VERSION; 4147 } 4148 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 4149 4150 spa->spa_first_txg = txg; 4151 spa->spa_uberblock.ub_txg = txg - 1; 4152 spa->spa_uberblock.ub_version = version; 4153 spa->spa_ubsync = spa->spa_uberblock; 4154 spa->spa_load_state = SPA_LOAD_CREATE; 4155 spa->spa_removing_phys.sr_state = DSS_NONE; 4156 spa->spa_removing_phys.sr_removing_vdev = -1; 4157 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 4158 4159 /* 4160 * Create "The Godfather" zio to hold all async IOs 4161 */ 4162 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 4163 KM_SLEEP); 4164 for (int i = 0; i < max_ncpus; i++) { 4165 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 4166 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 4167 ZIO_FLAG_GODFATHER); 4168 } 4169 4170 /* 4171 * Create the root vdev. 4172 */ 4173 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4174 4175 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 4176 4177 ASSERT(error != 0 || rvd != NULL); 4178 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 4179 4180 if (error == 0 && !zfs_allocatable_devs(nvroot)) 4181 error = SET_ERROR(EINVAL); 4182 4183 if (error == 0 && 4184 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 4185 (error = spa_validate_aux(spa, nvroot, txg, 4186 VDEV_ALLOC_ADD)) == 0) { 4187 for (int c = 0; c < rvd->vdev_children; c++) { 4188 vdev_metaslab_set_size(rvd->vdev_child[c]); 4189 vdev_expand(rvd->vdev_child[c], txg); 4190 } 4191 } 4192 4193 spa_config_exit(spa, SCL_ALL, FTAG); 4194 4195 if (error != 0) { 4196 spa_unload(spa); 4197 spa_deactivate(spa); 4198 spa_remove(spa); 4199 mutex_exit(&spa_namespace_lock); 4200 return (error); 4201 } 4202 4203 /* 4204 * Get the list of spares, if specified. 4205 */ 4206 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4207 &spares, &nspares) == 0) { 4208 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 4209 KM_SLEEP) == 0); 4210 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4211 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4212 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4213 spa_load_spares(spa); 4214 spa_config_exit(spa, SCL_ALL, FTAG); 4215 spa->spa_spares.sav_sync = B_TRUE; 4216 } 4217 4218 /* 4219 * Get the list of level 2 cache devices, if specified. 
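* As with the spares above, the devices land in sav_config and sav_sync is set so the list is written out during the next sync.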
4220 */ 4221 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4222 &l2cache, &nl2cache) == 0) { 4223 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4224 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4225 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4226 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4227 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4228 spa_load_l2cache(spa); 4229 spa_config_exit(spa, SCL_ALL, FTAG); 4230 spa->spa_l2cache.sav_sync = B_TRUE; 4231 } 4232 4233 spa->spa_is_initializing = B_TRUE; 4234 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 4235 spa->spa_meta_objset = dp->dp_meta_objset; 4236 spa->spa_is_initializing = B_FALSE; 4237 4238 /* 4239 * Create DDTs (dedup tables). 4240 */ 4241 ddt_create(spa); 4242 4243 spa_update_dspace(spa); 4244 4245 tx = dmu_tx_create_assigned(dp, txg); 4246 4247 /* 4248 * Create the pool config object. 4249 */ 4250 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 4251 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 4252 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 4253 4254 if (zap_add(spa->spa_meta_objset, 4255 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 4256 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 4257 cmn_err(CE_PANIC, "failed to add pool config"); 4258 } 4259 4260 if (spa_version(spa) >= SPA_VERSION_FEATURES) 4261 spa_feature_create_zap_objects(spa, tx); 4262 4263 if (zap_add(spa->spa_meta_objset, 4264 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 4265 sizeof (uint64_t), 1, &version, tx) != 0) { 4266 cmn_err(CE_PANIC, "failed to add pool version"); 4267 } 4268 4269 /* Newly created pools with the right version are always deflated. */ 4270 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 4271 spa->spa_deflate = TRUE; 4272 if (zap_add(spa->spa_meta_objset, 4273 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4274 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 4275 cmn_err(CE_PANIC, "failed to add deflate"); 4276 } 4277 } 4278 4279 /* 4280 * Create the deferred-free bpobj. Turn off compression 4281 * because sync-to-convergence takes longer if the blocksize 4282 * keeps changing. 4283 */ 4284 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 4285 dmu_object_set_compress(spa->spa_meta_objset, obj, 4286 ZIO_COMPRESS_OFF, tx); 4287 if (zap_add(spa->spa_meta_objset, 4288 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 4289 sizeof (uint64_t), 1, &obj, tx) != 0) { 4290 cmn_err(CE_PANIC, "failed to add bpobj"); 4291 } 4292 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 4293 spa->spa_meta_objset, obj)); 4294 4295 /* 4296 * Create the pool's history object. 4297 */ 4298 if (version >= SPA_VERSION_ZPOOL_HISTORY) 4299 spa_history_create_obj(spa, tx); 4300 4301 /* 4302 * Generate some random noise for salted checksums to operate on. 4303 */ 4304 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4305 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4306 4307 /* 4308 * Set pool properties. 
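* These begin as the zpool property defaults; any props the caller supplied were validated by spa_prop_validate() above and are applied via spa_sync_props() below.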
4309 */ 4310 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 4311 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4312 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 4313 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 4314 4315 if (props != NULL) { 4316 spa_configfile_set(spa, props, B_FALSE); 4317 spa_sync_props(props, tx); 4318 } 4319 4320 dmu_tx_commit(tx); 4321 4322 spa->spa_sync_on = B_TRUE; 4323 txg_sync_start(spa->spa_dsl_pool); 4324 4325 /* 4326 * We explicitly wait for the first transaction to complete so that our 4327 * bean counters are appropriately updated. 4328 */ 4329 txg_wait_synced(spa->spa_dsl_pool, txg); 4330 4331 spa_write_cachefile(spa, B_FALSE, B_TRUE); 4332 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 4333 4334 spa_history_log_version(spa, "create"); 4335 4336 /* 4337 * Don't count references from objsets that are already closed 4338 * and are making their way through the eviction process. 4339 */ 4340 spa_evicting_os_wait(spa); 4341 spa->spa_minref = refcount_count(&spa->spa_refcount); 4342 spa->spa_load_state = SPA_LOAD_NONE; 4343 4344 mutex_exit(&spa_namespace_lock); 4345 4346 return (0); 4347 } 4348 4349 #ifdef _KERNEL 4350 /* 4351 * Get the root pool information from the root disk, then import the root pool 4352 * at system boot time. 4353 */ 4354 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 4355 4356 static nvlist_t * 4357 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 4358 { 4359 nvlist_t *config; 4360 nvlist_t *nvtop, *nvroot; 4361 uint64_t pgid; 4362 4363 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 4364 return (NULL); 4365 4366 /* 4367 * Add this top-level vdev to the child array. 4368 */ 4369 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4370 &nvtop) == 0); 4371 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4372 &pgid) == 0); 4373 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 4374 4375 /* 4376 * Put this pool's top-level vdevs into a root vdev. 4377 */ 4378 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4379 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4380 VDEV_TYPE_ROOT) == 0); 4381 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4382 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4383 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4384 &nvtop, 1) == 0); 4385 4386 /* 4387 * Replace the existing vdev_tree with the new root vdev in 4388 * this pool's configuration (remove the old, add the new). 4389 */ 4390 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4391 nvlist_free(nvroot); 4392 return (config); 4393 } 4394 4395 /* 4396 * Walk the vdev tree and see if we can find a device with "better" 4397 * configuration. A configuration is "better" if the label on that 4398 * device has a more recent txg. 4399 */ 4400 static void 4401 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 4402 { 4403 for (int c = 0; c < vd->vdev_children; c++) 4404 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 4405 4406 if (vd->vdev_ops->vdev_op_leaf) { 4407 nvlist_t *label; 4408 uint64_t label_txg; 4409 4410 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 4411 &label) != 0) 4412 return; 4413 4414 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 4415 &label_txg) == 0); 4416 4417 /* 4418 * Do we have a better boot device?
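* That is, does this leaf's label record a newer txg than the best candidate seen so far?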
4419 */ 4420 if (label_txg > *txg) { 4421 *txg = label_txg; 4422 *avd = vd; 4423 } 4424 nvlist_free(label); 4425 } 4426 } 4427 4428 /* 4429 * Import a root pool. 4430 * 4431 * For x86, devpath_list will consist of devid and/or physpath name of 4432 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 4433 * The GRUB "findroot" command will return the vdev we should boot. 4434 * 4435 * For Sparc, devpath_list consists of the physpath name of the booting device, 4436 * whether the root pool is a single-device pool or a mirrored pool. 4437 * e.g. 4438 * "/pci@1f,0/ide@d/disk@0,0:a" 4439 */ 4440 int 4441 spa_import_rootpool(char *devpath, char *devid) 4442 { 4443 spa_t *spa; 4444 vdev_t *rvd, *bvd, *avd = NULL; 4445 nvlist_t *config, *nvtop; 4446 uint64_t guid, txg; 4447 char *pname; 4448 int error; 4449 4450 /* 4451 * Read the label from the boot device and generate a configuration. 4452 */ 4453 config = spa_generate_rootconf(devpath, devid, &guid); 4454 #if defined(_OBP) && defined(_KERNEL) 4455 if (config == NULL) { 4456 if (strstr(devpath, "/iscsi/ssd") != NULL) { 4457 /* iscsi boot */ 4458 get_iscsi_bootpath_phy(devpath); 4459 config = spa_generate_rootconf(devpath, devid, &guid); 4460 } 4461 } 4462 #endif 4463 if (config == NULL) { 4464 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 4465 devpath); 4466 return (SET_ERROR(EIO)); 4467 } 4468 4469 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4470 &pname) == 0); 4471 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 4472 4473 mutex_enter(&spa_namespace_lock); 4474 if ((spa = spa_lookup(pname)) != NULL) { 4475 /* 4476 * Remove the existing root pool from the namespace so that we 4477 * can replace it with the correct config we just read in. 4478 */ 4479 spa_remove(spa); 4480 } 4481 4482 spa = spa_add(pname, config, NULL); 4483 spa->spa_is_root = B_TRUE; 4484 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4485 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4486 &spa->spa_ubsync.ub_version) != 0) 4487 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4488 4489 /* 4490 * Build up a vdev tree based on the boot device's label config. 4491 */ 4492 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4493 &nvtop) == 0); 4494 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4495 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4496 VDEV_ALLOC_ROOTPOOL); 4497 spa_config_exit(spa, SCL_ALL, FTAG); 4498 if (error) { 4499 mutex_exit(&spa_namespace_lock); 4500 nvlist_free(config); 4501 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4502 pname); 4503 return (error); 4504 } 4505 4506 /* 4507 * Get the boot vdev. 4508 */ 4509 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 4510 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 4511 (u_longlong_t)guid); 4512 error = SET_ERROR(ENOENT); 4513 goto out; 4514 } 4515 4516 /* 4517 * Determine if there is a better boot device. 4518 */ 4519 avd = bvd; 4520 spa_alt_rootvdev(rvd, &avd, &txg); 4521 if (avd != bvd) { 4522 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 4523 "try booting from '%s'", avd->vdev_path); 4524 error = SET_ERROR(EINVAL); 4525 goto out; 4526 } 4527 4528 /* 4529 * If the boot device is part of a spare vdev then ensure that 4530 * we're booting off the active spare. 4531 */ 4532 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 4533 !bvd->vdev_isspare) { 4534 cmn_err(CE_NOTE, "The boot device is currently spared.
Please " 4535 "try booting from '%s'", 4536 bvd->vdev_parent-> 4537 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 4538 error = SET_ERROR(EINVAL); 4539 goto out; 4540 } 4541 4542 error = 0; 4543 out: 4544 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4545 vdev_free(rvd); 4546 spa_config_exit(spa, SCL_ALL, FTAG); 4547 mutex_exit(&spa_namespace_lock); 4548 4549 nvlist_free(config); 4550 return (error); 4551 } 4552 4553 #endif 4554 4555 /* 4556 * Import a non-root pool into the system. 4557 */ 4558 int 4559 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4560 { 4561 spa_t *spa; 4562 char *altroot = NULL; 4563 spa_load_state_t state = SPA_LOAD_IMPORT; 4564 zpool_rewind_policy_t policy; 4565 uint64_t mode = spa_mode_global; 4566 uint64_t readonly = B_FALSE; 4567 int error; 4568 nvlist_t *nvroot; 4569 nvlist_t **spares, **l2cache; 4570 uint_t nspares, nl2cache; 4571 4572 /* 4573 * If a pool with this name exists, return failure. 4574 */ 4575 mutex_enter(&spa_namespace_lock); 4576 if (spa_lookup(pool) != NULL) { 4577 mutex_exit(&spa_namespace_lock); 4578 return (SET_ERROR(EEXIST)); 4579 } 4580 4581 /* 4582 * Create and initialize the spa structure. 4583 */ 4584 (void) nvlist_lookup_string(props, 4585 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4586 (void) nvlist_lookup_uint64(props, 4587 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4588 if (readonly) 4589 mode = FREAD; 4590 spa = spa_add(pool, config, altroot); 4591 spa->spa_import_flags = flags; 4592 4593 /* 4594 * Verbatim import - Take a pool and insert it into the namespace 4595 * as if it had been loaded at boot. 4596 */ 4597 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4598 if (props != NULL) 4599 spa_configfile_set(spa, props, B_FALSE); 4600 4601 spa_write_cachefile(spa, B_FALSE, B_TRUE); 4602 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 4603 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 4604 mutex_exit(&spa_namespace_lock); 4605 return (0); 4606 } 4607 4608 spa_activate(spa, mode); 4609 4610 /* 4611 * Don't start async tasks until we know everything is healthy. 4612 */ 4613 spa_async_suspend(spa); 4614 4615 zpool_get_rewind_policy(config, &policy); 4616 if (policy.zrp_request & ZPOOL_DO_REWIND) 4617 state = SPA_LOAD_RECOVER; 4618 4619 /* 4620 * Pass off the heavy lifting to spa_load(). Pass TRUE for trust_config 4621 * because the user-supplied config is actually the one to trust when 4622 * doing an import. 4623 */ 4624 if (state != SPA_LOAD_RECOVER) 4625 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4626 4627 zfs_dbgmsg("spa_import: importing %s%s", pool, 4628 (state == SPA_LOAD_RECOVER) ? " (RECOVERY MODE)" : ""); 4629 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4630 policy.zrp_request); 4631 4632 /* 4633 * Propagate anything learned while loading the pool and pass it 4634 * back to caller (i.e. rewind info, missing devices, etc). 4635 */ 4636 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4637 spa->spa_load_info) == 0); 4638 4639 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4640 /* 4641 * Toss any existing sparelist, as it doesn't have any validity 4642 * anymore, and conflicts with spa_has_spare(). 
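* spa_load_spares() and spa_load_l2cache() below rebuild the in-core aux lists from the pool state that was just loaded.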
4643 */ 4644 if (spa->spa_spares.sav_config) { 4645 nvlist_free(spa->spa_spares.sav_config); 4646 spa->spa_spares.sav_config = NULL; 4647 spa_load_spares(spa); 4648 } 4649 if (spa->spa_l2cache.sav_config) { 4650 nvlist_free(spa->spa_l2cache.sav_config); 4651 spa->spa_l2cache.sav_config = NULL; 4652 spa_load_l2cache(spa); 4653 } 4654 4655 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4656 &nvroot) == 0); 4657 if (error == 0) 4658 error = spa_validate_aux(spa, nvroot, -1ULL, 4659 VDEV_ALLOC_SPARE); 4660 if (error == 0) 4661 error = spa_validate_aux(spa, nvroot, -1ULL, 4662 VDEV_ALLOC_L2CACHE); 4663 spa_config_exit(spa, SCL_ALL, FTAG); 4664 4665 if (props != NULL) 4666 spa_configfile_set(spa, props, B_FALSE); 4667 4668 if (error != 0 || (props && spa_writeable(spa) && 4669 (error = spa_prop_set(spa, props)))) { 4670 spa_unload(spa); 4671 spa_deactivate(spa); 4672 spa_remove(spa); 4673 mutex_exit(&spa_namespace_lock); 4674 return (error); 4675 } 4676 4677 spa_async_resume(spa); 4678 4679 /* 4680 * Override any spares and level 2 cache devices as specified by 4681 * the user, as these may have correct device names/devids, etc. 4682 */ 4683 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4684 &spares, &nspares) == 0) { 4685 if (spa->spa_spares.sav_config) 4686 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4687 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4688 else 4689 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4690 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4691 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4692 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4693 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4694 spa_load_spares(spa); 4695 spa_config_exit(spa, SCL_ALL, FTAG); 4696 spa->spa_spares.sav_sync = B_TRUE; 4697 } 4698 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4699 &l2cache, &nl2cache) == 0) { 4700 if (spa->spa_l2cache.sav_config) 4701 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4702 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4703 else 4704 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4705 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4706 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4707 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4708 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4709 spa_load_l2cache(spa); 4710 spa_config_exit(spa, SCL_ALL, FTAG); 4711 spa->spa_l2cache.sav_sync = B_TRUE; 4712 } 4713 4714 /* 4715 * Check for any removed devices. 4716 */ 4717 if (spa->spa_autoreplace) { 4718 spa_aux_check_removed(&spa->spa_spares); 4719 spa_aux_check_removed(&spa->spa_l2cache); 4720 } 4721 4722 if (spa_writeable(spa)) { 4723 /* 4724 * Update the config cache to include the newly-imported pool. 4725 */ 4726 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4727 } 4728 4729 /* 4730 * It's possible that the pool was expanded while it was exported. 4731 * We kick off an async task to handle this for us. 
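* (A pool is "expanded" when an underlying LUN has grown; the autoexpand property governs whether the new space is picked up automatically.)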
4732 */ 4733 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4734 4735 spa_history_log_version(spa, "import"); 4736 4737 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 4738 4739 mutex_exit(&spa_namespace_lock); 4740 4741 return (0); 4742 } 4743 4744 nvlist_t * 4745 spa_tryimport(nvlist_t *tryconfig) 4746 { 4747 nvlist_t *config = NULL; 4748 char *poolname; 4749 spa_t *spa; 4750 uint64_t state; 4751 int error; 4752 4753 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4754 return (NULL); 4755 4756 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4757 return (NULL); 4758 4759 /* 4760 * Create and initialize the spa structure. 4761 */ 4762 mutex_enter(&spa_namespace_lock); 4763 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4764 spa_activate(spa, FREAD); 4765 4766 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 4767 4768 /* 4769 * Pass off the heavy lifting to spa_load(). 4770 * Pass TRUE for trust_config because the user-supplied config 4771 * is actually the one to trust when doing an import. 4772 */ 4773 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4774 4775 /* 4776 * If 'tryconfig' was at least parsable, return the current config. 4777 */ 4778 if (spa->spa_root_vdev != NULL) { 4779 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4780 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4781 poolname) == 0); 4782 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4783 state) == 0); 4784 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4785 spa->spa_uberblock.ub_timestamp) == 0); 4786 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4787 spa->spa_load_info) == 0); 4788 4789 /* 4790 * If the bootfs property exists on this pool then we 4791 * copy it out so that external consumers can tell which 4792 * pools are bootable. 4793 */ 4794 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4795 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4796 4797 /* 4798 * We have to play games with the name since the 4799 * pool was opened as TRYIMPORT_NAME. 4800 */ 4801 if (dsl_dsobj_to_dsname(spa_name(spa), 4802 spa->spa_bootfs, tmpname) == 0) { 4803 char *cp; 4804 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4805 4806 cp = strchr(tmpname, '/'); 4807 if (cp == NULL) { 4808 (void) strlcpy(dsname, tmpname, 4809 MAXPATHLEN); 4810 } else { 4811 (void) snprintf(dsname, MAXPATHLEN, 4812 "%s/%s", poolname, ++cp); 4813 } 4814 VERIFY(nvlist_add_string(config, 4815 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4816 kmem_free(dsname, MAXPATHLEN); 4817 } 4818 kmem_free(tmpname, MAXPATHLEN); 4819 } 4820 4821 /* 4822 * Add the list of hot spares and level 2 cache devices. 4823 */ 4824 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4825 spa_add_spares(spa, config); 4826 spa_add_l2cache(spa, config); 4827 spa_config_exit(spa, SCL_CONFIG, FTAG); 4828 } 4829 4830 spa_unload(spa); 4831 spa_deactivate(spa); 4832 spa_remove(spa); 4833 mutex_exit(&spa_namespace_lock); 4834 4835 return (config); 4836 } 4837 4838 /* 4839 * Pool export/destroy 4840 * 4841 * The act of destroying or exporting a pool is very simple. We make sure there 4842 * is no more pending I/O and any references to the pool are gone. Then, we 4843 * update the pool state and sync all the labels to disk, removing the 4844 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4845 * we don't sync the labels or remove the configuration cache. 
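* The wrappers below select new_state: spa_destroy() passes POOL_STATE_DESTROYED, spa_export() passes POOL_STATE_EXPORTED, and spa_reset() passes POOL_STATE_UNINITIALIZED, which unloads the pool but leaves its namespace entry intact.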
4846 */ 4847 static int 4848 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4849 boolean_t force, boolean_t hardforce) 4850 { 4851 spa_t *spa; 4852 4853 if (oldconfig) 4854 *oldconfig = NULL; 4855 4856 if (!(spa_mode_global & FWRITE)) 4857 return (SET_ERROR(EROFS)); 4858 4859 mutex_enter(&spa_namespace_lock); 4860 if ((spa = spa_lookup(pool)) == NULL) { 4861 mutex_exit(&spa_namespace_lock); 4862 return (SET_ERROR(ENOENT)); 4863 } 4864 4865 /* 4866 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4867 * reacquire the namespace lock, and see if we can export. 4868 */ 4869 spa_open_ref(spa, FTAG); 4870 mutex_exit(&spa_namespace_lock); 4871 spa_async_suspend(spa); 4872 mutex_enter(&spa_namespace_lock); 4873 spa_close(spa, FTAG); 4874 4875 /* 4876 * The pool will be in core if it's openable, 4877 * in which case we can modify its state. 4878 */ 4879 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4880 /* 4881 * Objsets may be open only because they're dirty, so we 4882 * have to force it to sync before checking spa_refcnt. 4883 */ 4884 txg_wait_synced(spa->spa_dsl_pool, 0); 4885 spa_evicting_os_wait(spa); 4886 4887 /* 4888 * A pool cannot be exported or destroyed if there are active 4889 * references. If we are resetting a pool, allow references by 4890 * fault injection handlers. 4891 */ 4892 if (!spa_refcount_zero(spa) || 4893 (spa->spa_inject_ref != 0 && 4894 new_state != POOL_STATE_UNINITIALIZED)) { 4895 spa_async_resume(spa); 4896 mutex_exit(&spa_namespace_lock); 4897 return (SET_ERROR(EBUSY)); 4898 } 4899 4900 /* 4901 * A pool cannot be exported if it has an active shared spare. 4902 * This is to prevent other pools stealing the active spare 4903 * from an exported pool. The user can, however, force such a 4904 * pool to be exported. 4905 */ 4906 if (!force && new_state == POOL_STATE_EXPORTED && 4907 spa_has_active_shared_spare(spa)) { 4908 spa_async_resume(spa); 4909 mutex_exit(&spa_namespace_lock); 4910 return (SET_ERROR(EXDEV)); 4911 } 4912 4913 /* 4914 * We want this to be reflected on every label, 4915 * so mark them all dirty. spa_unload() will do the 4916 * final sync that pushes these changes out. 4917 */ 4918 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4919 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4920 spa->spa_state = new_state; 4921 spa->spa_final_txg = spa_last_synced_txg(spa) + 4922 TXG_DEFER_SIZE + 1; 4923 vdev_config_dirty(spa->spa_root_vdev); 4924 spa_config_exit(spa, SCL_ALL, FTAG); 4925 } 4926 } 4927 4928 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 4929 4930 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4931 spa_unload(spa); 4932 spa_deactivate(spa); 4933 } 4934 4935 if (oldconfig && spa->spa_config) 4936 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4937 4938 if (new_state != POOL_STATE_UNINITIALIZED) { 4939 if (!hardforce) 4940 spa_write_cachefile(spa, B_TRUE, B_TRUE); 4941 spa_remove(spa); 4942 } 4943 mutex_exit(&spa_namespace_lock); 4944 4945 return (0); 4946 } 4947 4948 /* 4949 * Destroy a storage pool. 4950 */ 4951 int 4952 spa_destroy(char *pool) 4953 { 4954 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4955 B_FALSE, B_FALSE)); 4956 } 4957 4958 /* 4959 * Export a storage pool.
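* If 'oldconfig' is non-NULL, it receives a copy of the pool's configuration as of the export.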
4960 */ 4961 int 4962 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4963 boolean_t hardforce) 4964 { 4965 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4966 force, hardforce)); 4967 } 4968 4969 /* 4970 * Similar to spa_export(), this unloads the spa_t without actually removing it 4971 * from the namespace in any way. 4972 */ 4973 int 4974 spa_reset(char *pool) 4975 { 4976 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4977 B_FALSE, B_FALSE)); 4978 } 4979 4980 /* 4981 * ========================================================================== 4982 * Device manipulation 4983 * ========================================================================== 4984 */ 4985 4986 /* 4987 * Add a device to a storage pool. 4988 */ 4989 int 4990 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4991 { 4992 uint64_t txg, id; 4993 int error; 4994 vdev_t *rvd = spa->spa_root_vdev; 4995 vdev_t *vd, *tvd; 4996 nvlist_t **spares, **l2cache; 4997 uint_t nspares, nl2cache; 4998 4999 ASSERT(spa_writeable(spa)); 5000 5001 txg = spa_vdev_enter(spa); 5002 5003 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 5004 VDEV_ALLOC_ADD)) != 0) 5005 return (spa_vdev_exit(spa, NULL, txg, error)); 5006 5007 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 5008 5009 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 5010 &nspares) != 0) 5011 nspares = 0; 5012 5013 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 5014 &nl2cache) != 0) 5015 nl2cache = 0; 5016 5017 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 5018 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5019 5020 if (vd->vdev_children != 0 && 5021 (error = vdev_create(vd, txg, B_FALSE)) != 0) 5022 return (spa_vdev_exit(spa, vd, txg, error)); 5023 5024 /* 5025 * We must validate the spares and l2cache devices after checking the 5026 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 5027 */ 5028 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 5029 return (spa_vdev_exit(spa, vd, txg, error)); 5030 5031 /* 5032 * If we are in the middle of a device removal, we can only add 5033 * devices which match the existing devices in the pool. 5034 * If we are in the middle of a removal, or have some indirect 5035 * vdevs, we can not add raidz toplevels. 5036 */ 5037 if (spa->spa_vdev_removal != NULL || 5038 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 5039 for (int c = 0; c < vd->vdev_children; c++) { 5040 tvd = vd->vdev_child[c]; 5041 if (spa->spa_vdev_removal != NULL && 5042 tvd->vdev_ashift != 5043 spa->spa_vdev_removal->svr_vdev->vdev_ashift) { 5044 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5045 } 5046 /* Fail if top level vdev is raidz */ 5047 if (tvd->vdev_ops == &vdev_raidz_ops) { 5048 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 5049 } 5050 /* 5051 * Need the top level mirror to be 5052 * a mirror of leaf vdevs only 5053 */ 5054 if (tvd->vdev_ops == &vdev_mirror_ops) { 5055 for (uint64_t cid = 0; 5056 cid < tvd->vdev_children; cid++) { 5057 vdev_t *cvd = tvd->vdev_child[cid]; 5058 if (!cvd->vdev_ops->vdev_op_leaf) { 5059 return (spa_vdev_exit(spa, vd, 5060 txg, EINVAL)); 5061 } 5062 } 5063 } 5064 } 5065 } 5066 5067 for (int c = 0; c < vd->vdev_children; c++) { 5068 5069 /* 5070 * Set the vdev id to the first hole, if one exists. 
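* A hole is the placeholder left behind by an earlier device removal; reusing its slot keeps the top-level vdev ids dense.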
5071 */ 5072 for (id = 0; id < rvd->vdev_children; id++) { 5073 if (rvd->vdev_child[id]->vdev_ishole) { 5074 vdev_free(rvd->vdev_child[id]); 5075 break; 5076 } 5077 } 5078 tvd = vd->vdev_child[c]; 5079 vdev_remove_child(vd, tvd); 5080 tvd->vdev_id = id; 5081 vdev_add_child(rvd, tvd); 5082 vdev_config_dirty(tvd); 5083 } 5084 5085 if (nspares != 0) { 5086 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 5087 ZPOOL_CONFIG_SPARES); 5088 spa_load_spares(spa); 5089 spa->spa_spares.sav_sync = B_TRUE; 5090 } 5091 5092 if (nl2cache != 0) { 5093 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 5094 ZPOOL_CONFIG_L2CACHE); 5095 spa_load_l2cache(spa); 5096 spa->spa_l2cache.sav_sync = B_TRUE; 5097 } 5098 5099 /* 5100 * We have to be careful when adding new vdevs to an existing pool. 5101 * If other threads start allocating from these vdevs before we 5102 * sync the config cache, and we lose power, then upon reboot we may 5103 * fail to open the pool because there are DVAs that the config cache 5104 * can't translate. Therefore, we first add the vdevs without 5105 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 5106 * and then let spa_config_update() initialize the new metaslabs. 5107 * 5108 * spa_load() checks for added-but-not-initialized vdevs, so that 5109 * if we lose power at any point in this sequence, the remaining 5110 * steps will be completed the next time we load the pool. 5111 */ 5112 (void) spa_vdev_exit(spa, vd, txg, 0); 5113 5114 mutex_enter(&spa_namespace_lock); 5115 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5116 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 5117 mutex_exit(&spa_namespace_lock); 5118 5119 return (0); 5120 } 5121 5122 /* 5123 * Attach a device to a mirror. The arguments are the path to any device 5124 * in the mirror, and the nvroot for the new device. If the path specifies 5125 * a device that is not mirrored, we automatically insert the mirror vdev. 5126 * 5127 * If 'replacing' is specified, the new device is intended to replace the 5128 * existing device; in this case the two devices are made into their own 5129 * mirror using the 'replacing' vdev, which is functionally identical to 5130 * the mirror vdev (it actually reuses all the same ops) but has a few 5131 * extra rules: you can't attach to it after it's been created, and upon 5132 * completion of resilvering, the first disk (the one being replaced) 5133 * is automatically detached. 
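* For example, replacing disk B in mirror M(A,B) passes through the intermediate state M(A,R(B,C)); once C has finished resilvering, spa_vdev_resilver_done() detaches B, leaving M(A,C).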
5134 */ 5135 int 5136 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 5137 { 5138 uint64_t txg, dtl_max_txg; 5139 vdev_t *rvd = spa->spa_root_vdev; 5140 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 5141 vdev_ops_t *pvops; 5142 char *oldvdpath, *newvdpath; 5143 int newvd_isspare; 5144 int error; 5145 5146 ASSERT(spa_writeable(spa)); 5147 5148 txg = spa_vdev_enter(spa); 5149 5150 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 5151 5152 if (spa->spa_vdev_removal != NULL || 5153 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 5154 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5155 } 5156 5157 if (oldvd == NULL) 5158 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 5159 5160 if (!oldvd->vdev_ops->vdev_op_leaf) 5161 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5162 5163 pvd = oldvd->vdev_parent; 5164 5165 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 5166 VDEV_ALLOC_ATTACH)) != 0) 5167 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5168 5169 if (newrootvd->vdev_children != 1) 5170 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 5171 5172 newvd = newrootvd->vdev_child[0]; 5173 5174 if (!newvd->vdev_ops->vdev_op_leaf) 5175 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 5176 5177 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 5178 return (spa_vdev_exit(spa, newrootvd, txg, error)); 5179 5180 /* 5181 * Spares can't replace logs 5182 */ 5183 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 5184 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5185 5186 if (!replacing) { 5187 /* 5188 * For attach, the only allowable parent is a mirror or the root 5189 * vdev. 5190 */ 5191 if (pvd->vdev_ops != &vdev_mirror_ops && 5192 pvd->vdev_ops != &vdev_root_ops) 5193 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5194 5195 pvops = &vdev_mirror_ops; 5196 } else { 5197 /* 5198 * Active hot spares can only be replaced by inactive hot 5199 * spares. 5200 */ 5201 if (pvd->vdev_ops == &vdev_spare_ops && 5202 oldvd->vdev_isspare && 5203 !spa_has_spare(spa, newvd->vdev_guid)) 5204 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5205 5206 /* 5207 * If the source is a hot spare, and the parent isn't already a 5208 * spare, then we want to create a new hot spare. Otherwise, we 5209 * want to create a replacing vdev. The user is not allowed to 5210 * attach to a spared vdev child unless the 'isspare' state is 5211 * the same (spare replaces spare, non-spare replaces 5212 * non-spare). 5213 */ 5214 if (pvd->vdev_ops == &vdev_replacing_ops && 5215 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 5216 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5217 } else if (pvd->vdev_ops == &vdev_spare_ops && 5218 newvd->vdev_isspare != oldvd->vdev_isspare) { 5219 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 5220 } 5221 5222 if (newvd->vdev_isspare) 5223 pvops = &vdev_spare_ops; 5224 else 5225 pvops = &vdev_replacing_ops; 5226 } 5227 5228 /* 5229 * Make sure the new device is big enough. 5230 */ 5231 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 5232 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 5233 5234 /* 5235 * The new device cannot have a higher alignment requirement 5236 * than the top-level vdev. 5237 */ 5238 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 5239 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 5240 5241 /* 5242 * If this is an in-place replacement, update oldvd's path and devid 5243 * to make it distinguishable from newvd, and unopenable from now on. 
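* For example, a vdev at /dev/dsk/c0t0d0s0 is renamed to /dev/dsk/c0t0d0s0/old, a path that will never open.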
5244 */ 5245 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 5246 spa_strfree(oldvd->vdev_path); 5247 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 5248 KM_SLEEP); 5249 (void) sprintf(oldvd->vdev_path, "%s/%s", 5250 newvd->vdev_path, "old"); 5251 if (oldvd->vdev_devid != NULL) { 5252 spa_strfree(oldvd->vdev_devid); 5253 oldvd->vdev_devid = NULL; 5254 } 5255 } 5256 5257 /* mark the device being resilvered */ 5258 newvd->vdev_resilver_txg = txg; 5259 5260 /* 5261 * If the parent is not a mirror, or if we're replacing, insert the new 5262 * mirror/replacing/spare vdev above oldvd. 5263 */ 5264 if (pvd->vdev_ops != pvops) 5265 pvd = vdev_add_parent(oldvd, pvops); 5266 5267 ASSERT(pvd->vdev_top->vdev_parent == rvd); 5268 ASSERT(pvd->vdev_ops == pvops); 5269 ASSERT(oldvd->vdev_parent == pvd); 5270 5271 /* 5272 * Extract the new device from its root and add it to pvd. 5273 */ 5274 vdev_remove_child(newrootvd, newvd); 5275 newvd->vdev_id = pvd->vdev_children; 5276 newvd->vdev_crtxg = oldvd->vdev_crtxg; 5277 vdev_add_child(pvd, newvd); 5278 5279 tvd = newvd->vdev_top; 5280 ASSERT(pvd->vdev_top == tvd); 5281 ASSERT(tvd->vdev_parent == rvd); 5282 5283 vdev_config_dirty(tvd); 5284 5285 /* 5286 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 5287 * for any dmu_sync-ed blocks. It will propagate upward when 5288 * spa_vdev_exit() calls vdev_dtl_reassess(). 5289 */ 5290 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 5291 5292 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 5293 dtl_max_txg - TXG_INITIAL); 5294 5295 if (newvd->vdev_isspare) { 5296 spa_spare_activate(newvd); 5297 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 5298 } 5299 5300 oldvdpath = spa_strdup(oldvd->vdev_path); 5301 newvdpath = spa_strdup(newvd->vdev_path); 5302 newvd_isspare = newvd->vdev_isspare; 5303 5304 /* 5305 * Mark newvd's DTL dirty in this txg. 5306 */ 5307 vdev_dirty(tvd, VDD_DTL, newvd, txg); 5308 5309 /* 5310 * Schedule the resilver to restart in the future. We do this to 5311 * ensure that dmu_sync-ed blocks have been stitched into the 5312 * respective datasets. 5313 */ 5314 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 5315 5316 if (spa->spa_bootfs) 5317 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 5318 5319 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 5320 5321 /* 5322 * Commit the config 5323 */ 5324 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 5325 5326 spa_history_log_internal(spa, "vdev attach", NULL, 5327 "%s vdev=%s %s vdev=%s", 5328 replacing && newvd_isspare ? "spare in" : 5329 replacing ? "replace" : "attach", newvdpath, 5330 replacing ? "for" : "to", oldvdpath); 5331 5332 spa_strfree(oldvdpath); 5333 spa_strfree(newvdpath); 5334 5335 return (0); 5336 } 5337 5338 /* 5339 * Detach a device from a mirror or replacing vdev. 5340 * 5341 * If 'replace_done' is specified, only detach if the parent 5342 * is a replacing vdev. 
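* A nonzero 'pguid' must match the current parent's guid; this guards against the race described below, where the vdev tree changes after the caller has chosen the device.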
5343 */ 5344 int 5345 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 5346 { 5347 uint64_t txg; 5348 int error; 5349 vdev_t *rvd = spa->spa_root_vdev; 5350 vdev_t *vd, *pvd, *cvd, *tvd; 5351 boolean_t unspare = B_FALSE; 5352 uint64_t unspare_guid = 0; 5353 char *vdpath; 5354 5355 ASSERT(spa_writeable(spa)); 5356 5357 txg = spa_vdev_enter(spa); 5358 5359 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5360 5361 if (vd == NULL) 5362 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 5363 5364 if (!vd->vdev_ops->vdev_op_leaf) 5365 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5366 5367 pvd = vd->vdev_parent; 5368 5369 /* 5370 * If the parent/child relationship is not as expected, don't do it. 5371 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 5372 * vdev that's replacing B with C. The user's intent in replacing 5373 * is to go from M(A,B) to M(A,C). If the user decides to cancel 5374 * the replace by detaching C, the expected behavior is to end up 5375 * M(A,B). But suppose that right after deciding to detach C, 5376 * the replacement of B completes. We would have M(A,C), and then 5377 * ask to detach C, which would leave us with just A -- not what 5378 * the user wanted. To prevent this, we make sure that the 5379 * parent/child relationship hasn't changed -- in this example, 5380 * that C's parent is still the replacing vdev R. 5381 */ 5382 if (pvd->vdev_guid != pguid && pguid != 0) 5383 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5384 5385 /* 5386 * Only 'replacing' or 'spare' vdevs can be replaced. 5387 */ 5388 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 5389 pvd->vdev_ops != &vdev_spare_ops) 5390 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5391 5392 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 5393 spa_version(spa) >= SPA_VERSION_SPARES); 5394 5395 /* 5396 * Only mirror, replacing, and spare vdevs support detach. 5397 */ 5398 if (pvd->vdev_ops != &vdev_replacing_ops && 5399 pvd->vdev_ops != &vdev_mirror_ops && 5400 pvd->vdev_ops != &vdev_spare_ops) 5401 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5402 5403 /* 5404 * If this device has the only valid copy of some data, 5405 * we cannot safely detach it. 5406 */ 5407 if (vdev_dtl_required(vd)) 5408 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5409 5410 ASSERT(pvd->vdev_children >= 2); 5411 5412 /* 5413 * If we are detaching the second disk from a replacing vdev, then 5414 * check to see if we changed the original vdev's path to have "/old" 5415 * at the end in spa_vdev_attach(). If so, undo that change now. 5416 */ 5417 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 5418 vd->vdev_path != NULL) { 5419 size_t len = strlen(vd->vdev_path); 5420 5421 for (int c = 0; c < pvd->vdev_children; c++) { 5422 cvd = pvd->vdev_child[c]; 5423 5424 if (cvd == vd || cvd->vdev_path == NULL) 5425 continue; 5426 5427 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 5428 strcmp(cvd->vdev_path + len, "/old") == 0) { 5429 spa_strfree(cvd->vdev_path); 5430 cvd->vdev_path = spa_strdup(vd->vdev_path); 5431 break; 5432 } 5433 } 5434 } 5435 5436 /* 5437 * If we are detaching the original disk from a spare, then it implies 5438 * that the spare should become a real disk, and be removed from the 5439 * active spare list for the pool. 5440 */ 5441 if (pvd->vdev_ops == &vdev_spare_ops && 5442 vd->vdev_id == 0 && 5443 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5444 unspare = B_TRUE; 5445 5446 /* 5447 * Erase the disk labels so the disk can be used for other things. 
5448 * This must be done after all other error cases are handled, 5449 * but before we disembowel vd (so we can still do I/O to it). 5450 * But if we can't do it, don't treat the error as fatal -- 5451 * it may be that the unwritability of the disk is the reason 5452 * it's being detached! 5453 */ 5454 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5455 5456 /* 5457 * Remove vd from its parent and compact the parent's children. 5458 */ 5459 vdev_remove_child(pvd, vd); 5460 vdev_compact_children(pvd); 5461 5462 /* 5463 * Remember one of the remaining children so we can get tvd below. 5464 */ 5465 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5466 5467 /* 5468 * If we need to remove the remaining child from the list of hot spares, 5469 * do it now, marking the vdev as no longer a spare in the process. 5470 * We must do this before vdev_remove_parent(), because that can 5471 * change the GUID if it creates a new toplevel GUID. For a similar 5472 * reason, we must remove the spare now, in the same txg as the detach; 5473 * otherwise someone could attach a new sibling, change the GUID, and 5474 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5475 */ 5476 if (unspare) { 5477 ASSERT(cvd->vdev_isspare); 5478 spa_spare_remove(cvd); 5479 unspare_guid = cvd->vdev_guid; 5480 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5481 cvd->vdev_unspare = B_TRUE; 5482 } 5483 5484 /* 5485 * If the parent mirror/replacing vdev only has one child, 5486 * the parent is no longer needed. Remove it from the tree. 5487 */ 5488 if (pvd->vdev_children == 1) { 5489 if (pvd->vdev_ops == &vdev_spare_ops) 5490 cvd->vdev_unspare = B_FALSE; 5491 vdev_remove_parent(cvd); 5492 } 5493 5494 5495 /* 5496 * We don't set tvd until now because the parent we just removed 5497 * may have been the previous top-level vdev. 5498 */ 5499 tvd = cvd->vdev_top; 5500 ASSERT(tvd->vdev_parent == rvd); 5501 5502 /* 5503 * Reevaluate the parent vdev state. 5504 */ 5505 vdev_propagate_state(cvd); 5506 5507 /* 5508 * If the 'autoexpand' property is set on the pool then automatically 5509 * try to expand the size of the pool. For example if the device we 5510 * just detached was smaller than the others, it may be possible to 5511 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5512 * first so that we can obtain the updated sizes of the leaf vdevs. 5513 */ 5514 if (spa->spa_autoexpand) { 5515 vdev_reopen(tvd); 5516 vdev_expand(tvd, txg); 5517 } 5518 5519 vdev_config_dirty(tvd); 5520 5521 /* 5522 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5523 * vd->vdev_detached is set and free vd's DTL object in syncing context. 5524 * But first make sure we're not on any *other* txg's DTL list, to 5525 * prevent vd from being accessed after it's freed. 5526 */ 5527 vdpath = spa_strdup(vd->vdev_path); 5528 for (int t = 0; t < TXG_SIZE; t++) 5529 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5530 vd->vdev_detached = B_TRUE; 5531 vdev_dirty(tvd, VDD_DTL, vd, txg); 5532 5533 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 5534 5535 /* hang on to the spa before we release the lock */ 5536 spa_open_ref(spa, FTAG); 5537 5538 error = spa_vdev_exit(spa, vd, txg, 0); 5539 5540 spa_history_log_internal(spa, "detach", NULL, 5541 "vdev=%s", vdpath); 5542 spa_strfree(vdpath); 5543 5544 /* 5545 * If this was the removal of the original device in a hot spare vdev, 5546 * then we want to go through and remove the device from the hot spare 5547 * list of every other pool. 
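* A shared spare appears in every pool's spare list, so the loop below walks spa_next() and removes the guid from each active pool.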
5548 */ 5549 if (unspare) { 5550 spa_t *altspa = NULL; 5551 5552 mutex_enter(&spa_namespace_lock); 5553 while ((altspa = spa_next(altspa)) != NULL) { 5554 if (altspa->spa_state != POOL_STATE_ACTIVE || 5555 altspa == spa) 5556 continue; 5557 5558 spa_open_ref(altspa, FTAG); 5559 mutex_exit(&spa_namespace_lock); 5560 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5561 mutex_enter(&spa_namespace_lock); 5562 spa_close(altspa, FTAG); 5563 } 5564 mutex_exit(&spa_namespace_lock); 5565 5566 /* search the rest of the vdevs for spares to remove */ 5567 spa_vdev_resilver_done(spa); 5568 } 5569 5570 /* all done with the spa; OK to release */ 5571 mutex_enter(&spa_namespace_lock); 5572 spa_close(spa, FTAG); 5573 mutex_exit(&spa_namespace_lock); 5574 5575 return (error); 5576 } 5577 5578 /* 5579 * Split a set of devices from their mirrors, and create a new pool from them. 5580 */ 5581 int 5582 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5583 nvlist_t *props, boolean_t exp) 5584 { 5585 int error = 0; 5586 uint64_t txg, *glist; 5587 spa_t *newspa; 5588 uint_t c, children, lastlog; 5589 nvlist_t **child, *nvl, *tmp; 5590 dmu_tx_t *tx; 5591 char *altroot = NULL; 5592 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5593 boolean_t activate_slog; 5594 5595 ASSERT(spa_writeable(spa)); 5596 5597 txg = spa_vdev_enter(spa); 5598 5599 /* clear the log and flush everything up to now */ 5600 activate_slog = spa_passivate_log(spa); 5601 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5602 error = spa_reset_logs(spa); 5603 txg = spa_vdev_config_enter(spa); 5604 5605 if (activate_slog) 5606 spa_activate_log(spa); 5607 5608 if (error != 0) 5609 return (spa_vdev_exit(spa, NULL, txg, error)); 5610 5611 /* check new spa name before going any further */ 5612 if (spa_lookup(newname) != NULL) 5613 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5614 5615 /* 5616 * scan through all the children to ensure they're all mirrors 5617 */ 5618 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5619 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5620 &children) != 0) 5621 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5622 5623 /* first, check to ensure we've got the right child count */ 5624 rvd = spa->spa_root_vdev; 5625 lastlog = 0; 5626 for (c = 0; c < rvd->vdev_children; c++) { 5627 vdev_t *vd = rvd->vdev_child[c]; 5628 5629 /* don't count the holes & logs as children */ 5630 if (vd->vdev_islog || !vdev_is_concrete(vd)) { 5631 if (lastlog == 0) 5632 lastlog = c; 5633 continue; 5634 } 5635 5636 lastlog = 0; 5637 } 5638 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5639 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5640 5641 /* next, ensure no spare or cache devices are part of the split */ 5642 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5643 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5644 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5645 5646 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5647 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5648 5649 /* then, loop over each vdev and validate it */ 5650 for (c = 0; c < children; c++) { 5651 uint64_t is_hole = 0; 5652 5653 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5654 &is_hole); 5655 5656 if (is_hole != 0) { 5657 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5658 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5659 continue; 5660 } else { 5661 error = SET_ERROR(EINVAL); 5662 break; 5663 } 5664 } 5665 5666 /* which disk is going to be split? */ 5667 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5668 &glist[c]) != 0) { 5669 error = SET_ERROR(EINVAL); 5670 break; 5671 } 5672 5673 /* look it up in the spa */ 5674 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5675 if (vml[c] == NULL) { 5676 error = SET_ERROR(ENODEV); 5677 break; 5678 } 5679 5680 /* make sure there's nothing stopping the split */ 5681 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5682 vml[c]->vdev_islog || 5683 !vdev_is_concrete(vml[c]) || 5684 vml[c]->vdev_isspare || 5685 vml[c]->vdev_isl2cache || 5686 !vdev_writeable(vml[c]) || 5687 vml[c]->vdev_children != 0 || 5688 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5689 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5690 error = SET_ERROR(EINVAL); 5691 break; 5692 } 5693 5694 if (vdev_dtl_required(vml[c])) { 5695 error = SET_ERROR(EBUSY); 5696 break; 5697 } 5698 5699 /* we need certain info from the top level */ 5700 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5701 vml[c]->vdev_top->vdev_ms_array) == 0); 5702 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5703 vml[c]->vdev_top->vdev_ms_shift) == 0); 5704 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5705 vml[c]->vdev_top->vdev_asize) == 0); 5706 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5707 vml[c]->vdev_top->vdev_ashift) == 0); 5708 5709 /* transfer per-vdev ZAPs */ 5710 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 5711 VERIFY0(nvlist_add_uint64(child[c], 5712 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 5713 5714 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 5715 VERIFY0(nvlist_add_uint64(child[c], 5716 ZPOOL_CONFIG_VDEV_TOP_ZAP, 5717 vml[c]->vdev_parent->vdev_top_zap)); 5718 } 5719 5720 if (error != 0) { 5721 kmem_free(vml, children * sizeof (vdev_t *)); 5722 kmem_free(glist, children * sizeof (uint64_t)); 5723 return (spa_vdev_exit(spa, NULL, txg, error)); 5724 } 5725 5726 /* stop writers from using the disks */ 5727 for (c = 0; c < children; c++) { 5728 if (vml[c] != NULL) 5729 vml[c]->vdev_offline = B_TRUE; 5730 } 5731 vdev_reopen(spa->spa_root_vdev); 5732 5733 /* 5734 * Temporarily record the splitting vdevs in the spa config. This 5735 * will disappear once the config is regenerated. 
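* The recorded guids also let an interrupted split be recognized, and repaired, if the original pool is imported again before the split completes.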
5736 */ 5737 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5738 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5739 glist, children) == 0); 5740 kmem_free(glist, children * sizeof (uint64_t)); 5741 5742 mutex_enter(&spa->spa_props_lock); 5743 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5744 nvl) == 0); 5745 mutex_exit(&spa->spa_props_lock); 5746 spa->spa_config_splitting = nvl; 5747 vdev_config_dirty(spa->spa_root_vdev); 5748 5749 /* configure and create the new pool */ 5750 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5751 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5752 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5753 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5754 spa_version(spa)) == 0); 5755 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5756 spa->spa_config_txg) == 0); 5757 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5758 spa_generate_guid(NULL)) == 0); 5759 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 5760 (void) nvlist_lookup_string(props, 5761 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5762 5763 /* add the new pool to the namespace */ 5764 newspa = spa_add(newname, config, altroot); 5765 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 5766 newspa->spa_config_txg = spa->spa_config_txg; 5767 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5768 5769 /* release the spa config lock, retaining the namespace lock */ 5770 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5771 5772 if (zio_injection_enabled) 5773 zio_handle_panic_injection(spa, FTAG, 1); 5774 5775 spa_activate(newspa, spa_mode_global); 5776 spa_async_suspend(newspa); 5777 5778 /* create the new pool from the disks of the original pool */ 5779 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5780 if (error) 5781 goto out; 5782 5783 /* if that worked, generate a real config for the new pool */ 5784 if (newspa->spa_root_vdev != NULL) { 5785 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5786 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5787 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5788 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5789 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5790 B_TRUE)); 5791 } 5792 5793 /* set the props */ 5794 if (props != NULL) { 5795 spa_configfile_set(newspa, props, B_FALSE); 5796 error = spa_prop_set(newspa, props); 5797 if (error) 5798 goto out; 5799 } 5800 5801 /* flush everything */ 5802 txg = spa_vdev_config_enter(newspa); 5803 vdev_config_dirty(newspa->spa_root_vdev); 5804 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5805 5806 if (zio_injection_enabled) 5807 zio_handle_panic_injection(spa, FTAG, 2); 5808 5809 spa_async_resume(newspa); 5810 5811 /* finally, update the original pool's config */ 5812 txg = spa_vdev_config_enter(spa); 5813 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5814 error = dmu_tx_assign(tx, TXG_WAIT); 5815 if (error != 0) 5816 dmu_tx_abort(tx); 5817 for (c = 0; c < children; c++) { 5818 if (vml[c] != NULL) { 5819 vdev_split(vml[c]); 5820 if (error == 0) 5821 spa_history_log_internal(spa, "detach", tx, 5822 "vdev=%s", vml[c]->vdev_path); 5823 5824 vdev_free(vml[c]); 5825 } 5826 } 5827 spa->spa_avz_action = AVZ_ACTION_REBUILD; 5828 vdev_config_dirty(spa->spa_root_vdev); 5829 spa->spa_config_splitting = NULL; 5830 nvlist_free(nvl); 5831 if (error == 0) 5832 dmu_tx_commit(tx); 5833 (void) spa_vdev_exit(spa, NULL, txg, 0); 5834 5835 if (zio_injection_enabled) 
5836 zio_handle_panic_injection(spa, FTAG, 3); 5837 5838 /* split is complete; log a history record */ 5839 spa_history_log_internal(newspa, "split", NULL, 5840 "from pool %s", spa_name(spa)); 5841 5842 kmem_free(vml, children * sizeof (vdev_t *)); 5843 5844 /* if we're not going to mount the filesystems in userland, export */ 5845 if (exp) 5846 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5847 B_FALSE, B_FALSE); 5848 5849 return (error); 5850 5851 out: 5852 spa_unload(newspa); 5853 spa_deactivate(newspa); 5854 spa_remove(newspa); 5855 5856 txg = spa_vdev_config_enter(spa); 5857 5858 /* re-online all offlined disks */ 5859 for (c = 0; c < children; c++) { 5860 if (vml[c] != NULL) 5861 vml[c]->vdev_offline = B_FALSE; 5862 } 5863 vdev_reopen(spa->spa_root_vdev); 5864 5865 nvlist_free(spa->spa_config_splitting); 5866 spa->spa_config_splitting = NULL; 5867 (void) spa_vdev_exit(spa, NULL, txg, error); 5868 5869 kmem_free(vml, children * sizeof (vdev_t *)); 5870 return (error); 5871 } 5872 5873 /* 5874 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5875 * currently spared, so we can detach it. 5876 */ 5877 static vdev_t * 5878 spa_vdev_resilver_done_hunt(vdev_t *vd) 5879 { 5880 vdev_t *newvd, *oldvd; 5881 5882 for (int c = 0; c < vd->vdev_children; c++) { 5883 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5884 if (oldvd != NULL) 5885 return (oldvd); 5886 } 5887 5888 /* 5889 * Check for a completed replacement. We always consider the first 5890 * vdev in the list to be the oldest vdev, and the last one to be 5891 * the newest (see spa_vdev_attach() for how that works). In 5892 * the case where the newest vdev is faulted, we will not automatically 5893 * remove it after a resilver completes. This is OK as it will require 5894 * user intervention to determine which disk the admin wishes to keep. 5895 */ 5896 if (vd->vdev_ops == &vdev_replacing_ops) { 5897 ASSERT(vd->vdev_children > 1); 5898 5899 newvd = vd->vdev_child[vd->vdev_children - 1]; 5900 oldvd = vd->vdev_child[0]; 5901 5902 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5903 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5904 !vdev_dtl_required(oldvd)) 5905 return (oldvd); 5906 } 5907 5908 /* 5909 * Check for a completed resilver with the 'unspare' flag set. 5910 */ 5911 if (vd->vdev_ops == &vdev_spare_ops) { 5912 vdev_t *first = vd->vdev_child[0]; 5913 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5914 5915 if (last->vdev_unspare) { 5916 oldvd = first; 5917 newvd = last; 5918 } else if (first->vdev_unspare) { 5919 oldvd = last; 5920 newvd = first; 5921 } else { 5922 oldvd = NULL; 5923 } 5924 5925 if (oldvd != NULL && 5926 vdev_dtl_empty(newvd, DTL_MISSING) && 5927 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5928 !vdev_dtl_required(oldvd)) 5929 return (oldvd); 5930 5931 /* 5932 * If there are more than two spares attached to a disk, 5933 * and those spares are not required, then we want to 5934 * attempt to free them up now so that they can be used 5935 * by other pools. Once we're back down to a single 5936 * disk+spare, we stop removing them. 
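 * For example, with children (disk, spareA, spareB), spareA
 * (vdev_child[1]) is returned for detach once spareB's DTLs are empty
 * and spareA is no longer required, leaving the usual disk+spare pair.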
5937 */ 5938 if (vd->vdev_children > 2) { 5939 newvd = vd->vdev_child[1]; 5940 5941 if (newvd->vdev_isspare && last->vdev_isspare && 5942 vdev_dtl_empty(last, DTL_MISSING) && 5943 vdev_dtl_empty(last, DTL_OUTAGE) && 5944 !vdev_dtl_required(newvd)) 5945 return (newvd); 5946 } 5947 } 5948 5949 return (NULL); 5950 } 5951 5952 static void 5953 spa_vdev_resilver_done(spa_t *spa) 5954 { 5955 vdev_t *vd, *pvd, *ppvd; 5956 uint64_t guid, sguid, pguid, ppguid; 5957 5958 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5959 5960 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5961 pvd = vd->vdev_parent; 5962 ppvd = pvd->vdev_parent; 5963 guid = vd->vdev_guid; 5964 pguid = pvd->vdev_guid; 5965 ppguid = ppvd->vdev_guid; 5966 sguid = 0; 5967 /* 5968 * If we have just finished replacing a hot spared device, then 5969 * we need to detach the parent's first child (the original hot 5970 * spare) as well. 5971 */ 5972 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5973 ppvd->vdev_children == 2) { 5974 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5975 sguid = ppvd->vdev_child[1]->vdev_guid; 5976 } 5977 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5978 5979 spa_config_exit(spa, SCL_ALL, FTAG); 5980 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5981 return; 5982 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5983 return; 5984 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5985 } 5986 5987 spa_config_exit(spa, SCL_ALL, FTAG); 5988 } 5989 5990 /* 5991 * Update the stored path or FRU for this vdev. 5992 */ 5993 int 5994 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5995 boolean_t ispath) 5996 { 5997 vdev_t *vd; 5998 boolean_t sync = B_FALSE; 5999 6000 ASSERT(spa_writeable(spa)); 6001 6002 spa_vdev_state_enter(spa, SCL_ALL); 6003 6004 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 6005 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 6006 6007 if (!vd->vdev_ops->vdev_op_leaf) 6008 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 6009 6010 if (ispath) { 6011 if (strcmp(value, vd->vdev_path) != 0) { 6012 spa_strfree(vd->vdev_path); 6013 vd->vdev_path = spa_strdup(value); 6014 sync = B_TRUE; 6015 } 6016 } else { 6017 if (vd->vdev_fru == NULL) { 6018 vd->vdev_fru = spa_strdup(value); 6019 sync = B_TRUE; 6020 } else if (strcmp(value, vd->vdev_fru) != 0) { 6021 spa_strfree(vd->vdev_fru); 6022 vd->vdev_fru = spa_strdup(value); 6023 sync = B_TRUE; 6024 } 6025 } 6026 6027 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 6028 } 6029 6030 int 6031 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 6032 { 6033 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 6034 } 6035 6036 int 6037 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 6038 { 6039 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 6040 } 6041 6042 /* 6043 * ========================================================================== 6044 * SPA Scanning 6045 * ========================================================================== 6046 */ 6047 int 6048 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 6049 { 6050 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6051 6052 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6053 return (SET_ERROR(EBUSY)); 6054 6055 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 6056 } 6057 6058 int 6059 spa_scan_stop(spa_t *spa) 6060 { 6061 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6062 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 6063 return (SET_ERROR(EBUSY)); 6064 return (dsl_scan_cancel(spa->spa_dsl_pool)); 6065 } 6066 6067 int 6068 spa_scan(spa_t *spa, pool_scan_func_t func) 6069 { 6070 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 6071 6072 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 6073 return (SET_ERROR(ENOTSUP)); 6074 6075 /* 6076 * If a resilver was requested, but there is no DTL on a 6077 * writeable leaf device, we have nothing to do. 6078 */ 6079 if (func == POOL_SCAN_RESILVER && 6080 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 6081 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 6082 return (0); 6083 } 6084 6085 return (dsl_scan(spa->spa_dsl_pool, func)); 6086 } 6087 6088 /* 6089 * ========================================================================== 6090 * SPA async task processing 6091 * ========================================================================== 6092 */ 6093 6094 static void 6095 spa_async_remove(spa_t *spa, vdev_t *vd) 6096 { 6097 if (vd->vdev_remove_wanted) { 6098 vd->vdev_remove_wanted = B_FALSE; 6099 vd->vdev_delayed_close = B_FALSE; 6100 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 6101 6102 /* 6103 * We want to clear the stats, but we don't want to do a full 6104 * vdev_clear() as that will cause us to throw away 6105 * degraded/faulted state as well as attempt to reopen the 6106 * device, all of which is a waste. 
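 * (For a device that is already gone, resetting its faulted state and
 * reopening it would accomplish nothing; zeroing the error counters
 * below is all that is needed.)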
6107 */ 6108 vd->vdev_stat.vs_read_errors = 0; 6109 vd->vdev_stat.vs_write_errors = 0; 6110 vd->vdev_stat.vs_checksum_errors = 0; 6111 6112 vdev_state_dirty(vd->vdev_top); 6113 } 6114 6115 for (int c = 0; c < vd->vdev_children; c++) 6116 spa_async_remove(spa, vd->vdev_child[c]); 6117 } 6118 6119 static void 6120 spa_async_probe(spa_t *spa, vdev_t *vd) 6121 { 6122 if (vd->vdev_probe_wanted) { 6123 vd->vdev_probe_wanted = B_FALSE; 6124 vdev_reopen(vd); /* vdev_open() does the actual probe */ 6125 } 6126 6127 for (int c = 0; c < vd->vdev_children; c++) 6128 spa_async_probe(spa, vd->vdev_child[c]); 6129 } 6130 6131 static void 6132 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 6133 { 6134 sysevent_id_t eid; 6135 nvlist_t *attr; 6136 char *physpath; 6137 6138 if (!spa->spa_autoexpand) 6139 return; 6140 6141 for (int c = 0; c < vd->vdev_children; c++) { 6142 vdev_t *cvd = vd->vdev_child[c]; 6143 spa_async_autoexpand(spa, cvd); 6144 } 6145 6146 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 6147 return; 6148 6149 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 6150 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 6151 6152 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6153 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 6154 6155 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 6156 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 6157 6158 nvlist_free(attr); 6159 kmem_free(physpath, MAXPATHLEN); 6160 } 6161 6162 static void 6163 spa_async_thread(void *arg) 6164 { 6165 spa_t *spa = (spa_t *)arg; 6166 int tasks; 6167 6168 ASSERT(spa->spa_sync_on); 6169 6170 mutex_enter(&spa->spa_async_lock); 6171 tasks = spa->spa_async_tasks; 6172 spa->spa_async_tasks = 0; 6173 mutex_exit(&spa->spa_async_lock); 6174 6175 /* 6176 * See if the config needs to be updated. 6177 */ 6178 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 6179 uint64_t old_space, new_space; 6180 6181 mutex_enter(&spa_namespace_lock); 6182 old_space = metaslab_class_get_space(spa_normal_class(spa)); 6183 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6184 new_space = metaslab_class_get_space(spa_normal_class(spa)); 6185 mutex_exit(&spa_namespace_lock); 6186 6187 /* 6188 * If the pool grew as a result of the config update, 6189 * then log an internal history event. 6190 */ 6191 if (new_space != old_space) { 6192 spa_history_log_internal(spa, "vdev online", NULL, 6193 "pool '%s' size: %llu(+%llu)", 6194 spa_name(spa), new_space, new_space - old_space); 6195 } 6196 } 6197 6198 /* 6199 * See if any devices need to be marked REMOVED. 6200 */ 6201 if (tasks & SPA_ASYNC_REMOVE) { 6202 spa_vdev_state_enter(spa, SCL_NONE); 6203 spa_async_remove(spa, spa->spa_root_vdev); 6204 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6205 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6206 for (int i = 0; i < spa->spa_spares.sav_count; i++) 6207 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6208 (void) spa_vdev_state_exit(spa, NULL, 0); 6209 } 6210 6211 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 6212 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6213 spa_async_autoexpand(spa, spa->spa_root_vdev); 6214 spa_config_exit(spa, SCL_CONFIG, FTAG); 6215 } 6216 6217 /* 6218 * See if any devices need to be probed. 6219 */ 6220 if (tasks & SPA_ASYNC_PROBE) { 6221 spa_vdev_state_enter(spa, SCL_NONE); 6222 spa_async_probe(spa, spa->spa_root_vdev); 6223 (void) spa_vdev_state_exit(spa, NULL, 0); 6224 } 6225 6226 /* 6227 * If any devices are done replacing, detach them. 
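 * (SPA_ASYNC_RESILVER_DONE is requested, for example, by spa_scan()
 * above when a resilver is requested but no writeable leaf vdev has a
 * non-empty DTL.)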
6228 */ 6229 if (tasks & SPA_ASYNC_RESILVER_DONE) 6230 spa_vdev_resilver_done(spa); 6231 6232 /* 6233 * Kick off a resilver. 6234 */ 6235 if (tasks & SPA_ASYNC_RESILVER) 6236 dsl_resilver_restart(spa->spa_dsl_pool, 0); 6237 6238 /* 6239 * Let the world know that we're done. 6240 */ 6241 mutex_enter(&spa->spa_async_lock); 6242 spa->spa_async_thread = NULL; 6243 cv_broadcast(&spa->spa_async_cv); 6244 mutex_exit(&spa->spa_async_lock); 6245 thread_exit(); 6246 } 6247 6248 void 6249 spa_async_suspend(spa_t *spa) 6250 { 6251 mutex_enter(&spa->spa_async_lock); 6252 spa->spa_async_suspended++; 6253 while (spa->spa_async_thread != NULL || 6254 spa->spa_condense_thread != NULL) 6255 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 6256 mutex_exit(&spa->spa_async_lock); 6257 6258 spa_vdev_remove_suspend(spa); 6259 } 6260 6261 void 6262 spa_async_resume(spa_t *spa) 6263 { 6264 mutex_enter(&spa->spa_async_lock); 6265 ASSERT(spa->spa_async_suspended != 0); 6266 spa->spa_async_suspended--; 6267 mutex_exit(&spa->spa_async_lock); 6268 spa_restart_removal(spa); 6269 } 6270 6271 static boolean_t 6272 spa_async_tasks_pending(spa_t *spa) 6273 { 6274 uint_t non_config_tasks; 6275 uint_t config_task; 6276 boolean_t config_task_suspended; 6277 6278 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 6279 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6280 if (spa->spa_ccw_fail_time == 0) { 6281 config_task_suspended = B_FALSE; 6282 } else { 6283 config_task_suspended = 6284 (gethrtime() - spa->spa_ccw_fail_time) < 6285 (zfs_ccw_retry_interval * NANOSEC); 6286 } 6287 6288 return (non_config_tasks || (config_task && !config_task_suspended)); 6289 } 6290 6291 static void 6292 spa_async_dispatch(spa_t *spa) 6293 { 6294 mutex_enter(&spa->spa_async_lock); 6295 if (spa_async_tasks_pending(spa) && 6296 !spa->spa_async_suspended && 6297 spa->spa_async_thread == NULL && 6298 rootdir != NULL) 6299 spa->spa_async_thread = thread_create(NULL, 0, 6300 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6301 mutex_exit(&spa->spa_async_lock); 6302 } 6303 6304 void 6305 spa_async_request(spa_t *spa, int task) 6306 { 6307 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6308 mutex_enter(&spa->spa_async_lock); 6309 spa->spa_async_tasks |= task; 6310 mutex_exit(&spa->spa_async_lock); 6311 } 6312 6313 /* 6314 * ========================================================================== 6315 * SPA syncing routines 6316 * ========================================================================== 6317 */ 6318 6319 static int 6320 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6321 { 6322 bpobj_t *bpo = arg; 6323 bpobj_enqueue(bpo, bp, tx); 6324 return (0); 6325 } 6326 6327 static int 6328 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6329 { 6330 zio_t *zio = arg; 6331 6332 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6333 zio->io_flags)); 6334 return (0); 6335 } 6336 6337 /* 6338 * Note: this simple function is not inlined to make it easier to dtrace the 6339 * amount of time spent syncing frees. 6340 */ 6341 static void 6342 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6343 { 6344 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6345 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6346 VERIFY(zio_wait(zio) == 0); 6347 } 6348 6349 /* 6350 * Note: this simple function is not inlined to make it easier to dtrace the 6351 * amount of time spent syncing deferred frees. 
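 * Called from pass 1 of spa_sync(); frees that arrive in later passes
 * are instead enqueued onto spa_deferred_bpobj (via bpobj_enqueue_cb
 * above) and drained here in a later txg.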
6352 */ 6353 static void 6354 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6355 { 6356 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6357 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6358 spa_free_sync_cb, zio, tx), ==, 0); 6359 VERIFY0(zio_wait(zio)); 6360 } 6361 6362 6363 static void 6364 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6365 { 6366 char *packed = NULL; 6367 size_t bufsize; 6368 size_t nvsize = 0; 6369 dmu_buf_t *db; 6370 6371 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6372 6373 /* 6374 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6375 * information. This avoids the dmu_buf_will_dirty() path and 6376 * saves us a pre-read to get data we don't actually care about. 6377 */ 6378 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6379 packed = kmem_alloc(bufsize, KM_SLEEP); 6380 6381 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6382 KM_SLEEP) == 0); 6383 bzero(packed + nvsize, bufsize - nvsize); 6384 6385 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6386 6387 kmem_free(packed, bufsize); 6388 6389 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6390 dmu_buf_will_dirty(db, tx); 6391 *(uint64_t *)db->db_data = nvsize; 6392 dmu_buf_rele(db, FTAG); 6393 } 6394 6395 static void 6396 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6397 const char *config, const char *entry) 6398 { 6399 nvlist_t *nvroot; 6400 nvlist_t **list; 6401 int i; 6402 6403 if (!sav->sav_sync) 6404 return; 6405 6406 /* 6407 * Update the MOS nvlist describing the list of available devices. 6408 * spa_validate_aux() will have already made sure this nvlist is 6409 * valid and the vdevs are labeled appropriately. 6410 */ 6411 if (sav->sav_object == 0) { 6412 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6413 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6414 sizeof (uint64_t), tx); 6415 VERIFY(zap_update(spa->spa_meta_objset, 6416 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6417 &sav->sav_object, tx) == 0); 6418 } 6419 6420 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6421 if (sav->sav_count == 0) { 6422 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6423 } else { 6424 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6425 for (i = 0; i < sav->sav_count; i++) 6426 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6427 B_FALSE, VDEV_CONFIG_L2CACHE); 6428 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6429 sav->sav_count) == 0); 6430 for (i = 0; i < sav->sav_count; i++) 6431 nvlist_free(list[i]); 6432 kmem_free(list, sav->sav_count * sizeof (void *)); 6433 } 6434 6435 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6436 nvlist_free(nvroot); 6437 6438 sav->sav_sync = B_FALSE; 6439 } 6440 6441 /* 6442 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 6443 * The all-vdev ZAP must be empty. 
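 * 'avz' is the MOS object number of that (freshly created) ZAP; we
 * simply walk the vdev tree and add every vdev_top_zap and
 * vdev_leaf_zap we find.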
6444 */ 6445 static void 6446 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 6447 { 6448 spa_t *spa = vd->vdev_spa; 6449 if (vd->vdev_top_zap != 0) { 6450 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6451 vd->vdev_top_zap, tx)); 6452 } 6453 if (vd->vdev_leaf_zap != 0) { 6454 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 6455 vd->vdev_leaf_zap, tx)); 6456 } 6457 for (uint64_t i = 0; i < vd->vdev_children; i++) { 6458 spa_avz_build(vd->vdev_child[i], avz, tx); 6459 } 6460 } 6461 6462 static void 6463 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6464 { 6465 nvlist_t *config; 6466 6467 /* 6468 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 6469 * its config may not be dirty but we still need to build per-vdev ZAPs. 6470 * Similarly, if the pool is being assembled (e.g. after a split), we 6471 * need to rebuild the AVZ although the config may not be dirty. 6472 */ 6473 if (list_is_empty(&spa->spa_config_dirty_list) && 6474 spa->spa_avz_action == AVZ_ACTION_NONE) 6475 return; 6476 6477 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6478 6479 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 6480 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 6481 spa->spa_all_vdev_zaps != 0); 6482 6483 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 6484 /* Make and build the new AVZ */ 6485 uint64_t new_avz = zap_create(spa->spa_meta_objset, 6486 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 6487 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 6488 6489 /* Diff old AVZ with new one */ 6490 zap_cursor_t zc; 6491 zap_attribute_t za; 6492 6493 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6494 spa->spa_all_vdev_zaps); 6495 zap_cursor_retrieve(&zc, &za) == 0; 6496 zap_cursor_advance(&zc)) { 6497 uint64_t vdzap = za.za_first_integer; 6498 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 6499 vdzap) == ENOENT) { 6500 /* 6501 * ZAP is listed in old AVZ but not in new one; 6502 * destroy it 6503 */ 6504 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 6505 tx)); 6506 } 6507 } 6508 6509 zap_cursor_fini(&zc); 6510 6511 /* Destroy the old AVZ */ 6512 VERIFY0(zap_destroy(spa->spa_meta_objset, 6513 spa->spa_all_vdev_zaps, tx)); 6514 6515 /* Replace the old AVZ in the dir obj with the new one */ 6516 VERIFY0(zap_update(spa->spa_meta_objset, 6517 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 6518 sizeof (new_avz), 1, &new_avz, tx)); 6519 6520 spa->spa_all_vdev_zaps = new_avz; 6521 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 6522 zap_cursor_t zc; 6523 zap_attribute_t za; 6524 6525 /* Walk through the AVZ and destroy all listed ZAPs */ 6526 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6527 spa->spa_all_vdev_zaps); 6528 zap_cursor_retrieve(&zc, &za) == 0; 6529 zap_cursor_advance(&zc)) { 6530 uint64_t zap = za.za_first_integer; 6531 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 6532 } 6533 6534 zap_cursor_fini(&zc); 6535 6536 /* Destroy and unlink the AVZ itself */ 6537 VERIFY0(zap_destroy(spa->spa_meta_objset, 6538 spa->spa_all_vdev_zaps, tx)); 6539 VERIFY0(zap_remove(spa->spa_meta_objset, 6540 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 6541 spa->spa_all_vdev_zaps = 0; 6542 } 6543 6544 if (spa->spa_all_vdev_zaps == 0) { 6545 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 6546 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 6547 DMU_POOL_VDEV_ZAP_MAP, tx); 6548 } 6549 spa->spa_avz_action = AVZ_ACTION_NONE; 6550 6551 /* Create ZAPs for vdevs that don't have them. 
*/
6552	vdev_construct_zaps(spa->spa_root_vdev, tx);
6553
6554	config = spa_config_generate(spa, spa->spa_root_vdev,
6555	    dmu_tx_get_txg(tx), B_FALSE);
6556
6557	/*
6558	 * If we're upgrading the spa version then make sure that
6559	 * the config object gets updated with the correct version.
6560	 */
6561	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
6562		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6563		    spa->spa_uberblock.ub_version);
6564
6565	spa_config_exit(spa, SCL_STATE, FTAG);
6566
6567	nvlist_free(spa->spa_config_syncing);
6568	spa->spa_config_syncing = config;
6569
6570	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
6571 }
6572
6573 static void
6574 spa_sync_version(void *arg, dmu_tx_t *tx)
6575 {
6576	uint64_t *versionp = arg;
6577	uint64_t version = *versionp;
6578	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6579
6580	/*
6581	 * Setting the version is special cased when first creating the pool.
6582	 */
6583	ASSERT(tx->tx_txg != TXG_INITIAL);
6584
6585	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
6586	ASSERT(version >= spa_version(spa));
6587
6588	spa->spa_uberblock.ub_version = version;
6589	vdev_config_dirty(spa->spa_root_vdev);
6590	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
6591 }
6592
6593 /*
6594  * Set zpool properties.
6595  */
6596 static void
6597 spa_sync_props(void *arg, dmu_tx_t *tx)
6598 {
6599	nvlist_t *nvp = arg;
6600	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6601	objset_t *mos = spa->spa_meta_objset;
6602	nvpair_t *elem = NULL;
6603
6604	mutex_enter(&spa->spa_props_lock);
6605
6606	while ((elem = nvlist_next_nvpair(nvp, elem))) {
6607		uint64_t intval;
6608		char *strval, *fname;
6609		zpool_prop_t prop;
6610		const char *propname;
6611		zprop_type_t proptype;
6612		spa_feature_t fid;
6613
6614		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
6615		case ZPOOL_PROP_INVAL:
6616			/*
6617			 * We checked this earlier in spa_prop_validate().
6618			 */
6619			ASSERT(zpool_prop_feature(nvpair_name(elem)));
6620
6621			fname = strchr(nvpair_name(elem), '@') + 1;
6622			VERIFY0(zfeature_lookup_name(fname, &fid));
6623
6624			spa_feature_enable(spa, fid, tx);
6625			spa_history_log_internal(spa, "set", tx,
6626			    "%s=enabled", nvpair_name(elem));
6627			break;
6628
6629		case ZPOOL_PROP_VERSION:
6630			intval = fnvpair_value_uint64(elem);
6631			/*
6632			 * The version is synced separately before other
6633			 * properties and should be correct by now.
6634			 */
6635			ASSERT3U(spa_version(spa), >=, intval);
6636			break;
6637
6638		case ZPOOL_PROP_ALTROOT:
6639			/*
6640			 * 'altroot' is a non-persistent property. It should
6641			 * have been set temporarily at creation or import time.
6642			 */
6643			ASSERT(spa->spa_root != NULL);
6644			break;
6645
6646		case ZPOOL_PROP_READONLY:
6647		case ZPOOL_PROP_CACHEFILE:
6648			/*
6649			 * 'readonly' and 'cachefile' are also non-persistent
6650			 * properties.
6651			 */
6652			break;
6653		case ZPOOL_PROP_COMMENT:
6654			strval = fnvpair_value_string(elem);
6655			if (spa->spa_comment != NULL)
6656				spa_strfree(spa->spa_comment);
6657			spa->spa_comment = spa_strdup(strval);
6658			/*
6659			 * We need to dirty the configuration on all the vdevs
6660			 * so that their labels get updated. It's unnecessary
6661			 * to do this for pool creation since the vdev's
6662			 * configuration has already been dirtied.
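 * (The comment travels with the config nvlist stored in every vdev
 * label, which is why the labels must be rewritten here.)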
6663 */ 6664 if (tx->tx_txg != TXG_INITIAL) 6665 vdev_config_dirty(spa->spa_root_vdev); 6666 spa_history_log_internal(spa, "set", tx, 6667 "%s=%s", nvpair_name(elem), strval); 6668 break; 6669 default: 6670 /* 6671 * Set pool property values in the poolprops mos object. 6672 */ 6673 if (spa->spa_pool_props_object == 0) { 6674 spa->spa_pool_props_object = 6675 zap_create_link(mos, DMU_OT_POOL_PROPS, 6676 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6677 tx); 6678 } 6679 6680 /* normalize the property name */ 6681 propname = zpool_prop_to_name(prop); 6682 proptype = zpool_prop_get_type(prop); 6683 6684 if (nvpair_type(elem) == DATA_TYPE_STRING) { 6685 ASSERT(proptype == PROP_TYPE_STRING); 6686 strval = fnvpair_value_string(elem); 6687 VERIFY0(zap_update(mos, 6688 spa->spa_pool_props_object, propname, 6689 1, strlen(strval) + 1, strval, tx)); 6690 spa_history_log_internal(spa, "set", tx, 6691 "%s=%s", nvpair_name(elem), strval); 6692 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6693 intval = fnvpair_value_uint64(elem); 6694 6695 if (proptype == PROP_TYPE_INDEX) { 6696 const char *unused; 6697 VERIFY0(zpool_prop_index_to_string( 6698 prop, intval, &unused)); 6699 } 6700 VERIFY0(zap_update(mos, 6701 spa->spa_pool_props_object, propname, 6702 8, 1, &intval, tx)); 6703 spa_history_log_internal(spa, "set", tx, 6704 "%s=%lld", nvpair_name(elem), intval); 6705 } else { 6706 ASSERT(0); /* not allowed */ 6707 } 6708 6709 switch (prop) { 6710 case ZPOOL_PROP_DELEGATION: 6711 spa->spa_delegation = intval; 6712 break; 6713 case ZPOOL_PROP_BOOTFS: 6714 spa->spa_bootfs = intval; 6715 break; 6716 case ZPOOL_PROP_FAILUREMODE: 6717 spa->spa_failmode = intval; 6718 break; 6719 case ZPOOL_PROP_AUTOEXPAND: 6720 spa->spa_autoexpand = intval; 6721 if (tx->tx_txg != TXG_INITIAL) 6722 spa_async_request(spa, 6723 SPA_ASYNC_AUTOEXPAND); 6724 break; 6725 case ZPOOL_PROP_DEDUPDITTO: 6726 spa->spa_dedup_ditto = intval; 6727 break; 6728 default: 6729 break; 6730 } 6731 } 6732 6733 } 6734 6735 mutex_exit(&spa->spa_props_lock); 6736 } 6737 6738 /* 6739 * Perform one-time upgrade on-disk changes. spa_version() does not 6740 * reflect the new version this txg, so there must be no changes this 6741 * txg to anything that the upgrade code depends on after it executes. 6742 * Therefore this must be called after dsl_pool_sync() does the sync 6743 * tasks. 
6744  */
6745 static void
6746 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6747 {
6748	dsl_pool_t *dp = spa->spa_dsl_pool;
6749
6750	ASSERT(spa->spa_sync_pass == 1);
6751
6752	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6753
6754	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6755	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6756		dsl_pool_create_origin(dp, tx);
6757
6758		/* Keeping the origin open increases spa_minref */
6759		spa->spa_minref += 3;
6760	}
6761
6762	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6763	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6764		dsl_pool_upgrade_clones(dp, tx);
6765	}
6766
6767	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6768	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6769		dsl_pool_upgrade_dir_clones(dp, tx);
6770
6771		/* Keeping the freedir open increases spa_minref */
6772		spa->spa_minref += 3;
6773	}
6774
6775	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6776	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6777		spa_feature_create_zap_objects(spa, tx);
6778	}
6779
6780	/*
6781	 * The LZ4_COMPRESS feature's behaviour was changed to
6782	 * activate_on_enable when the ability to use lz4 compression for
6783	 * metadata was added. Old pools that have this feature enabled
6784	 * must be upgraded to have this feature active.
6785	 */
6786	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6787		boolean_t lz4_en = spa_feature_is_enabled(spa,
6788		    SPA_FEATURE_LZ4_COMPRESS);
6789		boolean_t lz4_ac = spa_feature_is_active(spa,
6790		    SPA_FEATURE_LZ4_COMPRESS);
6791
6792		if (lz4_en && !lz4_ac)
6793			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
6794	}
6795
6796	/*
6797	 * If we haven't written the salt, do so now. Note that the
6798	 * feature may not be activated yet, but that's fine since
6799	 * the presence of this ZAP entry is backwards compatible.
6800	 */
6801	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
6802	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
6803		VERIFY0(zap_add(spa->spa_meta_objset,
6804		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
6805		    sizeof (spa->spa_cksum_salt.zcs_bytes),
6806		    spa->spa_cksum_salt.zcs_bytes, tx));
6807	}
6808
6809	rrw_exit(&dp->dp_config_rwlock, FTAG);
6810 }
6811
6812 static void
6813 vdev_indirect_state_sync_verify(vdev_t *vd)
6814 {
6815	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6816	vdev_indirect_births_t *vib = vd->vdev_indirect_births;
6817
6818	if (vd->vdev_ops == &vdev_indirect_ops) {
6819		ASSERT(vim != NULL);
6820		ASSERT(vib != NULL);
6821	}
6822
6823	if (vdev_obsolete_sm_object(vd) != 0) {
6824		ASSERT(vd->vdev_obsolete_sm != NULL);
6825		ASSERT(vd->vdev_removing ||
6826		    vd->vdev_ops == &vdev_indirect_ops);
6827		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
6828		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
6829
6830		ASSERT3U(vdev_obsolete_sm_object(vd), ==,
6831		    space_map_object(vd->vdev_obsolete_sm));
6832		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
6833		    space_map_allocated(vd->vdev_obsolete_sm));
6834	}
6835	ASSERT(vd->vdev_obsolete_segments != NULL);
6836
6837	/*
6838	 * Since frees / remaps to an indirect vdev can only
6839	 * happen in syncing context, the obsolete segments
6840	 * tree must be empty when we start syncing.
6841	 */
6842	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
6843 }
6844
6845 /*
6846  * Sync the specified transaction group. New blocks may be dirtied as
6847  * part of the process, so we iterate until it converges.
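 * Roughly, each pass runs spa_sync_config_object(), the aux-vdev
 * syncs, dsl_pool_sync(), ddt_sync() and vdev_sync(); the do/while
 * loop below repeats until the MOS is no longer dirty for this txg.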
6848 */ 6849 void 6850 spa_sync(spa_t *spa, uint64_t txg) 6851 { 6852 dsl_pool_t *dp = spa->spa_dsl_pool; 6853 objset_t *mos = spa->spa_meta_objset; 6854 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6855 vdev_t *rvd = spa->spa_root_vdev; 6856 vdev_t *vd; 6857 dmu_tx_t *tx; 6858 int error; 6859 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 6860 zfs_vdev_queue_depth_pct / 100; 6861 6862 VERIFY(spa_writeable(spa)); 6863 6864 /* 6865 * Wait for i/os issued in open context that need to complete 6866 * before this txg syncs. 6867 */ 6868 VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); 6869 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); 6870 6871 /* 6872 * Lock out configuration changes. 6873 */ 6874 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6875 6876 spa->spa_syncing_txg = txg; 6877 spa->spa_sync_pass = 0; 6878 6879 mutex_enter(&spa->spa_alloc_lock); 6880 VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 6881 mutex_exit(&spa->spa_alloc_lock); 6882 6883 /* 6884 * If there are any pending vdev state changes, convert them 6885 * into config changes that go out with this transaction group. 6886 */ 6887 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6888 while (list_head(&spa->spa_state_dirty_list) != NULL) { 6889 /* 6890 * We need the write lock here because, for aux vdevs, 6891 * calling vdev_config_dirty() modifies sav_config. 6892 * This is ugly and will become unnecessary when we 6893 * eliminate the aux vdev wart by integrating all vdevs 6894 * into the root vdev tree. 6895 */ 6896 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6897 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6898 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6899 vdev_state_clean(vd); 6900 vdev_config_dirty(vd); 6901 } 6902 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6903 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6904 } 6905 spa_config_exit(spa, SCL_STATE, FTAG); 6906 6907 tx = dmu_tx_create_assigned(dp, txg); 6908 6909 spa->spa_sync_starttime = gethrtime(); 6910 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6911 spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6912 6913 /* 6914 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6915 * set spa_deflate if we have no raid-z vdevs. 6916 */ 6917 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6918 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6919 int i; 6920 6921 for (i = 0; i < rvd->vdev_children; i++) { 6922 vd = rvd->vdev_child[i]; 6923 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6924 break; 6925 } 6926 if (i == rvd->vdev_children) { 6927 spa->spa_deflate = TRUE; 6928 VERIFY(0 == zap_add(spa->spa_meta_objset, 6929 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6930 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6931 } 6932 } 6933 6934 /* 6935 * Set the top-level vdev's max queue depth. Evaluate each 6936 * top-level's async write queue depth in case it changed. 6937 * The max queue depth will not change in the middle of syncing 6938 * out this txg. 
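 * For example, if zfs_vdev_async_write_max_active were 10 and
 * zfs_vdev_queue_depth_pct were 1000, each eligible top-level vdev
 * would allow up to 10 * 1000 / 100 = 100 queued async allocations.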
6939 */ 6940 uint64_t queue_depth_total = 0; 6941 for (int c = 0; c < rvd->vdev_children; c++) { 6942 vdev_t *tvd = rvd->vdev_child[c]; 6943 metaslab_group_t *mg = tvd->vdev_mg; 6944 6945 if (mg == NULL || mg->mg_class != spa_normal_class(spa) || 6946 !metaslab_group_initialized(mg)) 6947 continue; 6948 6949 /* 6950 * It is safe to do a lock-free check here because only async 6951 * allocations look at mg_max_alloc_queue_depth, and async 6952 * allocations all happen from spa_sync(). 6953 */ 6954 ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); 6955 mg->mg_max_alloc_queue_depth = max_queue_depth; 6956 queue_depth_total += mg->mg_max_alloc_queue_depth; 6957 } 6958 metaslab_class_t *mc = spa_normal_class(spa); 6959 ASSERT0(refcount_count(&mc->mc_alloc_slots)); 6960 mc->mc_alloc_max_slots = queue_depth_total; 6961 mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 6962 6963 ASSERT3U(mc->mc_alloc_max_slots, <=, 6964 max_queue_depth * rvd->vdev_children); 6965 6966 for (int c = 0; c < rvd->vdev_children; c++) { 6967 vdev_t *vd = rvd->vdev_child[c]; 6968 vdev_indirect_state_sync_verify(vd); 6969 6970 if (vdev_indirect_should_condense(vd)) { 6971 spa_condense_indirect_start_sync(vd, tx); 6972 break; 6973 } 6974 } 6975 6976 /* 6977 * Iterate to convergence. 6978 */ 6979 do { 6980 int pass = ++spa->spa_sync_pass; 6981 6982 spa_sync_config_object(spa, tx); 6983 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6984 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6985 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6986 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6987 spa_errlog_sync(spa, txg); 6988 dsl_pool_sync(dp, txg); 6989 6990 if (pass < zfs_sync_pass_deferred_free) { 6991 spa_sync_frees(spa, free_bpl, tx); 6992 } else { 6993 /* 6994 * We can not defer frees in pass 1, because 6995 * we sync the deferred frees later in pass 1. 6996 */ 6997 ASSERT3U(pass, >, 1); 6998 bplist_iterate(free_bpl, bpobj_enqueue_cb, 6999 &spa->spa_deferred_bpobj, tx); 7000 } 7001 7002 ddt_sync(spa, txg); 7003 dsl_scan_sync(dp, tx); 7004 7005 if (spa->spa_vdev_removal != NULL) 7006 svr_sync(spa, tx); 7007 7008 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 7009 != NULL) 7010 vdev_sync(vd, txg); 7011 7012 if (pass == 1) { 7013 spa_sync_upgrades(spa, tx); 7014 ASSERT3U(txg, >=, 7015 spa->spa_uberblock.ub_rootbp.blk_birth); 7016 /* 7017 * Note: We need to check if the MOS is dirty 7018 * because we could have marked the MOS dirty 7019 * without updating the uberblock (e.g. if we 7020 * have sync tasks but no dirty user data). We 7021 * need to check the uberblock's rootbp because 7022 * it is updated if we have synced out dirty 7023 * data (though in this case the MOS will most 7024 * likely also be dirty due to second order 7025 * effects, we don't want to rely on that here). 7026 */ 7027 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 7028 !dmu_objset_is_dirty(mos, txg)) { 7029 /* 7030 * Nothing changed on the first pass, 7031 * therefore this TXG is a no-op. Avoid 7032 * syncing deferred frees, so that we 7033 * can keep this TXG as a no-op. 
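 * (Syncing the deferred frees here would itself dirty the MOS and
 * force an uberblock update, defeating the optimization.)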
7034 */ 7035 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 7036 txg)); 7037 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7038 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 7039 break; 7040 } 7041 spa_sync_deferred_frees(spa, tx); 7042 } 7043 7044 } while (dmu_objset_is_dirty(mos, txg)); 7045 7046 if (!list_is_empty(&spa->spa_config_dirty_list)) { 7047 /* 7048 * Make sure that the number of ZAPs for all the vdevs matches 7049 * the number of ZAPs in the per-vdev ZAP list. This only gets 7050 * called if the config is dirty; otherwise there may be 7051 * outstanding AVZ operations that weren't completed in 7052 * spa_sync_config_object. 7053 */ 7054 uint64_t all_vdev_zap_entry_count; 7055 ASSERT0(zap_count(spa->spa_meta_objset, 7056 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 7057 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 7058 all_vdev_zap_entry_count); 7059 } 7060 7061 if (spa->spa_vdev_removal != NULL) { 7062 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 7063 } 7064 7065 /* 7066 * Rewrite the vdev configuration (which includes the uberblock) 7067 * to commit the transaction group. 7068 * 7069 * If there are no dirty vdevs, we sync the uberblock to a few 7070 * random top-level vdevs that are known to be visible in the 7071 * config cache (see spa_vdev_add() for a complete description). 7072 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 7073 */ 7074 for (;;) { 7075 /* 7076 * We hold SCL_STATE to prevent vdev open/close/etc. 7077 * while we're attempting to write the vdev labels. 7078 */ 7079 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 7080 7081 if (list_is_empty(&spa->spa_config_dirty_list)) { 7082 vdev_t *svd[SPA_DVAS_PER_BP]; 7083 int svdcount = 0; 7084 int children = rvd->vdev_children; 7085 int c0 = spa_get_random(children); 7086 7087 for (int c = 0; c < children; c++) { 7088 vd = rvd->vdev_child[(c0 + c) % children]; 7089 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 7090 !vdev_is_concrete(vd)) 7091 continue; 7092 svd[svdcount++] = vd; 7093 if (svdcount == SPA_DVAS_PER_BP) 7094 break; 7095 } 7096 error = vdev_config_sync(svd, svdcount, txg); 7097 } else { 7098 error = vdev_config_sync(rvd->vdev_child, 7099 rvd->vdev_children, txg); 7100 } 7101 7102 if (error == 0) 7103 spa->spa_last_synced_guid = rvd->vdev_guid; 7104 7105 spa_config_exit(spa, SCL_STATE, FTAG); 7106 7107 if (error == 0) 7108 break; 7109 zio_suspend(spa, NULL); 7110 zio_resume_wait(spa); 7111 } 7112 dmu_tx_commit(tx); 7113 7114 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 7115 7116 /* 7117 * Clear the dirty config list. 7118 */ 7119 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 7120 vdev_config_clean(vd); 7121 7122 /* 7123 * Now that the new config has synced transactionally, 7124 * let it become visible to the config cache. 7125 */ 7126 if (spa->spa_config_syncing != NULL) { 7127 spa_config_set(spa, spa->spa_config_syncing); 7128 spa->spa_config_txg = txg; 7129 spa->spa_config_syncing = NULL; 7130 } 7131 7132 dsl_pool_sync_done(dp, txg); 7133 7134 mutex_enter(&spa->spa_alloc_lock); 7135 VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); 7136 mutex_exit(&spa->spa_alloc_lock); 7137 7138 /* 7139 * Update usable space statistics. 7140 */ 7141 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 7142 vdev_sync_done(vd, txg); 7143 7144 spa_update_dspace(spa); 7145 7146 /* 7147 * It had better be the case that we didn't dirty anything 7148 * since vdev_config_sync(). 
7149 */ 7150 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 7151 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 7152 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 7153 7154 spa->spa_sync_pass = 0; 7155 7156 /* 7157 * Update the last synced uberblock here. We want to do this at 7158 * the end of spa_sync() so that consumers of spa_last_synced_txg() 7159 * will be guaranteed that all the processing associated with 7160 * that txg has been completed. 7161 */ 7162 spa->spa_ubsync = spa->spa_uberblock; 7163 spa_config_exit(spa, SCL_CONFIG, FTAG); 7164 7165 spa_handle_ignored_writes(spa); 7166 7167 /* 7168 * If any async tasks have been requested, kick them off. 7169 */ 7170 spa_async_dispatch(spa); 7171 } 7172 7173 /* 7174 * Sync all pools. We don't want to hold the namespace lock across these 7175 * operations, so we take a reference on the spa_t and drop the lock during the 7176 * sync. 7177 */ 7178 void 7179 spa_sync_allpools(void) 7180 { 7181 spa_t *spa = NULL; 7182 mutex_enter(&spa_namespace_lock); 7183 while ((spa = spa_next(spa)) != NULL) { 7184 if (spa_state(spa) != POOL_STATE_ACTIVE || 7185 !spa_writeable(spa) || spa_suspended(spa)) 7186 continue; 7187 spa_open_ref(spa, FTAG); 7188 mutex_exit(&spa_namespace_lock); 7189 txg_wait_synced(spa_get_dsl(spa), 0); 7190 mutex_enter(&spa_namespace_lock); 7191 spa_close(spa, FTAG); 7192 } 7193 mutex_exit(&spa_namespace_lock); 7194 } 7195 7196 /* 7197 * ========================================================================== 7198 * Miscellaneous routines 7199 * ========================================================================== 7200 */ 7201 7202 /* 7203 * Remove all pools in the system. 7204 */ 7205 void 7206 spa_evict_all(void) 7207 { 7208 spa_t *spa; 7209 7210 /* 7211 * Remove all cached state. All pools should be closed now, 7212 * so every spa in the AVL tree should be unreferenced. 7213 */ 7214 mutex_enter(&spa_namespace_lock); 7215 while ((spa = spa_next(NULL)) != NULL) { 7216 /* 7217 * Stop async tasks. The async thread may need to detach 7218 * a device that's been replaced, which requires grabbing 7219 * spa_namespace_lock, so we must drop it here. 7220 */ 7221 spa_open_ref(spa, FTAG); 7222 mutex_exit(&spa_namespace_lock); 7223 spa_async_suspend(spa); 7224 mutex_enter(&spa_namespace_lock); 7225 spa_close(spa, FTAG); 7226 7227 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7228 spa_unload(spa); 7229 spa_deactivate(spa); 7230 } 7231 spa_remove(spa); 7232 } 7233 mutex_exit(&spa_namespace_lock); 7234 } 7235 7236 vdev_t * 7237 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 7238 { 7239 vdev_t *vd; 7240 int i; 7241 7242 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 7243 return (vd); 7244 7245 if (aux) { 7246 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 7247 vd = spa->spa_l2cache.sav_vdevs[i]; 7248 if (vd->vdev_guid == guid) 7249 return (vd); 7250 } 7251 7252 for (i = 0; i < spa->spa_spares.sav_count; i++) { 7253 vd = spa->spa_spares.sav_vdevs[i]; 7254 if (vd->vdev_guid == guid) 7255 return (vd); 7256 } 7257 } 7258 7259 return (NULL); 7260 } 7261 7262 void 7263 spa_upgrade(spa_t *spa, uint64_t version) 7264 { 7265 ASSERT(spa_writeable(spa)); 7266 7267 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7268 7269 /* 7270 * This should only be called for a non-faulted pool, and since a 7271 * future version would result in an unopenable pool, this shouldn't be 7272 * possible. 
*/
7274	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
7275	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
7276
7277	spa->spa_uberblock.ub_version = version;
7278	vdev_config_dirty(spa->spa_root_vdev);
7279
7280	spa_config_exit(spa, SCL_ALL, FTAG);
7281
7282	txg_wait_synced(spa_get_dsl(spa), 0);
7283 }
7284
7285 boolean_t
7286 spa_has_spare(spa_t *spa, uint64_t guid)
7287 {
7288	int i;
7289	uint64_t spareguid;
7290	spa_aux_vdev_t *sav = &spa->spa_spares;
7291
7292	for (i = 0; i < sav->sav_count; i++)
7293		if (sav->sav_vdevs[i]->vdev_guid == guid)
7294			return (B_TRUE);
7295
7296	for (i = 0; i < sav->sav_npending; i++) {
7297		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
7298		    &spareguid) == 0 && spareguid == guid)
7299			return (B_TRUE);
7300	}
7301
7302	return (B_FALSE);
7303 }
7304
7305 /*
7306  * Check if a pool has an active shared spare device.
7307  * Note: the reference count of an active spare is 2 (once as a spare and once as a replacing device).
7308  */
7309 static boolean_t
7310 spa_has_active_shared_spare(spa_t *spa)
7311 {
7312	int i, refcnt;
7313	uint64_t pool;
7314	spa_aux_vdev_t *sav = &spa->spa_spares;
7315
7316	for (i = 0; i < sav->sav_count; i++) {
7317		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
7318		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
7319		    refcnt > 2)
7320			return (B_TRUE);
7321	}
7322
7323	return (B_FALSE);
7324 }
7325
7326 sysevent_t *
7327 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
7328 {
7329	sysevent_t *ev = NULL;
7330 #ifdef _KERNEL
7331	sysevent_attr_list_t *attr = NULL;
7332	sysevent_value_t value;
7333
7334	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
7335	    SE_SLEEP);
7336	ASSERT(ev != NULL);
7337
7338	value.value_type = SE_DATA_TYPE_STRING;
7339	value.value.sv_string = spa_name(spa);
7340	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
7341		goto done;
7342
7343	value.value_type = SE_DATA_TYPE_UINT64;
7344	value.value.sv_uint64 = spa_guid(spa);
7345	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
7346		goto done;
7347
7348	if (vd) {
7349		value.value_type = SE_DATA_TYPE_UINT64;
7350		value.value.sv_uint64 = vd->vdev_guid;
7351		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
7352		    SE_SLEEP) != 0)
7353			goto done;
7354
7355		if (vd->vdev_path) {
7356			value.value_type = SE_DATA_TYPE_STRING;
7357			value.value.sv_string = vd->vdev_path;
7358			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
7359			    &value, SE_SLEEP) != 0)
7360				goto done;
7361		}
7362	}
7363
7364	if (hist_nvl != NULL) {
7365		fnvlist_merge((nvlist_t *)attr, hist_nvl);
7366	}
7367
7368	if (sysevent_attach_attributes(ev, attr) != 0)
7369		goto done;
7370	attr = NULL;
7371
7372 done:
7373	if (attr)
7374		sysevent_free_attr(attr);
7375
7376 #endif
7377	return (ev);
7378 }
7379
7380 void
7381 spa_event_post(sysevent_t *ev)
7382 {
7383 #ifdef _KERNEL
7384	sysevent_id_t eid;
7385
7386	(void) log_sysevent(ev, SE_SLEEP, &eid);
7387	sysevent_free(ev);
7388 #endif
7389 }
7390
7391 void
7392 spa_event_discard(sysevent_t *ev)
7393 {
7394 #ifdef _KERNEL
7395	sysevent_free(ev);
7396 #endif
7397 }
7398
7399 /*
7400  * Post a sysevent corresponding to the given event. The 'name' must be one of
7401  * the event definitions in sys/sysevent/eventdefs.h. The payload will be
7402  * filled in from the spa and (optionally) the vdev and history nvl. This
7403  * doesn't do anything in the userland libzpool, as we don't want consumers to
7404  * misinterpret ztest or zdb as real changes.
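 * A typical call (sketch) is
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 * with the vdev and history nvlist supplied only when relevant.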
7405 */ 7406 void 7407 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 7408 { 7409 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 7410 } 7411