1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5ea8dc4b6Seschrock * Common Development and Distribution License (the "License"). 6ea8dc4b6Seschrock * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21*99653d4eSeschrock 22fa9e4066Sahrens /* 23c67d9675Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24fa9e4066Sahrens * Use is subject to license terms. 25fa9e4066Sahrens */ 26fa9e4066Sahrens 27fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28fa9e4066Sahrens 29fa9e4066Sahrens /* 30fa9e4066Sahrens * This file contains all the routines used when modifying on-disk SPA state. 31fa9e4066Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 32fa9e4066Sahrens * pool. 
33fa9e4066Sahrens */ 34fa9e4066Sahrens 35fa9e4066Sahrens #include <sys/zfs_context.h> 36ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h> 37fa9e4066Sahrens #include <sys/spa_impl.h> 38fa9e4066Sahrens #include <sys/zio.h> 39fa9e4066Sahrens #include <sys/zio_checksum.h> 40fa9e4066Sahrens #include <sys/zio_compress.h> 41fa9e4066Sahrens #include <sys/dmu.h> 42fa9e4066Sahrens #include <sys/dmu_tx.h> 43fa9e4066Sahrens #include <sys/zap.h> 44fa9e4066Sahrens #include <sys/zil.h> 45fa9e4066Sahrens #include <sys/vdev_impl.h> 46fa9e4066Sahrens #include <sys/metaslab.h> 47fa9e4066Sahrens #include <sys/uberblock_impl.h> 48fa9e4066Sahrens #include <sys/txg.h> 49fa9e4066Sahrens #include <sys/avl.h> 50fa9e4066Sahrens #include <sys/dmu_traverse.h> 51fa9e4066Sahrens #include <sys/unique.h> 52fa9e4066Sahrens #include <sys/dsl_pool.h> 53fa9e4066Sahrens #include <sys/dsl_dir.h> 54fa9e4066Sahrens #include <sys/dsl_prop.h> 55fa9e4066Sahrens #include <sys/fs/zfs.h> 56fa9e4066Sahrens #include <sys/callb.h> 57fa9e4066Sahrens 58fa9e4066Sahrens /* 59fa9e4066Sahrens * ========================================================================== 60fa9e4066Sahrens * SPA state manipulation (open/create/destroy/import/export) 61fa9e4066Sahrens * ========================================================================== 62fa9e4066Sahrens */ 63fa9e4066Sahrens 64ea8dc4b6Seschrock static int 65ea8dc4b6Seschrock spa_error_entry_compare(const void *a, const void *b) 66ea8dc4b6Seschrock { 67ea8dc4b6Seschrock spa_error_entry_t *sa = (spa_error_entry_t *)a; 68ea8dc4b6Seschrock spa_error_entry_t *sb = (spa_error_entry_t *)b; 69ea8dc4b6Seschrock int ret; 70ea8dc4b6Seschrock 71ea8dc4b6Seschrock ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 72ea8dc4b6Seschrock sizeof (zbookmark_t)); 73ea8dc4b6Seschrock 74ea8dc4b6Seschrock if (ret < 0) 75ea8dc4b6Seschrock return (-1); 76ea8dc4b6Seschrock else if (ret > 0) 77ea8dc4b6Seschrock return (1); 78ea8dc4b6Seschrock else 79ea8dc4b6Seschrock return (0); 80ea8dc4b6Seschrock } 
81ea8dc4b6Seschrock 82ea8dc4b6Seschrock /* 83ea8dc4b6Seschrock * Utility function which retrieves copies of the current logs and 84ea8dc4b6Seschrock * re-initializes them in the process. 85ea8dc4b6Seschrock */ 86ea8dc4b6Seschrock void 87ea8dc4b6Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 88ea8dc4b6Seschrock { 89ea8dc4b6Seschrock ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 90ea8dc4b6Seschrock 91ea8dc4b6Seschrock bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 92ea8dc4b6Seschrock bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 93ea8dc4b6Seschrock 94ea8dc4b6Seschrock avl_create(&spa->spa_errlist_scrub, 95ea8dc4b6Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 96ea8dc4b6Seschrock offsetof(spa_error_entry_t, se_avl)); 97ea8dc4b6Seschrock avl_create(&spa->spa_errlist_last, 98ea8dc4b6Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 99ea8dc4b6Seschrock offsetof(spa_error_entry_t, se_avl)); 100ea8dc4b6Seschrock } 101ea8dc4b6Seschrock 102fa9e4066Sahrens /* 103fa9e4066Sahrens * Activate an uninitialized pool. 
104fa9e4066Sahrens */ 105fa9e4066Sahrens static void 106fa9e4066Sahrens spa_activate(spa_t *spa) 107fa9e4066Sahrens { 108fa9e4066Sahrens int t; 109fa9e4066Sahrens 110fa9e4066Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 111fa9e4066Sahrens 112fa9e4066Sahrens spa->spa_state = POOL_STATE_ACTIVE; 113fa9e4066Sahrens 114fa9e4066Sahrens spa->spa_normal_class = metaslab_class_create(); 115fa9e4066Sahrens 116fa9e4066Sahrens for (t = 0; t < ZIO_TYPES; t++) { 117fa9e4066Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 118fa9e4066Sahrens 8, maxclsyspri, 50, INT_MAX, 119fa9e4066Sahrens TASKQ_PREPOPULATE); 120fa9e4066Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 121fa9e4066Sahrens 8, maxclsyspri, 50, INT_MAX, 122fa9e4066Sahrens TASKQ_PREPOPULATE); 123fa9e4066Sahrens } 124fa9e4066Sahrens 125fa9e4066Sahrens rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 126fa9e4066Sahrens 127fa9e4066Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 128fa9e4066Sahrens offsetof(vdev_t, vdev_dirty_node)); 129fa9e4066Sahrens 130fa9e4066Sahrens txg_list_create(&spa->spa_vdev_txg_list, 131fa9e4066Sahrens offsetof(struct vdev, vdev_txg_node)); 132ea8dc4b6Seschrock 133ea8dc4b6Seschrock avl_create(&spa->spa_errlist_scrub, 134ea8dc4b6Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 135ea8dc4b6Seschrock offsetof(spa_error_entry_t, se_avl)); 136ea8dc4b6Seschrock avl_create(&spa->spa_errlist_last, 137ea8dc4b6Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 138ea8dc4b6Seschrock offsetof(spa_error_entry_t, se_avl)); 139fa9e4066Sahrens } 140fa9e4066Sahrens 141fa9e4066Sahrens /* 142fa9e4066Sahrens * Opposite of spa_activate(). 
143fa9e4066Sahrens */ 144fa9e4066Sahrens static void 145fa9e4066Sahrens spa_deactivate(spa_t *spa) 146fa9e4066Sahrens { 147fa9e4066Sahrens int t; 148fa9e4066Sahrens 149fa9e4066Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 150fa9e4066Sahrens ASSERT(spa->spa_dsl_pool == NULL); 151fa9e4066Sahrens ASSERT(spa->spa_root_vdev == NULL); 152fa9e4066Sahrens 153fa9e4066Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 154fa9e4066Sahrens 155fa9e4066Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 156fa9e4066Sahrens 157fa9e4066Sahrens list_destroy(&spa->spa_dirty_list); 158fa9e4066Sahrens 159fa9e4066Sahrens rw_destroy(&spa->spa_traverse_lock); 160fa9e4066Sahrens 161fa9e4066Sahrens for (t = 0; t < ZIO_TYPES; t++) { 162fa9e4066Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 163fa9e4066Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 164fa9e4066Sahrens spa->spa_zio_issue_taskq[t] = NULL; 165fa9e4066Sahrens spa->spa_zio_intr_taskq[t] = NULL; 166fa9e4066Sahrens } 167fa9e4066Sahrens 168fa9e4066Sahrens metaslab_class_destroy(spa->spa_normal_class); 169fa9e4066Sahrens spa->spa_normal_class = NULL; 170fa9e4066Sahrens 171ea8dc4b6Seschrock /* 172ea8dc4b6Seschrock * If this was part of an import or the open otherwise failed, we may 173ea8dc4b6Seschrock * still have errors left in the queues. Empty them just in case. 174ea8dc4b6Seschrock */ 175ea8dc4b6Seschrock spa_errlog_drain(spa); 176ea8dc4b6Seschrock 177ea8dc4b6Seschrock avl_destroy(&spa->spa_errlist_scrub); 178ea8dc4b6Seschrock avl_destroy(&spa->spa_errlist_last); 179ea8dc4b6Seschrock 180fa9e4066Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 181fa9e4066Sahrens } 182fa9e4066Sahrens 183fa9e4066Sahrens /* 184fa9e4066Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. This 185fa9e4066Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 186fa9e4066Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 
187fa9e4066Sahrens * All vdev validation is done by the vdev_alloc() routine. 188fa9e4066Sahrens */ 189*99653d4eSeschrock static int 190*99653d4eSeschrock spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 191*99653d4eSeschrock uint_t id, int atype) 192fa9e4066Sahrens { 193fa9e4066Sahrens nvlist_t **child; 194fa9e4066Sahrens uint_t c, children; 195*99653d4eSeschrock int error; 196fa9e4066Sahrens 197*99653d4eSeschrock if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 198*99653d4eSeschrock return (error); 199fa9e4066Sahrens 200*99653d4eSeschrock if ((*vdp)->vdev_ops->vdev_op_leaf) 201*99653d4eSeschrock return (0); 202fa9e4066Sahrens 203fa9e4066Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 204fa9e4066Sahrens &child, &children) != 0) { 205*99653d4eSeschrock vdev_free(*vdp); 206*99653d4eSeschrock *vdp = NULL; 207*99653d4eSeschrock return (EINVAL); 208fa9e4066Sahrens } 209fa9e4066Sahrens 210fa9e4066Sahrens for (c = 0; c < children; c++) { 211*99653d4eSeschrock vdev_t *vd; 212*99653d4eSeschrock if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 213*99653d4eSeschrock atype)) != 0) { 214*99653d4eSeschrock vdev_free(*vdp); 215*99653d4eSeschrock *vdp = NULL; 216*99653d4eSeschrock return (error); 217fa9e4066Sahrens } 218fa9e4066Sahrens } 219fa9e4066Sahrens 220*99653d4eSeschrock ASSERT(*vdp != NULL); 221*99653d4eSeschrock 222*99653d4eSeschrock return (0); 223fa9e4066Sahrens } 224fa9e4066Sahrens 225fa9e4066Sahrens /* 226fa9e4066Sahrens * Opposite of spa_load(). 227fa9e4066Sahrens */ 228fa9e4066Sahrens static void 229fa9e4066Sahrens spa_unload(spa_t *spa) 230fa9e4066Sahrens { 231*99653d4eSeschrock int i; 232*99653d4eSeschrock 233ea8dc4b6Seschrock /* 234ea8dc4b6Seschrock * Stop async tasks. 235ea8dc4b6Seschrock */ 236ea8dc4b6Seschrock spa_async_suspend(spa); 237ea8dc4b6Seschrock 238fa9e4066Sahrens /* 239fa9e4066Sahrens * Stop syncing. 
240fa9e4066Sahrens */ 241fa9e4066Sahrens if (spa->spa_sync_on) { 242fa9e4066Sahrens txg_sync_stop(spa->spa_dsl_pool); 243fa9e4066Sahrens spa->spa_sync_on = B_FALSE; 244fa9e4066Sahrens } 245fa9e4066Sahrens 246fa9e4066Sahrens /* 247fa9e4066Sahrens * Wait for any outstanding prefetch I/O to complete. 248fa9e4066Sahrens */ 249ea8dc4b6Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 250ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 251fa9e4066Sahrens 252fa9e4066Sahrens /* 253fa9e4066Sahrens * Close the dsl pool. 254fa9e4066Sahrens */ 255fa9e4066Sahrens if (spa->spa_dsl_pool) { 256fa9e4066Sahrens dsl_pool_close(spa->spa_dsl_pool); 257fa9e4066Sahrens spa->spa_dsl_pool = NULL; 258fa9e4066Sahrens } 259fa9e4066Sahrens 260fa9e4066Sahrens /* 261fa9e4066Sahrens * Close all vdevs. 262fa9e4066Sahrens */ 2630e34b6a7Sbonwick if (spa->spa_root_vdev) 264fa9e4066Sahrens vdev_free(spa->spa_root_vdev); 2650e34b6a7Sbonwick ASSERT(spa->spa_root_vdev == NULL); 266ea8dc4b6Seschrock 267*99653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 268*99653d4eSeschrock vdev_free(spa->spa_spares[i]); 269*99653d4eSeschrock if (spa->spa_spares) { 270*99653d4eSeschrock kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 271*99653d4eSeschrock spa->spa_spares = NULL; 272*99653d4eSeschrock } 273*99653d4eSeschrock if (spa->spa_sparelist) { 274*99653d4eSeschrock nvlist_free(spa->spa_sparelist); 275*99653d4eSeschrock spa->spa_sparelist = NULL; 276*99653d4eSeschrock } 277*99653d4eSeschrock 278ea8dc4b6Seschrock spa->spa_async_suspended = 0; 279fa9e4066Sahrens } 280fa9e4066Sahrens 281*99653d4eSeschrock /* 282*99653d4eSeschrock * Load (or re-load) the current list of vdevs describing the active spares for 283*99653d4eSeschrock * this pool. When this is called, we have some form of basic information in 284*99653d4eSeschrock * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 285*99653d4eSeschrock * re-generate a more complete list including status information. 
286*99653d4eSeschrock */ 287*99653d4eSeschrock static void 288*99653d4eSeschrock spa_load_spares(spa_t *spa) 289*99653d4eSeschrock { 290*99653d4eSeschrock nvlist_t **spares; 291*99653d4eSeschrock uint_t nspares; 292*99653d4eSeschrock int i; 293*99653d4eSeschrock 294*99653d4eSeschrock /* 295*99653d4eSeschrock * First, close and free any existing spare vdevs. 296*99653d4eSeschrock */ 297*99653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) { 298*99653d4eSeschrock vdev_close(spa->spa_spares[i]); 299*99653d4eSeschrock vdev_free(spa->spa_spares[i]); 300*99653d4eSeschrock } 301*99653d4eSeschrock if (spa->spa_spares) 302*99653d4eSeschrock kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 303*99653d4eSeschrock 304*99653d4eSeschrock if (spa->spa_sparelist == NULL) 305*99653d4eSeschrock nspares = 0; 306*99653d4eSeschrock else 307*99653d4eSeschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 308*99653d4eSeschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 309*99653d4eSeschrock 310*99653d4eSeschrock spa->spa_nspares = (int)nspares; 311*99653d4eSeschrock spa->spa_spares = NULL; 312*99653d4eSeschrock 313*99653d4eSeschrock if (nspares == 0) 314*99653d4eSeschrock return; 315*99653d4eSeschrock 316*99653d4eSeschrock /* 317*99653d4eSeschrock * Construct the array of vdevs, opening them to get status in the 318*99653d4eSeschrock * process. 
319*99653d4eSeschrock */ 320*99653d4eSeschrock spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 321*99653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) { 322*99653d4eSeschrock vdev_t *vd; 323*99653d4eSeschrock 324*99653d4eSeschrock VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 325*99653d4eSeschrock VDEV_ALLOC_SPARE) == 0); 326*99653d4eSeschrock ASSERT(vd != NULL); 327*99653d4eSeschrock 328*99653d4eSeschrock spa->spa_spares[i] = vd; 329*99653d4eSeschrock 330*99653d4eSeschrock if (vdev_open(vd) != 0) 331*99653d4eSeschrock continue; 332*99653d4eSeschrock 333*99653d4eSeschrock vd->vdev_top = vd; 334*99653d4eSeschrock (void) vdev_validate_spare(vd); 335*99653d4eSeschrock } 336*99653d4eSeschrock 337*99653d4eSeschrock /* 338*99653d4eSeschrock * Recompute the stashed list of spares, with status information 339*99653d4eSeschrock * this time. 340*99653d4eSeschrock */ 341*99653d4eSeschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 342*99653d4eSeschrock DATA_TYPE_NVLIST_ARRAY) == 0); 343*99653d4eSeschrock 344*99653d4eSeschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 345*99653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 346*99653d4eSeschrock spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 347*99653d4eSeschrock B_TRUE, B_TRUE); 348*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 349*99653d4eSeschrock spares, spa->spa_nspares) == 0); 350*99653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 351*99653d4eSeschrock nvlist_free(spares[i]); 352*99653d4eSeschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 353*99653d4eSeschrock } 354*99653d4eSeschrock 355*99653d4eSeschrock static int 356*99653d4eSeschrock load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 357*99653d4eSeschrock { 358*99653d4eSeschrock dmu_buf_t *db; 359*99653d4eSeschrock char *packed = NULL; 360*99653d4eSeschrock size_t nvsize = 0; 361*99653d4eSeschrock int error; 
362*99653d4eSeschrock *value = NULL; 363*99653d4eSeschrock 364*99653d4eSeschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 365*99653d4eSeschrock nvsize = *(uint64_t *)db->db_data; 366*99653d4eSeschrock dmu_buf_rele(db, FTAG); 367*99653d4eSeschrock 368*99653d4eSeschrock packed = kmem_alloc(nvsize, KM_SLEEP); 369*99653d4eSeschrock error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 370*99653d4eSeschrock if (error == 0) 371*99653d4eSeschrock error = nvlist_unpack(packed, nvsize, value, 0); 372*99653d4eSeschrock kmem_free(packed, nvsize); 373*99653d4eSeschrock 374*99653d4eSeschrock return (error); 375*99653d4eSeschrock } 376*99653d4eSeschrock 377fa9e4066Sahrens /* 378fa9e4066Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 379ea8dc4b6Seschrock * source of configuration information. 380fa9e4066Sahrens */ 381fa9e4066Sahrens static int 382ea8dc4b6Seschrock spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 383fa9e4066Sahrens { 384fa9e4066Sahrens int error = 0; 385fa9e4066Sahrens nvlist_t *nvroot = NULL; 386fa9e4066Sahrens vdev_t *rvd; 387fa9e4066Sahrens uberblock_t *ub = &spa->spa_uberblock; 3880373e76bSbonwick uint64_t config_cache_txg = spa->spa_config_txg; 389fa9e4066Sahrens uint64_t pool_guid; 390*99653d4eSeschrock uint64_t version; 391fa9e4066Sahrens zio_t *zio; 392fa9e4066Sahrens 393ea8dc4b6Seschrock spa->spa_load_state = state; 3940373e76bSbonwick 395fa9e4066Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 396a9926bf0Sbonwick nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 397ea8dc4b6Seschrock error = EINVAL; 398ea8dc4b6Seschrock goto out; 399ea8dc4b6Seschrock } 400fa9e4066Sahrens 401*99653d4eSeschrock /* 402*99653d4eSeschrock * Versioning wasn't explicitly added to the label until later, so if 403*99653d4eSeschrock * it's not present treat it as the initial version. 
/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 *
 *	spa		pool being loaded
 *	config		configuration nvlist; must contain the vdev tree and
 *			the pool guid
 *	state		why we are loading (open, import, tryimport); stored
 *			in spa_load_state for the duration of the load
 *	mosconfig	B_TRUE if 'config' was read from the pool's own config
 *			object.  When B_FALSE, the config object is read once
 *			the pool is far enough along, the pool is torn down,
 *			and spa_load() calls itself with that config and
 *			mosconfig set to B_TRUE.
 *
 * Returns 0 on success.  Failures set below: EINVAL (config lacks the vdev
 * tree or guid), EEXIST (importing a guid that already exists), EBADF (vdev
 * label validation failed), ENXIO (vdevs can't be opened, no valid
 * uberblock, or guid sum mismatch), ENOTSUP (on-disk version newer than
 * ZFS_VERSION), EIO (a required object could not be read), or the error
 * returned by spa_config_parse() or dsl_pool_open().  An ereport is posted
 * for every failure except EBADF.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	/* txg of the config we started with, before it is overwritten below */
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = ZFS_VERSION_INITIAL;

	/* The pool txg is optional; spa_config_txg is left alone if absent. */
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	/* Refuse to import a pool whose guid is already known. */
	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	/*
	 * Any validation failure is reported as EBADF, which
	 * spa_open_common() treats specially.
	 */
	if (error != 0) {
		error = EBADF;
		goto out;
	}

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	/* The result is not examined; success is judged by ub_txg below. */
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.  This is only enforced for the config
	 * read from the pool itself (mosconfig).
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	/* Locate the object holding the pool's own copy of the config. */
	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * If we were not given the pool's own config, read it now, tear the
	 * pool back down, and start over using that config.
	 */
	if (!mosconfig) {
		nvlist_t *newconfig;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.  ENOENT simply means the pool
	 * has no spares object.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/* The remaining work is skipped when read-only or for a tryimport. */
	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	/*
	 * Success.  'error' may still hold ENOENT from one of the optional
	 * lookups above, so clear it explicitly.
	 */
	error = 0;
out:
	/* Every failure except a label validation failure gets an ereport. */
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}
716fa9e4066Sahrens */ 717fa9e4066Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 718fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 719fa9e4066Sahrens locked = B_TRUE; 720fa9e4066Sahrens } 721fa9e4066Sahrens 722fa9e4066Sahrens if ((spa = spa_lookup(pool)) == NULL) { 723fa9e4066Sahrens if (locked) 724fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 725fa9e4066Sahrens return (ENOENT); 726fa9e4066Sahrens } 727fa9e4066Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 728fa9e4066Sahrens 729fa9e4066Sahrens spa_activate(spa); 730fa9e4066Sahrens 7310373e76bSbonwick error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 732fa9e4066Sahrens 733fa9e4066Sahrens if (error == EBADF) { 734fa9e4066Sahrens /* 735560e6e96Seschrock * If vdev_validate() returns failure (indicated by 736560e6e96Seschrock * EBADF), it indicates that one of the vdevs indicates 737560e6e96Seschrock * that the pool has been exported or destroyed. If 738560e6e96Seschrock * this is the case, the config cache is out of sync and 739560e6e96Seschrock * we should remove the pool from the namespace. 740fa9e4066Sahrens */ 741*99653d4eSeschrock zfs_post_ok(spa, NULL); 742fa9e4066Sahrens spa_unload(spa); 743fa9e4066Sahrens spa_deactivate(spa); 744fa9e4066Sahrens spa_remove(spa); 745fa9e4066Sahrens spa_config_sync(); 746fa9e4066Sahrens if (locked) 747fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 748fa9e4066Sahrens return (ENOENT); 749ea8dc4b6Seschrock } 750ea8dc4b6Seschrock 751ea8dc4b6Seschrock if (error) { 752fa9e4066Sahrens /* 753fa9e4066Sahrens * We can't open the pool, but we still have useful 754fa9e4066Sahrens * information: the state of each vdev after the 755fa9e4066Sahrens * attempted vdev_open(). Return this to the user. 
756fa9e4066Sahrens */ 7570373e76bSbonwick if (config != NULL && spa->spa_root_vdev != NULL) { 7580373e76bSbonwick spa_config_enter(spa, RW_READER, FTAG); 759fa9e4066Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 760fa9e4066Sahrens B_TRUE); 7610373e76bSbonwick spa_config_exit(spa, FTAG); 7620373e76bSbonwick } 763fa9e4066Sahrens spa_unload(spa); 764fa9e4066Sahrens spa_deactivate(spa); 765ea8dc4b6Seschrock spa->spa_last_open_failed = B_TRUE; 766fa9e4066Sahrens if (locked) 767fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 768fa9e4066Sahrens *spapp = NULL; 769fa9e4066Sahrens return (error); 770ea8dc4b6Seschrock } else { 771ea8dc4b6Seschrock zfs_post_ok(spa, NULL); 772ea8dc4b6Seschrock spa->spa_last_open_failed = B_FALSE; 773fa9e4066Sahrens } 774fa9e4066Sahrens 775fa9e4066Sahrens loaded = B_TRUE; 776fa9e4066Sahrens } 777fa9e4066Sahrens 778fa9e4066Sahrens spa_open_ref(spa, tag); 779fa9e4066Sahrens if (locked) 780fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 781fa9e4066Sahrens 782fa9e4066Sahrens *spapp = spa; 783fa9e4066Sahrens 784fa9e4066Sahrens if (config != NULL) { 785ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 786fa9e4066Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 787ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 788fa9e4066Sahrens } 789fa9e4066Sahrens 790fa9e4066Sahrens /* 791fa9e4066Sahrens * If we just loaded the pool, resilver anything that's out of date. 
792fa9e4066Sahrens */ 793fa9e4066Sahrens if (loaded && (spa_mode & FWRITE)) 794fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 795fa9e4066Sahrens 796fa9e4066Sahrens return (0); 797fa9e4066Sahrens } 798fa9e4066Sahrens 799fa9e4066Sahrens int 800fa9e4066Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 801fa9e4066Sahrens { 802fa9e4066Sahrens return (spa_open_common(name, spapp, tag, NULL)); 803fa9e4066Sahrens } 804fa9e4066Sahrens 805ea8dc4b6Seschrock /* 806ea8dc4b6Seschrock * Lookup the given spa_t, incrementing the inject count in the process, 807ea8dc4b6Seschrock * preventing it from being exported or destroyed. 808ea8dc4b6Seschrock */ 809ea8dc4b6Seschrock spa_t * 810ea8dc4b6Seschrock spa_inject_addref(char *name) 811ea8dc4b6Seschrock { 812ea8dc4b6Seschrock spa_t *spa; 813ea8dc4b6Seschrock 814ea8dc4b6Seschrock mutex_enter(&spa_namespace_lock); 815ea8dc4b6Seschrock if ((spa = spa_lookup(name)) == NULL) { 816ea8dc4b6Seschrock mutex_exit(&spa_namespace_lock); 817ea8dc4b6Seschrock return (NULL); 818ea8dc4b6Seschrock } 819ea8dc4b6Seschrock spa->spa_inject_ref++; 820ea8dc4b6Seschrock mutex_exit(&spa_namespace_lock); 821ea8dc4b6Seschrock 822ea8dc4b6Seschrock return (spa); 823ea8dc4b6Seschrock } 824ea8dc4b6Seschrock 825ea8dc4b6Seschrock void 826ea8dc4b6Seschrock spa_inject_delref(spa_t *spa) 827ea8dc4b6Seschrock { 828ea8dc4b6Seschrock mutex_enter(&spa_namespace_lock); 829ea8dc4b6Seschrock spa->spa_inject_ref--; 830ea8dc4b6Seschrock mutex_exit(&spa_namespace_lock); 831ea8dc4b6Seschrock } 832ea8dc4b6Seschrock 833*99653d4eSeschrock static void 834*99653d4eSeschrock spa_add_spares(spa_t *spa, nvlist_t *config) 835*99653d4eSeschrock { 836*99653d4eSeschrock nvlist_t **spares; 837*99653d4eSeschrock uint_t i, nspares; 838*99653d4eSeschrock nvlist_t *nvroot; 839*99653d4eSeschrock uint64_t guid; 840*99653d4eSeschrock vdev_stat_t *vs; 841*99653d4eSeschrock uint_t vsc; 842*99653d4eSeschrock 843*99653d4eSeschrock if (spa->spa_nspares == 0) 
844*99653d4eSeschrock return; 845*99653d4eSeschrock 846*99653d4eSeschrock VERIFY(nvlist_lookup_nvlist(config, 847*99653d4eSeschrock ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 848*99653d4eSeschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 849*99653d4eSeschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 850*99653d4eSeschrock if (nspares != 0) { 851*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(nvroot, 852*99653d4eSeschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 853*99653d4eSeschrock VERIFY(nvlist_lookup_nvlist_array(nvroot, 854*99653d4eSeschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 855*99653d4eSeschrock 856*99653d4eSeschrock /* 857*99653d4eSeschrock * Go through and find any spares which have since been 858*99653d4eSeschrock * repurposed as an active spare. If this is the case, update 859*99653d4eSeschrock * their status appropriately. 860*99653d4eSeschrock */ 861*99653d4eSeschrock for (i = 0; i < nspares; i++) { 862*99653d4eSeschrock VERIFY(nvlist_lookup_uint64(spares[i], 863*99653d4eSeschrock ZPOOL_CONFIG_GUID, &guid) == 0); 864*99653d4eSeschrock if (spa_spare_inuse(guid)) { 865*99653d4eSeschrock VERIFY(nvlist_lookup_uint64_array( 866*99653d4eSeschrock spares[i], ZPOOL_CONFIG_STATS, 867*99653d4eSeschrock (uint64_t **)&vs, &vsc) == 0); 868*99653d4eSeschrock vs->vs_state = VDEV_STATE_CANT_OPEN; 869*99653d4eSeschrock vs->vs_aux = VDEV_AUX_SPARED; 870*99653d4eSeschrock } 871*99653d4eSeschrock } 872*99653d4eSeschrock } 873*99653d4eSeschrock } 874*99653d4eSeschrock 875fa9e4066Sahrens int 876ea8dc4b6Seschrock spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 877fa9e4066Sahrens { 878fa9e4066Sahrens int error; 879fa9e4066Sahrens spa_t *spa; 880fa9e4066Sahrens 881fa9e4066Sahrens *config = NULL; 882fa9e4066Sahrens error = spa_open_common(name, &spa, FTAG, config); 883fa9e4066Sahrens 884*99653d4eSeschrock if (spa && *config != NULL) { 885ea8dc4b6Seschrock VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 
886ea8dc4b6Seschrock spa_get_errlog_size(spa)) == 0); 887ea8dc4b6Seschrock 888*99653d4eSeschrock spa_add_spares(spa, *config); 889*99653d4eSeschrock } 890*99653d4eSeschrock 891ea8dc4b6Seschrock /* 892ea8dc4b6Seschrock * We want to get the alternate root even for faulted pools, so we cheat 893ea8dc4b6Seschrock * and call spa_lookup() directly. 894ea8dc4b6Seschrock */ 895ea8dc4b6Seschrock if (altroot) { 896ea8dc4b6Seschrock if (spa == NULL) { 897ea8dc4b6Seschrock mutex_enter(&spa_namespace_lock); 898ea8dc4b6Seschrock spa = spa_lookup(name); 899ea8dc4b6Seschrock if (spa) 900ea8dc4b6Seschrock spa_altroot(spa, altroot, buflen); 901ea8dc4b6Seschrock else 902ea8dc4b6Seschrock altroot[0] = '\0'; 903ea8dc4b6Seschrock spa = NULL; 904ea8dc4b6Seschrock mutex_exit(&spa_namespace_lock); 905ea8dc4b6Seschrock } else { 906ea8dc4b6Seschrock spa_altroot(spa, altroot, buflen); 907ea8dc4b6Seschrock } 908ea8dc4b6Seschrock } 909ea8dc4b6Seschrock 910fa9e4066Sahrens if (spa != NULL) 911fa9e4066Sahrens spa_close(spa, FTAG); 912fa9e4066Sahrens 913fa9e4066Sahrens return (error); 914fa9e4066Sahrens } 915fa9e4066Sahrens 916*99653d4eSeschrock /* 917*99653d4eSeschrock * Validate that the 'spares' array is well formed. We must have an array of 918*99653d4eSeschrock * nvlists, each which describes a valid leaf vdev. 919*99653d4eSeschrock */ 920*99653d4eSeschrock static int 921*99653d4eSeschrock spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 922*99653d4eSeschrock { 923*99653d4eSeschrock nvlist_t **spares; 924*99653d4eSeschrock uint_t i, nspares; 925*99653d4eSeschrock vdev_t *vd; 926*99653d4eSeschrock int error; 927*99653d4eSeschrock 928*99653d4eSeschrock /* 929*99653d4eSeschrock * It's acceptable to have no spares specified. 
930*99653d4eSeschrock */ 931*99653d4eSeschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 932*99653d4eSeschrock &spares, &nspares) != 0) 933*99653d4eSeschrock return (0); 934*99653d4eSeschrock 935*99653d4eSeschrock if (nspares == 0) 936*99653d4eSeschrock return (EINVAL); 937*99653d4eSeschrock 938*99653d4eSeschrock /* 939*99653d4eSeschrock * Make sure the pool is formatted with a version that supports hot 940*99653d4eSeschrock * spares. 941*99653d4eSeschrock */ 942*99653d4eSeschrock if (spa_version(spa) < ZFS_VERSION_SPARES) 943*99653d4eSeschrock return (ENOTSUP); 944*99653d4eSeschrock 945*99653d4eSeschrock for (i = 0; i < nspares; i++) { 946*99653d4eSeschrock if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 947*99653d4eSeschrock mode)) != 0) 948*99653d4eSeschrock return (error); 949*99653d4eSeschrock 950*99653d4eSeschrock if (!vd->vdev_ops->vdev_op_leaf) { 951*99653d4eSeschrock vdev_free(vd); 952*99653d4eSeschrock return (EINVAL); 953*99653d4eSeschrock } 954*99653d4eSeschrock 955*99653d4eSeschrock if ((error = vdev_open(vd)) != 0) { 956*99653d4eSeschrock vdev_free(vd); 957*99653d4eSeschrock return (error); 958*99653d4eSeschrock } 959*99653d4eSeschrock 960*99653d4eSeschrock vd->vdev_top = vd; 961*99653d4eSeschrock if ((error = vdev_label_spare(vd, crtxg)) != 0) { 962*99653d4eSeschrock vdev_free(vd); 963*99653d4eSeschrock return (error); 964*99653d4eSeschrock } 965*99653d4eSeschrock 966*99653d4eSeschrock VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 967*99653d4eSeschrock vd->vdev_guid) == 0); 968*99653d4eSeschrock 969*99653d4eSeschrock vdev_free(vd); 970*99653d4eSeschrock } 971*99653d4eSeschrock 972*99653d4eSeschrock return (0); 973*99653d4eSeschrock } 974*99653d4eSeschrock 975fa9e4066Sahrens /* 976fa9e4066Sahrens * Pool Creation 977fa9e4066Sahrens */ 978fa9e4066Sahrens int 9790373e76bSbonwick spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 980fa9e4066Sahrens { 981fa9e4066Sahrens spa_t *spa; 9820373e76bSbonwick 
vdev_t *rvd; 983fa9e4066Sahrens dsl_pool_t *dp; 984fa9e4066Sahrens dmu_tx_t *tx; 985*99653d4eSeschrock int c, error = 0; 986fa9e4066Sahrens uint64_t txg = TXG_INITIAL; 987*99653d4eSeschrock nvlist_t **spares; 988*99653d4eSeschrock uint_t nspares; 989fa9e4066Sahrens 990fa9e4066Sahrens /* 991fa9e4066Sahrens * If this pool already exists, return failure. 992fa9e4066Sahrens */ 993fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 994fa9e4066Sahrens if (spa_lookup(pool) != NULL) { 995fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 996fa9e4066Sahrens return (EEXIST); 997fa9e4066Sahrens } 998fa9e4066Sahrens 999fa9e4066Sahrens /* 1000fa9e4066Sahrens * Allocate a new spa_t structure. 1001fa9e4066Sahrens */ 10020373e76bSbonwick spa = spa_add(pool, altroot); 1003fa9e4066Sahrens spa_activate(spa); 1004fa9e4066Sahrens 1005fa9e4066Sahrens spa->spa_uberblock.ub_txg = txg - 1; 1006eaca9bbdSeschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 1007fa9e4066Sahrens spa->spa_ubsync = spa->spa_uberblock; 1008fa9e4066Sahrens 10090373e76bSbonwick /* 10100373e76bSbonwick * Create the root vdev. 
10110373e76bSbonwick */ 10120373e76bSbonwick spa_config_enter(spa, RW_WRITER, FTAG); 10130373e76bSbonwick 1014*99653d4eSeschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 10150373e76bSbonwick 1016*99653d4eSeschrock ASSERT(error != 0 || rvd != NULL); 1017*99653d4eSeschrock ASSERT(error != 0 || spa->spa_root_vdev == rvd); 10180373e76bSbonwick 1019*99653d4eSeschrock if (error == 0 && rvd->vdev_children == 0) 10200373e76bSbonwick error = EINVAL; 1021*99653d4eSeschrock 1022*99653d4eSeschrock if (error == 0 && 1023*99653d4eSeschrock (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1024*99653d4eSeschrock (error = spa_validate_spares(spa, nvroot, txg, 1025*99653d4eSeschrock VDEV_ALLOC_ADD)) == 0) { 1026*99653d4eSeschrock for (c = 0; c < rvd->vdev_children; c++) 1027*99653d4eSeschrock vdev_init(rvd->vdev_child[c], txg); 1028*99653d4eSeschrock vdev_config_dirty(rvd); 10290373e76bSbonwick } 10300373e76bSbonwick 10310373e76bSbonwick spa_config_exit(spa, FTAG); 1032fa9e4066Sahrens 1033*99653d4eSeschrock if (error != 0) { 1034fa9e4066Sahrens spa_unload(spa); 1035fa9e4066Sahrens spa_deactivate(spa); 1036fa9e4066Sahrens spa_remove(spa); 1037fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1038fa9e4066Sahrens return (error); 1039fa9e4066Sahrens } 1040fa9e4066Sahrens 1041*99653d4eSeschrock /* 1042*99653d4eSeschrock * Get the list of spares, if specified. 
1043*99653d4eSeschrock */ 1044*99653d4eSeschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1045*99653d4eSeschrock &spares, &nspares) == 0) { 1046*99653d4eSeschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1047*99653d4eSeschrock KM_SLEEP) == 0); 1048*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1049*99653d4eSeschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1050*99653d4eSeschrock spa_config_enter(spa, RW_WRITER, FTAG); 1051*99653d4eSeschrock spa_load_spares(spa); 1052*99653d4eSeschrock spa_config_exit(spa, FTAG); 1053*99653d4eSeschrock spa->spa_sync_spares = B_TRUE; 1054*99653d4eSeschrock } 1055*99653d4eSeschrock 1056fa9e4066Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1057fa9e4066Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 1058fa9e4066Sahrens 1059fa9e4066Sahrens tx = dmu_tx_create_assigned(dp, txg); 1060fa9e4066Sahrens 1061fa9e4066Sahrens /* 1062fa9e4066Sahrens * Create the pool config object. 1063fa9e4066Sahrens */ 1064fa9e4066Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1065fa9e4066Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 1066fa9e4066Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1067fa9e4066Sahrens 1068ea8dc4b6Seschrock if (zap_add(spa->spa_meta_objset, 1069fa9e4066Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1070ea8dc4b6Seschrock sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1071ea8dc4b6Seschrock cmn_err(CE_PANIC, "failed to add pool config"); 1072ea8dc4b6Seschrock } 1073fa9e4066Sahrens 1074*99653d4eSeschrock /* Newly created pools are always deflated. 
*/ 1075*99653d4eSeschrock spa->spa_deflate = TRUE; 1076*99653d4eSeschrock if (zap_add(spa->spa_meta_objset, 1077*99653d4eSeschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1078*99653d4eSeschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1079*99653d4eSeschrock cmn_err(CE_PANIC, "failed to add deflate"); 1080*99653d4eSeschrock } 1081*99653d4eSeschrock 1082fa9e4066Sahrens /* 1083fa9e4066Sahrens * Create the deferred-free bplist object. Turn off compression 1084fa9e4066Sahrens * because sync-to-convergence takes longer if the blocksize 1085fa9e4066Sahrens * keeps changing. 1086fa9e4066Sahrens */ 1087fa9e4066Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1088fa9e4066Sahrens 1 << 14, tx); 1089fa9e4066Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1090fa9e4066Sahrens ZIO_COMPRESS_OFF, tx); 1091fa9e4066Sahrens 1092ea8dc4b6Seschrock if (zap_add(spa->spa_meta_objset, 1093fa9e4066Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1094ea8dc4b6Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1095ea8dc4b6Seschrock cmn_err(CE_PANIC, "failed to add bplist"); 1096ea8dc4b6Seschrock } 1097fa9e4066Sahrens 1098fa9e4066Sahrens dmu_tx_commit(tx); 1099fa9e4066Sahrens 1100fa9e4066Sahrens spa->spa_sync_on = B_TRUE; 1101fa9e4066Sahrens txg_sync_start(spa->spa_dsl_pool); 1102fa9e4066Sahrens 1103fa9e4066Sahrens /* 1104fa9e4066Sahrens * We explicitly wait for the first transaction to complete so that our 1105fa9e4066Sahrens * bean counters are appropriately updated. 1106fa9e4066Sahrens */ 1107fa9e4066Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 1108fa9e4066Sahrens 1109fa9e4066Sahrens spa_config_sync(); 1110fa9e4066Sahrens 1111fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1112fa9e4066Sahrens 1113fa9e4066Sahrens return (0); 1114fa9e4066Sahrens } 1115fa9e4066Sahrens 1116fa9e4066Sahrens /* 1117fa9e4066Sahrens * Import the given pool into the system. 
We set up the necessary spa_t and 1118fa9e4066Sahrens * then call spa_load() to do the dirty work. 1119fa9e4066Sahrens */ 1120fa9e4066Sahrens int 11210373e76bSbonwick spa_import(const char *pool, nvlist_t *config, const char *altroot) 1122fa9e4066Sahrens { 1123fa9e4066Sahrens spa_t *spa; 1124fa9e4066Sahrens int error; 1125*99653d4eSeschrock nvlist_t *nvroot; 1126*99653d4eSeschrock nvlist_t **spares; 1127*99653d4eSeschrock uint_t nspares; 1128fa9e4066Sahrens 1129fa9e4066Sahrens if (!(spa_mode & FWRITE)) 1130fa9e4066Sahrens return (EROFS); 1131fa9e4066Sahrens 1132fa9e4066Sahrens /* 1133fa9e4066Sahrens * If a pool with this name exists, return failure. 1134fa9e4066Sahrens */ 1135fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 1136fa9e4066Sahrens if (spa_lookup(pool) != NULL) { 1137fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1138fa9e4066Sahrens return (EEXIST); 1139fa9e4066Sahrens } 1140fa9e4066Sahrens 1141fa9e4066Sahrens /* 11420373e76bSbonwick * Create and initialize the spa structure. 1143fa9e4066Sahrens */ 11440373e76bSbonwick spa = spa_add(pool, altroot); 1145fa9e4066Sahrens spa_activate(spa); 1146fa9e4066Sahrens 11475dabedeeSbonwick /* 11480373e76bSbonwick * Pass off the heavy lifting to spa_load(). 1149ecc2d604Sbonwick * Pass TRUE for mosconfig because the user-supplied config 1150ecc2d604Sbonwick * is actually the one to trust when doing an import. 11515dabedeeSbonwick */ 1152ecc2d604Sbonwick error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1153fa9e4066Sahrens 1154*99653d4eSeschrock spa_config_enter(spa, RW_WRITER, FTAG); 1155*99653d4eSeschrock /* 1156*99653d4eSeschrock * Toss any existing sparelist, as it doesn't have any validity anymore, 1157*99653d4eSeschrock * and conflicts with spa_has_spare(). 
1158*99653d4eSeschrock */ 1159*99653d4eSeschrock if (spa->spa_sparelist) { 1160*99653d4eSeschrock nvlist_free(spa->spa_sparelist); 1161*99653d4eSeschrock spa->spa_sparelist = NULL; 1162*99653d4eSeschrock spa_load_spares(spa); 1163*99653d4eSeschrock } 1164*99653d4eSeschrock 1165*99653d4eSeschrock VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1166*99653d4eSeschrock &nvroot) == 0); 1167*99653d4eSeschrock if (error == 0) 1168*99653d4eSeschrock error = spa_validate_spares(spa, nvroot, -1ULL, 1169*99653d4eSeschrock VDEV_ALLOC_SPARE); 1170*99653d4eSeschrock spa_config_exit(spa, FTAG); 1171*99653d4eSeschrock 1172*99653d4eSeschrock if (error != 0) { 1173fa9e4066Sahrens spa_unload(spa); 1174fa9e4066Sahrens spa_deactivate(spa); 1175fa9e4066Sahrens spa_remove(spa); 1176fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1177fa9e4066Sahrens return (error); 1178fa9e4066Sahrens } 1179fa9e4066Sahrens 1180*99653d4eSeschrock /* 1181*99653d4eSeschrock * Override any spares as specified by the user, as these may have 1182*99653d4eSeschrock * correct device names/devids, etc. 
1183*99653d4eSeschrock */ 1184*99653d4eSeschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1185*99653d4eSeschrock &spares, &nspares) == 0) { 1186*99653d4eSeschrock if (spa->spa_sparelist) 1187*99653d4eSeschrock VERIFY(nvlist_remove(spa->spa_sparelist, 1188*99653d4eSeschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1189*99653d4eSeschrock else 1190*99653d4eSeschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 1191*99653d4eSeschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 1192*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1193*99653d4eSeschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1194*99653d4eSeschrock spa_config_enter(spa, RW_WRITER, FTAG); 1195*99653d4eSeschrock spa_load_spares(spa); 1196*99653d4eSeschrock spa_config_exit(spa, FTAG); 1197*99653d4eSeschrock spa->spa_sync_spares = B_TRUE; 1198*99653d4eSeschrock } 1199*99653d4eSeschrock 12000373e76bSbonwick /* 12010373e76bSbonwick * Update the config cache to include the newly-imported pool. 12020373e76bSbonwick */ 12030373e76bSbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 12040373e76bSbonwick 1205fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1206fa9e4066Sahrens 1207fa9e4066Sahrens /* 1208fa9e4066Sahrens * Resilver anything that's out of date. 1209fa9e4066Sahrens */ 1210fa9e4066Sahrens if (spa_mode & FWRITE) 1211fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1212fa9e4066Sahrens 1213fa9e4066Sahrens return (0); 1214fa9e4066Sahrens } 1215fa9e4066Sahrens 1216fa9e4066Sahrens /* 1217fa9e4066Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 1218fa9e4066Sahrens * to get the vdev stats associated with the imported devices. 
1219fa9e4066Sahrens */ 1220fa9e4066Sahrens #define TRYIMPORT_NAME "$import" 1221fa9e4066Sahrens 1222fa9e4066Sahrens nvlist_t * 1223fa9e4066Sahrens spa_tryimport(nvlist_t *tryconfig) 1224fa9e4066Sahrens { 1225fa9e4066Sahrens nvlist_t *config = NULL; 1226fa9e4066Sahrens char *poolname; 1227fa9e4066Sahrens spa_t *spa; 1228fa9e4066Sahrens uint64_t state; 1229fa9e4066Sahrens 1230fa9e4066Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1231fa9e4066Sahrens return (NULL); 1232fa9e4066Sahrens 1233fa9e4066Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1234fa9e4066Sahrens return (NULL); 1235fa9e4066Sahrens 1236fa9e4066Sahrens /* 12370373e76bSbonwick * Create and initialize the spa structure. 1238fa9e4066Sahrens */ 12390373e76bSbonwick mutex_enter(&spa_namespace_lock); 12400373e76bSbonwick spa = spa_add(TRYIMPORT_NAME, NULL); 1241fa9e4066Sahrens spa_activate(spa); 1242fa9e4066Sahrens 1243fa9e4066Sahrens /* 12440373e76bSbonwick * Pass off the heavy lifting to spa_load(). 1245ecc2d604Sbonwick * Pass TRUE for mosconfig because the user-supplied config 1246ecc2d604Sbonwick * is actually the one to trust when doing an import. 1247fa9e4066Sahrens */ 1248ecc2d604Sbonwick (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1249fa9e4066Sahrens 1250fa9e4066Sahrens /* 1251fa9e4066Sahrens * If 'tryconfig' was at least parsable, return the current config. 
1252fa9e4066Sahrens */ 1253fa9e4066Sahrens if (spa->spa_root_vdev != NULL) { 12540373e76bSbonwick spa_config_enter(spa, RW_READER, FTAG); 1255fa9e4066Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 12560373e76bSbonwick spa_config_exit(spa, FTAG); 1257fa9e4066Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1258fa9e4066Sahrens poolname) == 0); 1259fa9e4066Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1260fa9e4066Sahrens state) == 0); 1261*99653d4eSeschrock 1262*99653d4eSeschrock /* 1263*99653d4eSeschrock * Add the list of hot spares. 1264*99653d4eSeschrock */ 1265*99653d4eSeschrock spa_add_spares(spa, config); 1266fa9e4066Sahrens } 1267fa9e4066Sahrens 1268fa9e4066Sahrens spa_unload(spa); 1269fa9e4066Sahrens spa_deactivate(spa); 1270fa9e4066Sahrens spa_remove(spa); 1271fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1272fa9e4066Sahrens 1273fa9e4066Sahrens return (config); 1274fa9e4066Sahrens } 1275fa9e4066Sahrens 1276fa9e4066Sahrens /* 1277fa9e4066Sahrens * Pool export/destroy 1278fa9e4066Sahrens * 1279fa9e4066Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 1280fa9e4066Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 1281fa9e4066Sahrens * update the pool state and sync all the labels to disk, removing the 1282fa9e4066Sahrens * configuration from the cache afterwards. 
1283fa9e4066Sahrens */ 1284fa9e4066Sahrens static int 128544cd46caSbillm spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1286fa9e4066Sahrens { 1287fa9e4066Sahrens spa_t *spa; 1288fa9e4066Sahrens 128944cd46caSbillm if (oldconfig) 129044cd46caSbillm *oldconfig = NULL; 129144cd46caSbillm 1292fa9e4066Sahrens if (!(spa_mode & FWRITE)) 1293fa9e4066Sahrens return (EROFS); 1294fa9e4066Sahrens 1295fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 1296fa9e4066Sahrens if ((spa = spa_lookup(pool)) == NULL) { 1297fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1298fa9e4066Sahrens return (ENOENT); 1299fa9e4066Sahrens } 1300fa9e4066Sahrens 1301ea8dc4b6Seschrock /* 1302ea8dc4b6Seschrock * Put a hold on the pool, drop the namespace lock, stop async tasks, 1303ea8dc4b6Seschrock * reacquire the namespace lock, and see if we can export. 1304ea8dc4b6Seschrock */ 1305ea8dc4b6Seschrock spa_open_ref(spa, FTAG); 1306ea8dc4b6Seschrock mutex_exit(&spa_namespace_lock); 1307ea8dc4b6Seschrock spa_async_suspend(spa); 1308ea8dc4b6Seschrock mutex_enter(&spa_namespace_lock); 1309ea8dc4b6Seschrock spa_close(spa, FTAG); 1310ea8dc4b6Seschrock 1311fa9e4066Sahrens /* 1312fa9e4066Sahrens * The pool will be in core if it's openable, 1313fa9e4066Sahrens * in which case we can modify its state. 1314fa9e4066Sahrens */ 1315fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1316fa9e4066Sahrens /* 1317fa9e4066Sahrens * Objsets may be open only because they're dirty, so we 1318fa9e4066Sahrens * have to force it to sync before checking spa_refcnt. 1319fa9e4066Sahrens */ 1320fa9e4066Sahrens spa_scrub_suspend(spa); 1321fa9e4066Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 1322fa9e4066Sahrens 1323ea8dc4b6Seschrock /* 1324ea8dc4b6Seschrock * A pool cannot be exported or destroyed if there are active 1325ea8dc4b6Seschrock * references. If we are resetting a pool, allow references by 1326ea8dc4b6Seschrock * fault injection handlers. 
1327ea8dc4b6Seschrock */ 1328ea8dc4b6Seschrock if (!spa_refcount_zero(spa) || 1329ea8dc4b6Seschrock (spa->spa_inject_ref != 0 && 1330ea8dc4b6Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 1331fa9e4066Sahrens spa_scrub_resume(spa); 1332ea8dc4b6Seschrock spa_async_resume(spa); 1333fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1334fa9e4066Sahrens return (EBUSY); 1335fa9e4066Sahrens } 1336fa9e4066Sahrens 1337fa9e4066Sahrens spa_scrub_resume(spa); 1338fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1339fa9e4066Sahrens 1340fa9e4066Sahrens /* 1341fa9e4066Sahrens * We want this to be reflected on every label, 1342fa9e4066Sahrens * so mark them all dirty. spa_unload() will do the 1343fa9e4066Sahrens * final sync that pushes these changes out. 1344fa9e4066Sahrens */ 1345ea8dc4b6Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 13465dabedeeSbonwick spa_config_enter(spa, RW_WRITER, FTAG); 1347ea8dc4b6Seschrock spa->spa_state = new_state; 13480373e76bSbonwick spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1349ea8dc4b6Seschrock vdev_config_dirty(spa->spa_root_vdev); 13505dabedeeSbonwick spa_config_exit(spa, FTAG); 1351ea8dc4b6Seschrock } 1352fa9e4066Sahrens } 1353fa9e4066Sahrens 1354fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1355fa9e4066Sahrens spa_unload(spa); 1356fa9e4066Sahrens spa_deactivate(spa); 1357fa9e4066Sahrens } 1358fa9e4066Sahrens 135944cd46caSbillm if (oldconfig && spa->spa_config) 136044cd46caSbillm VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 136144cd46caSbillm 1362ea8dc4b6Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 1363ea8dc4b6Seschrock spa_remove(spa); 1364ea8dc4b6Seschrock spa_config_sync(); 1365ea8dc4b6Seschrock } 1366fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1367fa9e4066Sahrens 1368fa9e4066Sahrens return (0); 1369fa9e4066Sahrens } 1370fa9e4066Sahrens 1371fa9e4066Sahrens /* 1372fa9e4066Sahrens * Destroy a storage pool. 
1373fa9e4066Sahrens */ 1374fa9e4066Sahrens int 1375fa9e4066Sahrens spa_destroy(char *pool) 1376fa9e4066Sahrens { 137744cd46caSbillm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1378fa9e4066Sahrens } 1379fa9e4066Sahrens 1380fa9e4066Sahrens /* 1381fa9e4066Sahrens * Export a storage pool. 1382fa9e4066Sahrens */ 1383fa9e4066Sahrens int 138444cd46caSbillm spa_export(char *pool, nvlist_t **oldconfig) 1385fa9e4066Sahrens { 138644cd46caSbillm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1387fa9e4066Sahrens } 1388fa9e4066Sahrens 1389ea8dc4b6Seschrock /* 1390ea8dc4b6Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 1391ea8dc4b6Seschrock * from the namespace in any way. 1392ea8dc4b6Seschrock */ 1393ea8dc4b6Seschrock int 1394ea8dc4b6Seschrock spa_reset(char *pool) 1395ea8dc4b6Seschrock { 139644cd46caSbillm return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1397ea8dc4b6Seschrock } 1398ea8dc4b6Seschrock 1399ea8dc4b6Seschrock 1400fa9e4066Sahrens /* 1401fa9e4066Sahrens * ========================================================================== 1402fa9e4066Sahrens * Device manipulation 1403fa9e4066Sahrens * ========================================================================== 1404fa9e4066Sahrens */ 1405fa9e4066Sahrens 1406fa9e4066Sahrens /* 1407fa9e4066Sahrens * Add capacity to a storage pool. 
1408fa9e4066Sahrens */ 1409fa9e4066Sahrens int 1410fa9e4066Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1411fa9e4066Sahrens { 1412fa9e4066Sahrens uint64_t txg; 14130373e76bSbonwick int c, error; 1414fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 14150e34b6a7Sbonwick vdev_t *vd, *tvd; 1416*99653d4eSeschrock nvlist_t **spares; 1417*99653d4eSeschrock uint_t i, nspares; 1418fa9e4066Sahrens 1419fa9e4066Sahrens txg = spa_vdev_enter(spa); 1420fa9e4066Sahrens 1421*99653d4eSeschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1422*99653d4eSeschrock VDEV_ALLOC_ADD)) != 0) 1423*99653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, error)); 1424fa9e4066Sahrens 1425*99653d4eSeschrock if ((error = spa_validate_spares(spa, nvroot, txg, 1426*99653d4eSeschrock VDEV_ALLOC_ADD)) != 0) 1427*99653d4eSeschrock return (spa_vdev_exit(spa, vd, txg, error)); 1428*99653d4eSeschrock 1429*99653d4eSeschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1430*99653d4eSeschrock &spares, &nspares) != 0) 1431*99653d4eSeschrock nspares = 0; 1432*99653d4eSeschrock 1433*99653d4eSeschrock if (vd->vdev_children == 0 && nspares == 0) 1434fa9e4066Sahrens return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1435fa9e4066Sahrens 1436*99653d4eSeschrock if (vd->vdev_children != 0) { 1437*99653d4eSeschrock if ((error = vdev_create(vd, txg, B_FALSE)) != 0) 1438*99653d4eSeschrock return (spa_vdev_exit(spa, vd, txg, error)); 1439fa9e4066Sahrens 1440*99653d4eSeschrock /* 1441*99653d4eSeschrock * Transfer each new top-level vdev from vd to rvd. 
1442*99653d4eSeschrock */ 1443*99653d4eSeschrock for (c = 0; c < vd->vdev_children; c++) { 1444*99653d4eSeschrock tvd = vd->vdev_child[c]; 1445*99653d4eSeschrock vdev_remove_child(vd, tvd); 1446*99653d4eSeschrock tvd->vdev_id = rvd->vdev_children; 1447*99653d4eSeschrock vdev_add_child(rvd, tvd); 1448*99653d4eSeschrock vdev_config_dirty(tvd); 1449*99653d4eSeschrock } 1450*99653d4eSeschrock } 1451*99653d4eSeschrock 1452*99653d4eSeschrock if (nspares != 0) { 1453*99653d4eSeschrock if (spa->spa_sparelist != NULL) { 1454*99653d4eSeschrock nvlist_t **oldspares; 1455*99653d4eSeschrock uint_t oldnspares; 1456*99653d4eSeschrock nvlist_t **newspares; 1457*99653d4eSeschrock 1458*99653d4eSeschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1459*99653d4eSeschrock ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1460*99653d4eSeschrock 1461*99653d4eSeschrock newspares = kmem_alloc(sizeof (void *) * 1462*99653d4eSeschrock (nspares + oldnspares), KM_SLEEP); 1463*99653d4eSeschrock for (i = 0; i < oldnspares; i++) 1464*99653d4eSeschrock VERIFY(nvlist_dup(oldspares[i], 1465*99653d4eSeschrock &newspares[i], KM_SLEEP) == 0); 1466*99653d4eSeschrock for (i = 0; i < nspares; i++) 1467*99653d4eSeschrock VERIFY(nvlist_dup(spares[i], 1468*99653d4eSeschrock &newspares[i + oldnspares], 1469*99653d4eSeschrock KM_SLEEP) == 0); 1470*99653d4eSeschrock 1471*99653d4eSeschrock VERIFY(nvlist_remove(spa->spa_sparelist, 1472*99653d4eSeschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1473*99653d4eSeschrock 1474*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1475*99653d4eSeschrock ZPOOL_CONFIG_SPARES, newspares, 1476*99653d4eSeschrock nspares + oldnspares) == 0); 1477*99653d4eSeschrock for (i = 0; i < oldnspares + nspares; i++) 1478*99653d4eSeschrock nvlist_free(newspares[i]); 1479*99653d4eSeschrock kmem_free(newspares, (oldnspares + nspares) * 1480*99653d4eSeschrock sizeof (void *)); 1481*99653d4eSeschrock } else { 1482*99653d4eSeschrock 
VERIFY(nvlist_alloc(&spa->spa_sparelist, 1483*99653d4eSeschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 1484*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1485*99653d4eSeschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1486*99653d4eSeschrock } 1487*99653d4eSeschrock 1488*99653d4eSeschrock spa_load_spares(spa); 1489*99653d4eSeschrock spa->spa_sync_spares = B_TRUE; 1490fa9e4066Sahrens } 1491fa9e4066Sahrens 1492fa9e4066Sahrens /* 14930e34b6a7Sbonwick * We have to be careful when adding new vdevs to an existing pool. 14940e34b6a7Sbonwick * If other threads start allocating from these vdevs before we 14950e34b6a7Sbonwick * sync the config cache, and we lose power, then upon reboot we may 14960e34b6a7Sbonwick * fail to open the pool because there are DVAs that the config cache 14970e34b6a7Sbonwick * can't translate. Therefore, we first add the vdevs without 14980e34b6a7Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 14990373e76bSbonwick * and then let spa_config_update() initialize the new metaslabs. 15000e34b6a7Sbonwick * 15010e34b6a7Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 15020e34b6a7Sbonwick * if we lose power at any point in this sequence, the remaining 15030e34b6a7Sbonwick * steps will be completed the next time we load the pool. 15040e34b6a7Sbonwick */ 15050373e76bSbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 15060e34b6a7Sbonwick 15070373e76bSbonwick mutex_enter(&spa_namespace_lock); 15080373e76bSbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 15090373e76bSbonwick mutex_exit(&spa_namespace_lock); 1510fa9e4066Sahrens 15110373e76bSbonwick return (0); 1512fa9e4066Sahrens } 1513fa9e4066Sahrens 1514fa9e4066Sahrens /* 1515fa9e4066Sahrens * Attach a device to a mirror. The arguments are the path to any device 1516fa9e4066Sahrens * in the mirror, and the nvroot for the new device. 
If the path specifies 1517fa9e4066Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 1518fa9e4066Sahrens * 1519fa9e4066Sahrens * If 'replacing' is specified, the new device is intended to replace the 1520fa9e4066Sahrens * existing device; in this case the two devices are made into their own 1521fa9e4066Sahrens * mirror using the 'replacing' vdev, which is functionally idendical to 1522fa9e4066Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 1523fa9e4066Sahrens * extra rules: you can't attach to it after it's been created, and upon 1524fa9e4066Sahrens * completion of resilvering, the first disk (the one being replaced) 1525fa9e4066Sahrens * is automatically detached. 1526fa9e4066Sahrens */ 1527fa9e4066Sahrens int 1528ea8dc4b6Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1529fa9e4066Sahrens { 1530fa9e4066Sahrens uint64_t txg, open_txg; 1531fa9e4066Sahrens int error; 1532fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1533fa9e4066Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1534*99653d4eSeschrock vdev_ops_t *pvops; 1535fa9e4066Sahrens 1536fa9e4066Sahrens txg = spa_vdev_enter(spa); 1537fa9e4066Sahrens 1538ea8dc4b6Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 1539fa9e4066Sahrens 1540fa9e4066Sahrens if (oldvd == NULL) 1541fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1542fa9e4066Sahrens 15430e34b6a7Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 15440e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 15450e34b6a7Sbonwick 1546fa9e4066Sahrens pvd = oldvd->vdev_parent; 1547fa9e4066Sahrens 1548*99653d4eSeschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1549*99653d4eSeschrock VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1550fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1551fa9e4066Sahrens 1552fa9e4066Sahrens newvd = newrootvd->vdev_child[0]; 1553fa9e4066Sahrens 
1554fa9e4066Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 1555fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1556fa9e4066Sahrens 1557*99653d4eSeschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1558fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 1559fa9e4066Sahrens 1560*99653d4eSeschrock if (!replacing) { 1561*99653d4eSeschrock /* 1562*99653d4eSeschrock * For attach, the only allowable parent is a mirror or the root 1563*99653d4eSeschrock * vdev. 1564*99653d4eSeschrock */ 1565*99653d4eSeschrock if (pvd->vdev_ops != &vdev_mirror_ops && 1566*99653d4eSeschrock pvd->vdev_ops != &vdev_root_ops) 1567*99653d4eSeschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1568*99653d4eSeschrock 1569*99653d4eSeschrock pvops = &vdev_mirror_ops; 1570*99653d4eSeschrock } else { 1571*99653d4eSeschrock /* 1572*99653d4eSeschrock * Active hot spares can only be replaced by inactive hot 1573*99653d4eSeschrock * spares. 1574*99653d4eSeschrock */ 1575*99653d4eSeschrock if (pvd->vdev_ops == &vdev_spare_ops && 1576*99653d4eSeschrock pvd->vdev_child[1] == oldvd && 1577*99653d4eSeschrock !spa_has_spare(spa, newvd->vdev_guid)) 1578*99653d4eSeschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1579*99653d4eSeschrock 1580*99653d4eSeschrock /* 1581*99653d4eSeschrock * If the source is a hot spare, and the parent isn't already a 1582*99653d4eSeschrock * spare, then we want to create a new hot spare. Otherwise, we 1583*99653d4eSeschrock * want to create a replacing vdev. 
1584*99653d4eSeschrock */ 1585*99653d4eSeschrock if (pvd->vdev_ops == &vdev_replacing_ops) 1586*99653d4eSeschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1587*99653d4eSeschrock else if (pvd->vdev_ops != &vdev_spare_ops && 1588*99653d4eSeschrock newvd->vdev_isspare) 1589*99653d4eSeschrock pvops = &vdev_spare_ops; 1590*99653d4eSeschrock else 1591*99653d4eSeschrock pvops = &vdev_replacing_ops; 1592*99653d4eSeschrock } 1593*99653d4eSeschrock 15942a79c5feSlling /* 15952a79c5feSlling * Compare the new device size with the replaceable/attachable 15962a79c5feSlling * device size. 15972a79c5feSlling */ 15982a79c5feSlling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1599fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1600fa9e4066Sahrens 1601ecc2d604Sbonwick /* 1602ecc2d604Sbonwick * The new device cannot have a higher alignment requirement 1603ecc2d604Sbonwick * than the top-level vdev. 1604ecc2d604Sbonwick */ 1605ecc2d604Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1606fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1607fa9e4066Sahrens 1608fa9e4066Sahrens /* 1609fa9e4066Sahrens * If this is an in-place replacement, update oldvd's path and devid 1610fa9e4066Sahrens * to make it distinguishable from newvd, and unopenable from now on. 
1611fa9e4066Sahrens */ 1612fa9e4066Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1613fa9e4066Sahrens spa_strfree(oldvd->vdev_path); 1614fa9e4066Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1615fa9e4066Sahrens KM_SLEEP); 1616fa9e4066Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1617fa9e4066Sahrens newvd->vdev_path, "old"); 1618fa9e4066Sahrens if (oldvd->vdev_devid != NULL) { 1619fa9e4066Sahrens spa_strfree(oldvd->vdev_devid); 1620fa9e4066Sahrens oldvd->vdev_devid = NULL; 1621fa9e4066Sahrens } 1622fa9e4066Sahrens } 1623fa9e4066Sahrens 1624fa9e4066Sahrens /* 1625*99653d4eSeschrock * If the parent is not a mirror, or if we're replacing, insert the new 1626*99653d4eSeschrock * mirror/replacing/spare vdev above oldvd. 1627fa9e4066Sahrens */ 1628fa9e4066Sahrens if (pvd->vdev_ops != pvops) 1629fa9e4066Sahrens pvd = vdev_add_parent(oldvd, pvops); 1630fa9e4066Sahrens 1631fa9e4066Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1632fa9e4066Sahrens ASSERT(pvd->vdev_ops == pvops); 1633fa9e4066Sahrens ASSERT(oldvd->vdev_parent == pvd); 1634fa9e4066Sahrens 1635fa9e4066Sahrens /* 1636fa9e4066Sahrens * Extract the new device from its root and add it to pvd. 1637fa9e4066Sahrens */ 1638fa9e4066Sahrens vdev_remove_child(newrootvd, newvd); 1639fa9e4066Sahrens newvd->vdev_id = pvd->vdev_children; 1640fa9e4066Sahrens vdev_add_child(pvd, newvd); 1641fa9e4066Sahrens 1642ea8dc4b6Seschrock /* 1643ea8dc4b6Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 1644ea8dc4b6Seschrock * the addition of newvd may have decreased our parent's asize. 
1645ea8dc4b6Seschrock */ 1646ea8dc4b6Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1647ea8dc4b6Seschrock 1648fa9e4066Sahrens tvd = newvd->vdev_top; 1649fa9e4066Sahrens ASSERT(pvd->vdev_top == tvd); 1650fa9e4066Sahrens ASSERT(tvd->vdev_parent == rvd); 1651fa9e4066Sahrens 1652fa9e4066Sahrens vdev_config_dirty(tvd); 1653fa9e4066Sahrens 1654fa9e4066Sahrens /* 1655fa9e4066Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1656fa9e4066Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1657fa9e4066Sahrens */ 1658fa9e4066Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1659fa9e4066Sahrens 1660fa9e4066Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1661fa9e4066Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1662fa9e4066Sahrens open_txg - TXG_INITIAL + 1); 1663fa9e4066Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1664fa9e4066Sahrens 1665ea8dc4b6Seschrock dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1666ea8dc4b6Seschrock 1667fa9e4066Sahrens /* 1668fa9e4066Sahrens * Mark newvd's DTL dirty in this txg. 1669fa9e4066Sahrens */ 1670ecc2d604Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 1671fa9e4066Sahrens 1672fa9e4066Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1673fa9e4066Sahrens 1674fa9e4066Sahrens /* 1675fa9e4066Sahrens * Kick off a resilver to update newvd. 1676fa9e4066Sahrens */ 1677fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1678fa9e4066Sahrens 1679fa9e4066Sahrens return (0); 1680fa9e4066Sahrens } 1681fa9e4066Sahrens 1682fa9e4066Sahrens /* 1683fa9e4066Sahrens * Detach a device from a mirror or replacing vdev. 1684fa9e4066Sahrens * If 'replace_done' is specified, only detach if the parent 1685fa9e4066Sahrens * is a replacing vdev. 
1686fa9e4066Sahrens */ 1687fa9e4066Sahrens int 1688ea8dc4b6Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1689fa9e4066Sahrens { 1690fa9e4066Sahrens uint64_t txg; 1691fa9e4066Sahrens int c, t, error; 1692fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1693fa9e4066Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 1694*99653d4eSeschrock boolean_t unspare = B_FALSE; 1695*99653d4eSeschrock uint64_t unspare_guid; 1696fa9e4066Sahrens 1697fa9e4066Sahrens txg = spa_vdev_enter(spa); 1698fa9e4066Sahrens 1699ea8dc4b6Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1700fa9e4066Sahrens 1701fa9e4066Sahrens if (vd == NULL) 1702fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1703fa9e4066Sahrens 17040e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 17050e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 17060e34b6a7Sbonwick 1707fa9e4066Sahrens pvd = vd->vdev_parent; 1708fa9e4066Sahrens 1709fa9e4066Sahrens /* 1710fa9e4066Sahrens * If replace_done is specified, only remove this device if it's 1711*99653d4eSeschrock * the first child of a replacing vdev. For the 'spare' vdev, either 1712*99653d4eSeschrock * disk can be removed. 1713*99653d4eSeschrock */ 1714*99653d4eSeschrock if (replace_done) { 1715*99653d4eSeschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 1716*99653d4eSeschrock if (vd->vdev_id != 0) 1717*99653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1718*99653d4eSeschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 1719*99653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1720*99653d4eSeschrock } 1721*99653d4eSeschrock } 1722*99653d4eSeschrock 1723*99653d4eSeschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1724*99653d4eSeschrock spa_version(spa) >= ZFS_VERSION_SPARES); 1725fa9e4066Sahrens 1726fa9e4066Sahrens /* 1727*99653d4eSeschrock * Only mirror, replacing, and spare vdevs support detach. 
1728fa9e4066Sahrens */ 1729fa9e4066Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 1730*99653d4eSeschrock pvd->vdev_ops != &vdev_mirror_ops && 1731*99653d4eSeschrock pvd->vdev_ops != &vdev_spare_ops) 1732fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1733fa9e4066Sahrens 1734fa9e4066Sahrens /* 1735fa9e4066Sahrens * If there's only one replica, you can't detach it. 1736fa9e4066Sahrens */ 1737fa9e4066Sahrens if (pvd->vdev_children <= 1) 1738fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1739fa9e4066Sahrens 1740fa9e4066Sahrens /* 1741fa9e4066Sahrens * If all siblings have non-empty DTLs, this device may have the only 1742fa9e4066Sahrens * valid copy of the data, which means we cannot safely detach it. 1743fa9e4066Sahrens * 1744fa9e4066Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1745fa9e4066Sahrens * precise DTL check. 1746fa9e4066Sahrens */ 1747fa9e4066Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1748fa9e4066Sahrens uint64_t dirty; 1749fa9e4066Sahrens 1750fa9e4066Sahrens cvd = pvd->vdev_child[c]; 1751fa9e4066Sahrens if (cvd == vd) 1752fa9e4066Sahrens continue; 1753fa9e4066Sahrens if (vdev_is_dead(cvd)) 1754fa9e4066Sahrens continue; 1755fa9e4066Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1756fa9e4066Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1757fa9e4066Sahrens cvd->vdev_dtl_scrub.sm_space; 1758fa9e4066Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1759fa9e4066Sahrens if (!dirty) 1760fa9e4066Sahrens break; 1761fa9e4066Sahrens } 1762*99653d4eSeschrock 1763*99653d4eSeschrock /* 1764*99653d4eSeschrock * If we are a replacing or spare vdev, then we can always detach the 1765*99653d4eSeschrock * latter child, as that is how one cancels the operation. 
1766*99653d4eSeschrock */ 1767*99653d4eSeschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1768*99653d4eSeschrock c == pvd->vdev_children) 1769fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1770fa9e4066Sahrens 1771*99653d4eSeschrock /* 1772*99653d4eSeschrock * If we are detaching the original disk from a spare, then it implies 1773*99653d4eSeschrock * that the spare should become a real disk, and be removed from the 1774*99653d4eSeschrock * active spare list for the pool. 1775*99653d4eSeschrock */ 1776*99653d4eSeschrock if (pvd->vdev_ops == &vdev_spare_ops && 1777*99653d4eSeschrock vd->vdev_id == 0) 1778*99653d4eSeschrock unspare = B_TRUE; 1779*99653d4eSeschrock 1780fa9e4066Sahrens /* 1781fa9e4066Sahrens * Erase the disk labels so the disk can be used for other things. 1782fa9e4066Sahrens * This must be done after all other error cases are handled, 1783fa9e4066Sahrens * but before we disembowel vd (so we can still do I/O to it). 1784fa9e4066Sahrens * But if we can't do it, don't treat the error as fatal -- 1785fa9e4066Sahrens * it may be that the unwritability of the disk is the reason 1786fa9e4066Sahrens * it's being detached! 1787fa9e4066Sahrens */ 1788*99653d4eSeschrock error = vdev_label_init(vd, 0, B_FALSE); 1789fa9e4066Sahrens if (error) 1790fa9e4066Sahrens dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1791fa9e4066Sahrens 1792fa9e4066Sahrens /* 1793fa9e4066Sahrens * Remove vd from its parent and compact the parent's children. 1794fa9e4066Sahrens */ 1795fa9e4066Sahrens vdev_remove_child(pvd, vd); 1796fa9e4066Sahrens vdev_compact_children(pvd); 1797fa9e4066Sahrens 1798fa9e4066Sahrens /* 1799fa9e4066Sahrens * Remember one of the remaining children so we can get tvd below. 
1800fa9e4066Sahrens */ 1801fa9e4066Sahrens cvd = pvd->vdev_child[0]; 1802fa9e4066Sahrens 1803*99653d4eSeschrock /* 1804*99653d4eSeschrock * If we need to remove the remaining child from the list of hot spares, 1805*99653d4eSeschrock * do it now, marking the vdev as no longer a spare in the process. We 1806*99653d4eSeschrock * must do this before vdev_remove_parent(), because that can change the 1807*99653d4eSeschrock * GUID if it creates a new toplevel GUID. 1808*99653d4eSeschrock */ 1809*99653d4eSeschrock if (unspare) { 1810*99653d4eSeschrock ASSERT(cvd->vdev_isspare); 1811*99653d4eSeschrock spa_spare_remove(cvd->vdev_guid); 1812*99653d4eSeschrock cvd->vdev_isspare = B_FALSE; 1813*99653d4eSeschrock unspare_guid = cvd->vdev_guid; 1814*99653d4eSeschrock } 1815*99653d4eSeschrock 1816fa9e4066Sahrens /* 1817fa9e4066Sahrens * If the parent mirror/replacing vdev only has one child, 1818fa9e4066Sahrens * the parent is no longer needed. Remove it from the tree. 1819fa9e4066Sahrens */ 1820fa9e4066Sahrens if (pvd->vdev_children == 1) 1821fa9e4066Sahrens vdev_remove_parent(cvd); 1822fa9e4066Sahrens 1823fa9e4066Sahrens /* 1824fa9e4066Sahrens * We don't set tvd until now because the parent we just removed 1825fa9e4066Sahrens * may have been the previous top-level vdev. 1826fa9e4066Sahrens */ 1827fa9e4066Sahrens tvd = cvd->vdev_top; 1828fa9e4066Sahrens ASSERT(tvd->vdev_parent == rvd); 1829fa9e4066Sahrens 1830fa9e4066Sahrens /* 1831fa9e4066Sahrens * Reopen this top-level vdev to reassess health after detach. 1832fa9e4066Sahrens */ 1833ea8dc4b6Seschrock vdev_reopen(tvd); 1834fa9e4066Sahrens 1835fa9e4066Sahrens /* 1836fa9e4066Sahrens * If the device we just detached was smaller than the others, 1837ecc2d604Sbonwick * it may be possible to add metaslabs (i.e. grow the pool). 1838ecc2d604Sbonwick * vdev_metaslab_init() can't fail because the existing metaslabs 1839ecc2d604Sbonwick * are already in core, so there's nothing to read from disk. 
1840fa9e4066Sahrens */ 1841ecc2d604Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1842fa9e4066Sahrens 1843fa9e4066Sahrens vdev_config_dirty(tvd); 1844fa9e4066Sahrens 1845fa9e4066Sahrens /* 1846fa9e4066Sahrens * Mark vd's DTL as dirty in this txg. 1847fa9e4066Sahrens * vdev_dtl_sync() will see that vd->vdev_detached is set 1848fa9e4066Sahrens * and free vd's DTL object in syncing context. 1849fa9e4066Sahrens * But first make sure we're not on any *other* txg's DTL list, 1850fa9e4066Sahrens * to prevent vd from being accessed after it's freed. 1851fa9e4066Sahrens */ 1852fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 1853fa9e4066Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1854ecc2d604Sbonwick vd->vdev_detached = B_TRUE; 1855ecc2d604Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 1856fa9e4066Sahrens 1857ea8dc4b6Seschrock dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1858fa9e4066Sahrens 1859*99653d4eSeschrock error = spa_vdev_exit(spa, vd, txg, 0); 1860*99653d4eSeschrock 1861*99653d4eSeschrock /* 1862*99653d4eSeschrock * If we are supposed to remove the given vdev from the list of spares, 1863*99653d4eSeschrock * iterate over all pools in the system and replace it if it's present. 1864*99653d4eSeschrock */ 1865*99653d4eSeschrock if (unspare) { 1866*99653d4eSeschrock spa = NULL; 1867*99653d4eSeschrock mutex_enter(&spa_namespace_lock); 1868*99653d4eSeschrock while ((spa = spa_next(spa)) != NULL) { 1869*99653d4eSeschrock if (spa->spa_state != POOL_STATE_ACTIVE) 1870*99653d4eSeschrock continue; 1871*99653d4eSeschrock 1872*99653d4eSeschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 1873*99653d4eSeschrock } 1874*99653d4eSeschrock mutex_exit(&spa_namespace_lock); 1875*99653d4eSeschrock } 1876*99653d4eSeschrock 1877*99653d4eSeschrock return (error); 1878*99653d4eSeschrock } 1879*99653d4eSeschrock 1880*99653d4eSeschrock /* 1881*99653d4eSeschrock * Remove a device from the pool. 
Currently, this supports removing only hot 1882*99653d4eSeschrock * spares. 1883*99653d4eSeschrock */ 1884*99653d4eSeschrock int 1885*99653d4eSeschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 1886*99653d4eSeschrock { 1887*99653d4eSeschrock vdev_t *vd; 1888*99653d4eSeschrock nvlist_t **spares, *nv, **newspares; 1889*99653d4eSeschrock uint_t i, j, nspares; 1890*99653d4eSeschrock int ret = 0; 1891*99653d4eSeschrock 1892*99653d4eSeschrock spa_config_enter(spa, RW_WRITER, FTAG); 1893*99653d4eSeschrock 1894*99653d4eSeschrock vd = spa_lookup_by_guid(spa, guid); 1895*99653d4eSeschrock 1896*99653d4eSeschrock nv = NULL; 1897*99653d4eSeschrock if (spa->spa_spares != NULL && 1898*99653d4eSeschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1899*99653d4eSeschrock &spares, &nspares) == 0) { 1900*99653d4eSeschrock for (i = 0; i < nspares; i++) { 1901*99653d4eSeschrock uint64_t theguid; 1902*99653d4eSeschrock 1903*99653d4eSeschrock VERIFY(nvlist_lookup_uint64(spares[i], 1904*99653d4eSeschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 1905*99653d4eSeschrock if (theguid == guid) { 1906*99653d4eSeschrock nv = spares[i]; 1907*99653d4eSeschrock break; 1908*99653d4eSeschrock } 1909*99653d4eSeschrock } 1910*99653d4eSeschrock } 1911*99653d4eSeschrock 1912*99653d4eSeschrock /* 1913*99653d4eSeschrock * We only support removing a hot spare, and only if it's not currently 1914*99653d4eSeschrock * in use in this pool. 
1915*99653d4eSeschrock */ 1916*99653d4eSeschrock if (nv == NULL && vd == NULL) { 1917*99653d4eSeschrock ret = ENOENT; 1918*99653d4eSeschrock goto out; 1919*99653d4eSeschrock } 1920*99653d4eSeschrock 1921*99653d4eSeschrock if (nv == NULL && vd != NULL) { 1922*99653d4eSeschrock ret = ENOTSUP; 1923*99653d4eSeschrock goto out; 1924*99653d4eSeschrock } 1925*99653d4eSeschrock 1926*99653d4eSeschrock if (!unspare && nv != NULL && vd != NULL) { 1927*99653d4eSeschrock ret = EBUSY; 1928*99653d4eSeschrock goto out; 1929*99653d4eSeschrock } 1930*99653d4eSeschrock 1931*99653d4eSeschrock if (nspares == 1) { 1932*99653d4eSeschrock newspares = NULL; 1933*99653d4eSeschrock } else { 1934*99653d4eSeschrock newspares = kmem_alloc((nspares - 1) * sizeof (void *), 1935*99653d4eSeschrock KM_SLEEP); 1936*99653d4eSeschrock for (i = 0, j = 0; i < nspares; i++) { 1937*99653d4eSeschrock if (spares[i] != nv) 1938*99653d4eSeschrock VERIFY(nvlist_dup(spares[i], 1939*99653d4eSeschrock &newspares[j++], KM_SLEEP) == 0); 1940*99653d4eSeschrock } 1941*99653d4eSeschrock } 1942*99653d4eSeschrock 1943*99653d4eSeschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1944*99653d4eSeschrock DATA_TYPE_NVLIST_ARRAY) == 0); 1945*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1946*99653d4eSeschrock newspares, nspares - 1) == 0); 1947*99653d4eSeschrock for (i = 0; i < nspares - 1; i++) 1948*99653d4eSeschrock nvlist_free(newspares[i]); 1949*99653d4eSeschrock kmem_free(newspares, (nspares - 1) * sizeof (void *)); 1950*99653d4eSeschrock spa_load_spares(spa); 1951*99653d4eSeschrock spa->spa_sync_spares = B_TRUE; 1952*99653d4eSeschrock 1953*99653d4eSeschrock out: 1954*99653d4eSeschrock spa_config_exit(spa, FTAG); 1955*99653d4eSeschrock 1956*99653d4eSeschrock return (ret); 1957fa9e4066Sahrens } 1958fa9e4066Sahrens 1959fa9e4066Sahrens /* 1960ea8dc4b6Seschrock * Find any device that's done replacing, so we can detach it. 
1961fa9e4066Sahrens */ 1962ea8dc4b6Seschrock static vdev_t * 1963ea8dc4b6Seschrock spa_vdev_replace_done_hunt(vdev_t *vd) 1964fa9e4066Sahrens { 1965ea8dc4b6Seschrock vdev_t *newvd, *oldvd; 1966fa9e4066Sahrens int c; 1967fa9e4066Sahrens 1968ea8dc4b6Seschrock for (c = 0; c < vd->vdev_children; c++) { 1969ea8dc4b6Seschrock oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1970ea8dc4b6Seschrock if (oldvd != NULL) 1971ea8dc4b6Seschrock return (oldvd); 1972ea8dc4b6Seschrock } 1973fa9e4066Sahrens 1974fa9e4066Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1975ea8dc4b6Seschrock oldvd = vd->vdev_child[0]; 1976ea8dc4b6Seschrock newvd = vd->vdev_child[1]; 1977ea8dc4b6Seschrock 1978ea8dc4b6Seschrock mutex_enter(&newvd->vdev_dtl_lock); 1979ea8dc4b6Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 1980ea8dc4b6Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 1981ea8dc4b6Seschrock mutex_exit(&newvd->vdev_dtl_lock); 1982ea8dc4b6Seschrock return (oldvd); 1983fa9e4066Sahrens } 1984ea8dc4b6Seschrock mutex_exit(&newvd->vdev_dtl_lock); 1985fa9e4066Sahrens } 1986ea8dc4b6Seschrock 1987ea8dc4b6Seschrock return (NULL); 1988fa9e4066Sahrens } 1989fa9e4066Sahrens 1990ea8dc4b6Seschrock static void 1991fa9e4066Sahrens spa_vdev_replace_done(spa_t *spa) 1992fa9e4066Sahrens { 1993ea8dc4b6Seschrock vdev_t *vd; 1994*99653d4eSeschrock vdev_t *pvd; 1995ea8dc4b6Seschrock uint64_t guid; 1996*99653d4eSeschrock uint64_t pguid = 0; 1997ea8dc4b6Seschrock 1998ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 1999ea8dc4b6Seschrock 2000ea8dc4b6Seschrock while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2001ea8dc4b6Seschrock guid = vd->vdev_guid; 2002*99653d4eSeschrock /* 2003*99653d4eSeschrock * If we have just finished replacing a hot spared device, then 2004*99653d4eSeschrock * we need to detach the parent's first child (the original hot 2005*99653d4eSeschrock * spare) as well. 
2006*99653d4eSeschrock */ 2007*99653d4eSeschrock pvd = vd->vdev_parent; 2008*99653d4eSeschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2009*99653d4eSeschrock pvd->vdev_id == 0) { 2010*99653d4eSeschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2011*99653d4eSeschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 2012*99653d4eSeschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2013*99653d4eSeschrock } 2014ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2015ea8dc4b6Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2016ea8dc4b6Seschrock return; 2017*99653d4eSeschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2018*99653d4eSeschrock return; 2019ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 2020fa9e4066Sahrens } 2021fa9e4066Sahrens 2022ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2023fa9e4066Sahrens } 2024fa9e4066Sahrens 2025c67d9675Seschrock /* 2026c67d9675Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 2027c67d9675Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 2028c67d9675Seschrock */ 2029c67d9675Seschrock int 2030c67d9675Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2031c67d9675Seschrock { 2032c67d9675Seschrock vdev_t *rvd, *vd; 2033c67d9675Seschrock uint64_t txg; 2034c67d9675Seschrock 2035c67d9675Seschrock rvd = spa->spa_root_vdev; 2036c67d9675Seschrock 2037c67d9675Seschrock txg = spa_vdev_enter(spa); 2038c67d9675Seschrock 2039*99653d4eSeschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2040*99653d4eSeschrock /* 2041*99653d4eSeschrock * Determine if this is a reference to a hot spare. In that 2042*99653d4eSeschrock * case, update the path as stored in the spare list. 
2043*99653d4eSeschrock */ 2044*99653d4eSeschrock nvlist_t **spares; 2045*99653d4eSeschrock uint_t i, nspares; 2046*99653d4eSeschrock if (spa->spa_sparelist != NULL) { 2047*99653d4eSeschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2048*99653d4eSeschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2049*99653d4eSeschrock for (i = 0; i < nspares; i++) { 2050*99653d4eSeschrock uint64_t theguid; 2051*99653d4eSeschrock VERIFY(nvlist_lookup_uint64(spares[i], 2052*99653d4eSeschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 2053*99653d4eSeschrock if (theguid == guid) 2054*99653d4eSeschrock break; 2055*99653d4eSeschrock } 2056*99653d4eSeschrock 2057*99653d4eSeschrock if (i == nspares) 2058*99653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2059*99653d4eSeschrock 2060*99653d4eSeschrock VERIFY(nvlist_add_string(spares[i], 2061*99653d4eSeschrock ZPOOL_CONFIG_PATH, newpath) == 0); 2062*99653d4eSeschrock spa_load_spares(spa); 2063*99653d4eSeschrock spa->spa_sync_spares = B_TRUE; 2064*99653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 2065*99653d4eSeschrock } else { 2066*99653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2067*99653d4eSeschrock } 2068*99653d4eSeschrock } 2069c67d9675Seschrock 20700e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 20710e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 20720e34b6a7Sbonwick 2073c67d9675Seschrock spa_strfree(vd->vdev_path); 2074c67d9675Seschrock vd->vdev_path = spa_strdup(newpath); 2075c67d9675Seschrock 2076c67d9675Seschrock vdev_config_dirty(vd->vdev_top); 2077c67d9675Seschrock 2078c67d9675Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 2079c67d9675Seschrock } 2080c67d9675Seschrock 2081fa9e4066Sahrens /* 2082fa9e4066Sahrens * ========================================================================== 2083fa9e4066Sahrens * SPA Scrubbing 2084fa9e4066Sahrens * ========================================================================== 2085fa9e4066Sahrens */ 
2086fa9e4066Sahrens 2087ea8dc4b6Seschrock void 2088ea8dc4b6Seschrock spa_scrub_throttle(spa_t *spa, int direction) 2089ea8dc4b6Seschrock { 2090ea8dc4b6Seschrock mutex_enter(&spa->spa_scrub_lock); 2091ea8dc4b6Seschrock spa->spa_scrub_throttled += direction; 2092ea8dc4b6Seschrock ASSERT(spa->spa_scrub_throttled >= 0); 2093ea8dc4b6Seschrock if (spa->spa_scrub_throttled == 0) 2094ea8dc4b6Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 2095ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2096ea8dc4b6Seschrock } 2097fa9e4066Sahrens 2098fa9e4066Sahrens static void 2099fa9e4066Sahrens spa_scrub_io_done(zio_t *zio) 2100fa9e4066Sahrens { 2101fa9e4066Sahrens spa_t *spa = zio->io_spa; 2102fa9e4066Sahrens 2103fa9e4066Sahrens zio_buf_free(zio->io_data, zio->io_size); 2104fa9e4066Sahrens 2105fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2106ea8dc4b6Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 210744cd46caSbillm vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2108ea8dc4b6Seschrock spa->spa_scrub_errors++; 2109fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 2110fa9e4066Sahrens vd->vdev_stat.vs_scrub_errors++; 2111fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 2112fa9e4066Sahrens } 2113ea8dc4b6Seschrock if (--spa->spa_scrub_inflight == 0) { 2114ea8dc4b6Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 2115ea8dc4b6Seschrock ASSERT(spa->spa_scrub_throttled == 0); 2116ea8dc4b6Seschrock } 2117ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2118fa9e4066Sahrens } 2119fa9e4066Sahrens 2120fa9e4066Sahrens static void 2121ea8dc4b6Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2122ea8dc4b6Seschrock zbookmark_t *zb) 2123fa9e4066Sahrens { 2124fa9e4066Sahrens size_t size = BP_GET_LSIZE(bp); 2125fa9e4066Sahrens void *data = zio_buf_alloc(size); 2126fa9e4066Sahrens 2127fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2128fa9e4066Sahrens spa->spa_scrub_inflight++; 2129fa9e4066Sahrens 
mutex_exit(&spa->spa_scrub_lock); 2130fa9e4066Sahrens 2131ea8dc4b6Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2132ea8dc4b6Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2133ea8dc4b6Seschrock 2134d80c45e0Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2135ea8dc4b6Seschrock 2136fa9e4066Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 2137ea8dc4b6Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2138fa9e4066Sahrens } 2139fa9e4066Sahrens 2140fa9e4066Sahrens /* ARGSUSED */ 2141fa9e4066Sahrens static int 2142fa9e4066Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2143fa9e4066Sahrens { 2144fa9e4066Sahrens blkptr_t *bp = &bc->bc_blkptr; 214544cd46caSbillm vdev_t *vd = spa->spa_root_vdev; 214644cd46caSbillm dva_t *dva = bp->blk_dva; 214744cd46caSbillm int needs_resilver = B_FALSE; 214844cd46caSbillm int d; 2149fa9e4066Sahrens 215044cd46caSbillm if (bc->bc_errno) { 2151fa9e4066Sahrens /* 2152fa9e4066Sahrens * We can't scrub this block, but we can continue to scrub 2153fa9e4066Sahrens * the rest of the pool. Note the error and move along. 
2154fa9e4066Sahrens */ 2155fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2156fa9e4066Sahrens spa->spa_scrub_errors++; 2157fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2158fa9e4066Sahrens 215944cd46caSbillm mutex_enter(&vd->vdev_stat_lock); 216044cd46caSbillm vd->vdev_stat.vs_scrub_errors++; 216144cd46caSbillm mutex_exit(&vd->vdev_stat_lock); 2162fa9e4066Sahrens 2163fa9e4066Sahrens return (ERESTART); 2164fa9e4066Sahrens } 2165fa9e4066Sahrens 2166fa9e4066Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2167fa9e4066Sahrens 216844cd46caSbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 216944cd46caSbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2170fa9e4066Sahrens 217144cd46caSbillm ASSERT(vd != NULL); 217244cd46caSbillm 217344cd46caSbillm /* 217444cd46caSbillm * Keep track of how much data we've examined so that 217544cd46caSbillm * zpool(1M) status can make useful progress reports. 217644cd46caSbillm */ 217744cd46caSbillm mutex_enter(&vd->vdev_stat_lock); 217844cd46caSbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 217944cd46caSbillm mutex_exit(&vd->vdev_stat_lock); 218044cd46caSbillm 218144cd46caSbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 218244cd46caSbillm if (DVA_GET_GANG(&dva[d])) { 218344cd46caSbillm /* 218444cd46caSbillm * Gang members may be spread across multiple 218544cd46caSbillm * vdevs, so the best we can do is look at the 218644cd46caSbillm * pool-wide DTL. 218744cd46caSbillm * XXX -- it would be better to change our 218844cd46caSbillm * allocation policy to ensure that this can't 218944cd46caSbillm * happen. 
219044cd46caSbillm */ 219144cd46caSbillm vd = spa->spa_root_vdev; 219244cd46caSbillm } 219344cd46caSbillm if (vdev_dtl_contains(&vd->vdev_dtl_map, 219444cd46caSbillm bp->blk_birth, 1)) 219544cd46caSbillm needs_resilver = B_TRUE; 2196fa9e4066Sahrens } 219744cd46caSbillm } 219844cd46caSbillm 219944cd46caSbillm if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2200fa9e4066Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2201ea8dc4b6Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 220244cd46caSbillm else if (needs_resilver) 220344cd46caSbillm spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 220444cd46caSbillm ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2205fa9e4066Sahrens 2206fa9e4066Sahrens return (0); 2207fa9e4066Sahrens } 2208fa9e4066Sahrens 2209fa9e4066Sahrens static void 2210fa9e4066Sahrens spa_scrub_thread(spa_t *spa) 2211fa9e4066Sahrens { 2212fa9e4066Sahrens callb_cpr_t cprinfo; 2213fa9e4066Sahrens traverse_handle_t *th = spa->spa_scrub_th; 2214fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 2215fa9e4066Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2216fa9e4066Sahrens int error = 0; 2217fa9e4066Sahrens boolean_t complete; 2218fa9e4066Sahrens 2219fa9e4066Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2220fa9e4066Sahrens 2221f0aa80d4Sbonwick /* 2222f0aa80d4Sbonwick * If we're restarting due to a snapshot create/delete, 2223f0aa80d4Sbonwick * wait for that to complete. 2224f0aa80d4Sbonwick */ 2225f0aa80d4Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 2226f0aa80d4Sbonwick 2227ea8dc4b6Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2228ea8dc4b6Seschrock scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2229ea8dc4b6Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2230ea8dc4b6Seschrock 2231ea8dc4b6Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 2232ea8dc4b6Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 2233fa9e4066Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 2234fa9e4066Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2235ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2236fa9e4066Sahrens 2237fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2238fa9e4066Sahrens spa->spa_scrub_errors = 0; 2239fa9e4066Sahrens spa->spa_scrub_active = 1; 2240ea8dc4b6Seschrock ASSERT(spa->spa_scrub_inflight == 0); 2241ea8dc4b6Seschrock ASSERT(spa->spa_scrub_throttled == 0); 2242fa9e4066Sahrens 2243fa9e4066Sahrens while (!spa->spa_scrub_stop) { 2244fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 2245ea8dc4b6Seschrock while (spa->spa_scrub_suspended) { 2246fa9e4066Sahrens spa->spa_scrub_active = 0; 2247fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 2248fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2249fa9e4066Sahrens spa->spa_scrub_active = 1; 2250fa9e4066Sahrens } 2251fa9e4066Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2252fa9e4066Sahrens 2253fa9e4066Sahrens if (spa->spa_scrub_restart_txg != 0) 2254fa9e4066Sahrens break; 2255fa9e4066Sahrens 2256fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2257fa9e4066Sahrens error = traverse_more(th); 2258fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2259fa9e4066Sahrens if (error != EAGAIN) 2260fa9e4066Sahrens break; 2261ea8dc4b6Seschrock 2262ea8dc4b6Seschrock while (spa->spa_scrub_throttled > 0) 2263ea8dc4b6Seschrock cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2264fa9e4066Sahrens } 2265fa9e4066Sahrens 2266fa9e4066Sahrens while (spa->spa_scrub_inflight) 2267fa9e4066Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2268fa9e4066Sahrens 22695dabedeeSbonwick spa->spa_scrub_active = 0; 22705dabedeeSbonwick 
cv_broadcast(&spa->spa_scrub_cv); 22715dabedeeSbonwick 22725dabedeeSbonwick mutex_exit(&spa->spa_scrub_lock); 22735dabedeeSbonwick 22745dabedeeSbonwick spa_config_enter(spa, RW_WRITER, FTAG); 22755dabedeeSbonwick 22765dabedeeSbonwick mutex_enter(&spa->spa_scrub_lock); 22775dabedeeSbonwick 22785dabedeeSbonwick /* 22795dabedeeSbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 22805dabedeeSbonwick * AND the spa config lock to synchronize with any config changes 22815dabedeeSbonwick * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 22825dabedeeSbonwick */ 2283fa9e4066Sahrens if (spa->spa_scrub_restart_txg != 0) 2284fa9e4066Sahrens error = ERESTART; 2285fa9e4066Sahrens 2286ea8dc4b6Seschrock if (spa->spa_scrub_stop) 2287ea8dc4b6Seschrock error = EINTR; 2288ea8dc4b6Seschrock 2289fa9e4066Sahrens /* 2290ea8dc4b6Seschrock * Even if there were uncorrectable errors, we consider the scrub 2291ea8dc4b6Seschrock * completed. The downside is that if there is a transient error during 2292ea8dc4b6Seschrock * a resilver, we won't resilver the data properly to the target. But 2293ea8dc4b6Seschrock * if the damage is permanent (more likely) we will resilver forever, 2294ea8dc4b6Seschrock * which isn't really acceptable. Since there is enough information for 2295ea8dc4b6Seschrock * the user to know what has failed and why, this seems like a more 2296ea8dc4b6Seschrock * tractable approach. 2297fa9e4066Sahrens */ 2298ea8dc4b6Seschrock complete = (error == 0); 2299fa9e4066Sahrens 2300ea8dc4b6Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2301ea8dc4b6Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2302fa9e4066Sahrens spa->spa_scrub_maxtxg, complete ? 
"done" : "FAILED", 2303fa9e4066Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2304fa9e4066Sahrens 2305fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2306fa9e4066Sahrens 2307fa9e4066Sahrens /* 2308fa9e4066Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 2309fa9e4066Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 2310fa9e4066Sahrens */ 2311fa9e4066Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2312fa9e4066Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2313fa9e4066Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2314ea8dc4b6Seschrock spa_errlog_rotate(spa); 23155dabedeeSbonwick 2316ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2317fa9e4066Sahrens 2318fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2319fa9e4066Sahrens 2320ea8dc4b6Seschrock /* 2321ea8dc4b6Seschrock * We may have finished replacing a device. 2322ea8dc4b6Seschrock * Let the async thread assess this and handle the detach. 2323ea8dc4b6Seschrock */ 2324ea8dc4b6Seschrock spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2325fa9e4066Sahrens 2326fa9e4066Sahrens /* 2327fa9e4066Sahrens * If we were told to restart, our final act is to start a new scrub. 2328fa9e4066Sahrens */ 2329fa9e4066Sahrens if (error == ERESTART) 2330ea8dc4b6Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2331ea8dc4b6Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2332fa9e4066Sahrens 2333ea8dc4b6Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 2334ea8dc4b6Seschrock spa->spa_scrub_active = 0; 2335ea8dc4b6Seschrock spa->spa_scrub_thread = NULL; 2336ea8dc4b6Seschrock cv_broadcast(&spa->spa_scrub_cv); 2337fa9e4066Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2338fa9e4066Sahrens thread_exit(); 2339fa9e4066Sahrens } 2340fa9e4066Sahrens 2341fa9e4066Sahrens void 2342fa9e4066Sahrens spa_scrub_suspend(spa_t *spa) 2343fa9e4066Sahrens { 2344fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2345ea8dc4b6Seschrock spa->spa_scrub_suspended++; 2346fa9e4066Sahrens while (spa->spa_scrub_active) { 2347fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 2348fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2349fa9e4066Sahrens } 2350fa9e4066Sahrens while (spa->spa_scrub_inflight) 2351fa9e4066Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2352fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2353fa9e4066Sahrens } 2354fa9e4066Sahrens 2355fa9e4066Sahrens void 2356fa9e4066Sahrens spa_scrub_resume(spa_t *spa) 2357fa9e4066Sahrens { 2358fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2359ea8dc4b6Seschrock ASSERT(spa->spa_scrub_suspended != 0); 2360ea8dc4b6Seschrock if (--spa->spa_scrub_suspended == 0) 2361fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 2362fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2363fa9e4066Sahrens } 2364fa9e4066Sahrens 2365fa9e4066Sahrens void 2366fa9e4066Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 2367fa9e4066Sahrens { 2368fa9e4066Sahrens /* 2369fa9e4066Sahrens * Something happened (e.g. snapshot create/delete) that means 2370fa9e4066Sahrens * we must restart any in-progress scrubs. The itinerary will 2371fa9e4066Sahrens * fix this properly. 
2372fa9e4066Sahrens */ 2373fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2374fa9e4066Sahrens spa->spa_scrub_restart_txg = txg; 2375fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2376fa9e4066Sahrens } 2377fa9e4066Sahrens 2378ea8dc4b6Seschrock int 2379ea8dc4b6Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2380fa9e4066Sahrens { 2381fa9e4066Sahrens space_seg_t *ss; 2382fa9e4066Sahrens uint64_t mintxg, maxtxg; 2383fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 2384fa9e4066Sahrens 2385fa9e4066Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 2386fa9e4066Sahrens return (ENOTSUP); 2387fa9e4066Sahrens 2388ea8dc4b6Seschrock mutex_enter(&spa->spa_scrub_lock); 2389ea8dc4b6Seschrock 2390fa9e4066Sahrens /* 2391fa9e4066Sahrens * If there's a scrub or resilver already in progress, stop it. 2392fa9e4066Sahrens */ 2393fa9e4066Sahrens while (spa->spa_scrub_thread != NULL) { 2394fa9e4066Sahrens /* 2395fa9e4066Sahrens * Don't stop a resilver unless forced. 2396fa9e4066Sahrens */ 2397ea8dc4b6Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2398ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2399fa9e4066Sahrens return (EBUSY); 2400ea8dc4b6Seschrock } 2401fa9e4066Sahrens spa->spa_scrub_stop = 1; 2402fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 2403fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2404fa9e4066Sahrens } 2405fa9e4066Sahrens 2406fa9e4066Sahrens /* 2407fa9e4066Sahrens * Terminate the previous traverse. 
2408fa9e4066Sahrens */ 2409fa9e4066Sahrens if (spa->spa_scrub_th != NULL) { 2410fa9e4066Sahrens traverse_fini(spa->spa_scrub_th); 2411fa9e4066Sahrens spa->spa_scrub_th = NULL; 2412fa9e4066Sahrens } 2413fa9e4066Sahrens 2414ea8dc4b6Seschrock if (rvd == NULL) { 2415ea8dc4b6Seschrock ASSERT(spa->spa_scrub_stop == 0); 2416ea8dc4b6Seschrock ASSERT(spa->spa_scrub_type == type); 2417ea8dc4b6Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 2418ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2419ea8dc4b6Seschrock return (0); 2420ea8dc4b6Seschrock } 2421fa9e4066Sahrens 2422fa9e4066Sahrens mintxg = TXG_INITIAL - 1; 2423fa9e4066Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 2424fa9e4066Sahrens 2425ea8dc4b6Seschrock mutex_enter(&rvd->vdev_dtl_lock); 2426fa9e4066Sahrens 2427ea8dc4b6Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 2428ea8dc4b6Seschrock /* 2429ea8dc4b6Seschrock * The pool-wide DTL is empty. 2430ecc2d604Sbonwick * If this is a resilver, there's nothing to do except 2431ecc2d604Sbonwick * check whether any in-progress replacements have completed. 2432ea8dc4b6Seschrock */ 2433ecc2d604Sbonwick if (type == POOL_SCRUB_RESILVER) { 2434ea8dc4b6Seschrock type = POOL_SCRUB_NONE; 2435ecc2d604Sbonwick spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2436ecc2d604Sbonwick } 2437ea8dc4b6Seschrock } else { 2438ea8dc4b6Seschrock /* 2439ea8dc4b6Seschrock * The pool-wide DTL is non-empty. 2440ea8dc4b6Seschrock * If this is a normal scrub, upgrade to a resilver instead. 2441ea8dc4b6Seschrock */ 2442ea8dc4b6Seschrock if (type == POOL_SCRUB_EVERYTHING) 2443ea8dc4b6Seschrock type = POOL_SCRUB_RESILVER; 2444ea8dc4b6Seschrock } 2445fa9e4066Sahrens 2446ea8dc4b6Seschrock if (type == POOL_SCRUB_RESILVER) { 2447fa9e4066Sahrens /* 2448fa9e4066Sahrens * Determine the resilvering boundaries. 2449fa9e4066Sahrens * 2450fa9e4066Sahrens * Note: (mintxg, maxtxg) is an open interval, 2451fa9e4066Sahrens * i.e. mintxg and maxtxg themselves are not included. 
2452fa9e4066Sahrens * 2453fa9e4066Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2454fa9e4066Sahrens * so we don't claim to resilver a txg that's still changing. 2455fa9e4066Sahrens */ 2456fa9e4066Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2457ea8dc4b6Seschrock mintxg = ss->ss_start - 1; 2458fa9e4066Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2459ea8dc4b6Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 2460fa9e4066Sahrens } 2461fa9e4066Sahrens 2462ea8dc4b6Seschrock mutex_exit(&rvd->vdev_dtl_lock); 2463ea8dc4b6Seschrock 2464ea8dc4b6Seschrock spa->spa_scrub_stop = 0; 2465ea8dc4b6Seschrock spa->spa_scrub_type = type; 2466ea8dc4b6Seschrock spa->spa_scrub_restart_txg = 0; 2467ea8dc4b6Seschrock 2468ea8dc4b6Seschrock if (type != POOL_SCRUB_NONE) { 2469ea8dc4b6Seschrock spa->spa_scrub_mintxg = mintxg; 2470fa9e4066Sahrens spa->spa_scrub_maxtxg = maxtxg; 2471fa9e4066Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 24720373e76bSbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 24730373e76bSbonwick ZIO_FLAG_CANFAIL); 2474fa9e4066Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2475fa9e4066Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 2476fa9e4066Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2477fa9e4066Sahrens } 2478fa9e4066Sahrens 2479ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2480ea8dc4b6Seschrock 2481fa9e4066Sahrens return (0); 2482fa9e4066Sahrens } 2483fa9e4066Sahrens 2484ea8dc4b6Seschrock /* 2485ea8dc4b6Seschrock * ========================================================================== 2486ea8dc4b6Seschrock * SPA async task processing 2487ea8dc4b6Seschrock * ========================================================================== 2488ea8dc4b6Seschrock */ 2489ea8dc4b6Seschrock 2490ea8dc4b6Seschrock static void 2491ea8dc4b6Seschrock spa_async_reopen(spa_t *spa) 2492fa9e4066Sahrens { 2493ea8dc4b6Seschrock vdev_t *rvd = spa->spa_root_vdev; 2494ea8dc4b6Seschrock vdev_t *tvd; 
2495ea8dc4b6Seschrock int c; 2496fa9e4066Sahrens 2497ea8dc4b6Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 2498ea8dc4b6Seschrock 2499ea8dc4b6Seschrock for (c = 0; c < rvd->vdev_children; c++) { 2500ea8dc4b6Seschrock tvd = rvd->vdev_child[c]; 2501ea8dc4b6Seschrock if (tvd->vdev_reopen_wanted) { 2502ea8dc4b6Seschrock tvd->vdev_reopen_wanted = 0; 2503ea8dc4b6Seschrock vdev_reopen(tvd); 2504ea8dc4b6Seschrock } 2505ea8dc4b6Seschrock } 2506ea8dc4b6Seschrock 2507ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2508ea8dc4b6Seschrock } 2509fa9e4066Sahrens 2510ea8dc4b6Seschrock static void 2511ea8dc4b6Seschrock spa_async_thread(spa_t *spa) 2512ea8dc4b6Seschrock { 2513ea8dc4b6Seschrock int tasks; 2514ea8dc4b6Seschrock 2515ea8dc4b6Seschrock ASSERT(spa->spa_sync_on); 2516ea8dc4b6Seschrock 2517ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2518ea8dc4b6Seschrock tasks = spa->spa_async_tasks; 2519ea8dc4b6Seschrock spa->spa_async_tasks = 0; 2520ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2521ea8dc4b6Seschrock 25220373e76bSbonwick /* 25230373e76bSbonwick * See if the config needs to be updated. 25240373e76bSbonwick */ 25250373e76bSbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 25260373e76bSbonwick mutex_enter(&spa_namespace_lock); 25270373e76bSbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 25280373e76bSbonwick mutex_exit(&spa_namespace_lock); 25290373e76bSbonwick } 25300373e76bSbonwick 2531ea8dc4b6Seschrock /* 2532ea8dc4b6Seschrock * See if any devices need to be reopened. 2533ea8dc4b6Seschrock */ 2534ea8dc4b6Seschrock if (tasks & SPA_ASYNC_REOPEN) 2535ea8dc4b6Seschrock spa_async_reopen(spa); 2536ea8dc4b6Seschrock 2537ea8dc4b6Seschrock /* 2538ea8dc4b6Seschrock * If any devices are done replacing, detach them. 2539ea8dc4b6Seschrock */ 2540ea8dc4b6Seschrock if (tasks & SPA_ASYNC_REPLACE_DONE) 2541fa9e4066Sahrens spa_vdev_replace_done(spa); 2542fa9e4066Sahrens 2543ea8dc4b6Seschrock /* 2544ea8dc4b6Seschrock * Kick off a scrub. 
2545ea8dc4b6Seschrock */ 2546ea8dc4b6Seschrock if (tasks & SPA_ASYNC_SCRUB) 2547ea8dc4b6Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2548ea8dc4b6Seschrock 2549ea8dc4b6Seschrock /* 2550ea8dc4b6Seschrock * Kick off a resilver. 2551ea8dc4b6Seschrock */ 2552ea8dc4b6Seschrock if (tasks & SPA_ASYNC_RESILVER) 2553ea8dc4b6Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2554ea8dc4b6Seschrock 2555ea8dc4b6Seschrock /* 2556ea8dc4b6Seschrock * Let the world know that we're done. 2557ea8dc4b6Seschrock */ 2558ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2559ea8dc4b6Seschrock spa->spa_async_thread = NULL; 2560ea8dc4b6Seschrock cv_broadcast(&spa->spa_async_cv); 2561ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2562ea8dc4b6Seschrock thread_exit(); 2563ea8dc4b6Seschrock } 2564ea8dc4b6Seschrock 2565ea8dc4b6Seschrock void 2566ea8dc4b6Seschrock spa_async_suspend(spa_t *spa) 2567ea8dc4b6Seschrock { 2568ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2569ea8dc4b6Seschrock spa->spa_async_suspended++; 2570ea8dc4b6Seschrock while (spa->spa_async_thread != NULL) 2571ea8dc4b6Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2572ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2573ea8dc4b6Seschrock } 2574ea8dc4b6Seschrock 2575ea8dc4b6Seschrock void 2576ea8dc4b6Seschrock spa_async_resume(spa_t *spa) 2577ea8dc4b6Seschrock { 2578ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2579ea8dc4b6Seschrock ASSERT(spa->spa_async_suspended != 0); 2580ea8dc4b6Seschrock spa->spa_async_suspended--; 2581ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2582ea8dc4b6Seschrock } 2583ea8dc4b6Seschrock 2584ea8dc4b6Seschrock static void 2585ea8dc4b6Seschrock spa_async_dispatch(spa_t *spa) 2586ea8dc4b6Seschrock { 2587ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2588ea8dc4b6Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 25890373e76bSbonwick spa->spa_async_thread == NULL && 25900373e76bSbonwick rootdir != NULL 
&& !vn_is_readonly(rootdir)) 2591ea8dc4b6Seschrock spa->spa_async_thread = thread_create(NULL, 0, 2592ea8dc4b6Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2593ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2594ea8dc4b6Seschrock } 2595ea8dc4b6Seschrock 2596ea8dc4b6Seschrock void 2597ea8dc4b6Seschrock spa_async_request(spa_t *spa, int task) 2598ea8dc4b6Seschrock { 2599ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2600ea8dc4b6Seschrock spa->spa_async_tasks |= task; 2601ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2602fa9e4066Sahrens } 2603fa9e4066Sahrens 2604fa9e4066Sahrens /* 2605fa9e4066Sahrens * ========================================================================== 2606fa9e4066Sahrens * SPA syncing routines 2607fa9e4066Sahrens * ========================================================================== 2608fa9e4066Sahrens */ 2609fa9e4066Sahrens 2610fa9e4066Sahrens static void 2611fa9e4066Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2612fa9e4066Sahrens { 2613fa9e4066Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 2614fa9e4066Sahrens dmu_tx_t *tx; 2615fa9e4066Sahrens blkptr_t blk; 2616fa9e4066Sahrens uint64_t itor = 0; 2617fa9e4066Sahrens zio_t *zio; 2618fa9e4066Sahrens int error; 2619fa9e4066Sahrens uint8_t c = 1; 2620fa9e4066Sahrens 2621fa9e4066Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2622fa9e4066Sahrens 2623fa9e4066Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 2624fa9e4066Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2625fa9e4066Sahrens 2626fa9e4066Sahrens error = zio_wait(zio); 2627fa9e4066Sahrens ASSERT3U(error, ==, 0); 2628fa9e4066Sahrens 2629fa9e4066Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2630fa9e4066Sahrens bplist_vacate(bpl, tx); 2631fa9e4066Sahrens 2632fa9e4066Sahrens /* 2633fa9e4066Sahrens * Pre-dirty the first block so we sync to convergence faster. 2634fa9e4066Sahrens * (Usually only the first block is needed.) 
2635fa9e4066Sahrens */ 2636fa9e4066Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2637fa9e4066Sahrens dmu_tx_commit(tx); 2638fa9e4066Sahrens } 2639fa9e4066Sahrens 2640fa9e4066Sahrens static void 2641*99653d4eSeschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2642fa9e4066Sahrens { 2643fa9e4066Sahrens char *packed = NULL; 2644fa9e4066Sahrens size_t nvsize = 0; 2645fa9e4066Sahrens dmu_buf_t *db; 2646fa9e4066Sahrens 2647*99653d4eSeschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2648fa9e4066Sahrens 2649fa9e4066Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 2650fa9e4066Sahrens 2651*99653d4eSeschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2652ea8dc4b6Seschrock KM_SLEEP) == 0); 2653fa9e4066Sahrens 2654*99653d4eSeschrock dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2655fa9e4066Sahrens 2656fa9e4066Sahrens kmem_free(packed, nvsize); 2657fa9e4066Sahrens 2658*99653d4eSeschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2659fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 2660fa9e4066Sahrens *(uint64_t *)db->db_data = nvsize; 2661ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 2662fa9e4066Sahrens } 2663fa9e4066Sahrens 2664*99653d4eSeschrock static void 2665*99653d4eSeschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2666*99653d4eSeschrock { 2667*99653d4eSeschrock nvlist_t *nvroot; 2668*99653d4eSeschrock nvlist_t **spares; 2669*99653d4eSeschrock int i; 2670*99653d4eSeschrock 2671*99653d4eSeschrock if (!spa->spa_sync_spares) 2672*99653d4eSeschrock return; 2673*99653d4eSeschrock 2674*99653d4eSeschrock /* 2675*99653d4eSeschrock * Update the MOS nvlist describing the list of available spares. 2676*99653d4eSeschrock * spa_validate_spares() will have already made sure this nvlist is 2677*99653d4eSeschrock * valid and the vdevs are labelled appropriately. 
2678*99653d4eSeschrock */ 2679*99653d4eSeschrock if (spa->spa_spares_object == 0) { 2680*99653d4eSeschrock spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2681*99653d4eSeschrock DMU_OT_PACKED_NVLIST, 1 << 14, 2682*99653d4eSeschrock DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2683*99653d4eSeschrock VERIFY(zap_update(spa->spa_meta_objset, 2684*99653d4eSeschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2685*99653d4eSeschrock sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2686*99653d4eSeschrock } 2687*99653d4eSeschrock 2688*99653d4eSeschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2689*99653d4eSeschrock if (spa->spa_nspares == 0) { 2690*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2691*99653d4eSeschrock NULL, 0) == 0); 2692*99653d4eSeschrock } else { 2693*99653d4eSeschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2694*99653d4eSeschrock KM_SLEEP); 2695*99653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 2696*99653d4eSeschrock spares[i] = vdev_config_generate(spa, 2697*99653d4eSeschrock spa->spa_spares[i], B_FALSE, B_TRUE); 2698*99653d4eSeschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2699*99653d4eSeschrock spares, spa->spa_nspares) == 0); 2700*99653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 2701*99653d4eSeschrock nvlist_free(spares[i]); 2702*99653d4eSeschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2703*99653d4eSeschrock } 2704*99653d4eSeschrock 2705*99653d4eSeschrock spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2706*99653d4eSeschrock 2707*99653d4eSeschrock spa->spa_sync_spares = B_FALSE; 2708*99653d4eSeschrock } 2709*99653d4eSeschrock 2710*99653d4eSeschrock static void 2711*99653d4eSeschrock spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2712*99653d4eSeschrock { 2713*99653d4eSeschrock nvlist_t *config; 2714*99653d4eSeschrock 2715*99653d4eSeschrock if (list_is_empty(&spa->spa_dirty_list)) 
2716*99653d4eSeschrock return; 2717*99653d4eSeschrock 2718*99653d4eSeschrock config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2719*99653d4eSeschrock 2720*99653d4eSeschrock if (spa->spa_config_syncing) 2721*99653d4eSeschrock nvlist_free(spa->spa_config_syncing); 2722*99653d4eSeschrock spa->spa_config_syncing = config; 2723*99653d4eSeschrock 2724*99653d4eSeschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2725*99653d4eSeschrock } 2726*99653d4eSeschrock 2727fa9e4066Sahrens /* 2728fa9e4066Sahrens * Sync the specified transaction group. New blocks may be dirtied as 2729fa9e4066Sahrens * part of the process, so we iterate until it converges. 2730fa9e4066Sahrens */ 2731fa9e4066Sahrens void 2732fa9e4066Sahrens spa_sync(spa_t *spa, uint64_t txg) 2733fa9e4066Sahrens { 2734fa9e4066Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 2735fa9e4066Sahrens objset_t *mos = spa->spa_meta_objset; 2736fa9e4066Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 27370373e76bSbonwick vdev_t *rvd = spa->spa_root_vdev; 2738fa9e4066Sahrens vdev_t *vd; 2739fa9e4066Sahrens dmu_tx_t *tx; 2740fa9e4066Sahrens int dirty_vdevs; 2741fa9e4066Sahrens 2742fa9e4066Sahrens /* 2743fa9e4066Sahrens * Lock out configuration changes. 2744fa9e4066Sahrens */ 2745ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 2746fa9e4066Sahrens 2747fa9e4066Sahrens spa->spa_syncing_txg = txg; 2748fa9e4066Sahrens spa->spa_sync_pass = 0; 2749fa9e4066Sahrens 2750ea8dc4b6Seschrock VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2751fa9e4066Sahrens 2752*99653d4eSeschrock tx = dmu_tx_create_assigned(dp, txg); 2753*99653d4eSeschrock 2754*99653d4eSeschrock /* 2755*99653d4eSeschrock * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2756*99653d4eSeschrock * set spa_deflate if we have no raid-z vdevs. 
2757*99653d4eSeschrock */ 2758*99653d4eSeschrock if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2759*99653d4eSeschrock spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2760*99653d4eSeschrock int i; 2761*99653d4eSeschrock 2762*99653d4eSeschrock for (i = 0; i < rvd->vdev_children; i++) { 2763*99653d4eSeschrock vd = rvd->vdev_child[i]; 2764*99653d4eSeschrock if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2765*99653d4eSeschrock break; 2766*99653d4eSeschrock } 2767*99653d4eSeschrock if (i == rvd->vdev_children) { 2768*99653d4eSeschrock spa->spa_deflate = TRUE; 2769*99653d4eSeschrock VERIFY(0 == zap_add(spa->spa_meta_objset, 2770*99653d4eSeschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2771*99653d4eSeschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2772*99653d4eSeschrock } 2773*99653d4eSeschrock } 2774*99653d4eSeschrock 2775fa9e4066Sahrens /* 2776fa9e4066Sahrens * If anything has changed in this txg, push the deferred frees 2777fa9e4066Sahrens * from the previous txg. If not, leave them alone so that we 2778fa9e4066Sahrens * don't generate work on an otherwise idle system. 2779fa9e4066Sahrens */ 2780fa9e4066Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2781fa9e4066Sahrens !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2782fa9e4066Sahrens spa_sync_deferred_frees(spa, txg); 2783fa9e4066Sahrens 2784fa9e4066Sahrens /* 2785fa9e4066Sahrens * Iterate to convergence. 
2786fa9e4066Sahrens */ 2787fa9e4066Sahrens do { 2788fa9e4066Sahrens spa->spa_sync_pass++; 2789fa9e4066Sahrens 2790fa9e4066Sahrens spa_sync_config_object(spa, tx); 2791*99653d4eSeschrock spa_sync_spares(spa, tx); 2792ea8dc4b6Seschrock spa_errlog_sync(spa, txg); 2793fa9e4066Sahrens dsl_pool_sync(dp, txg); 2794fa9e4066Sahrens 2795fa9e4066Sahrens dirty_vdevs = 0; 2796fa9e4066Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2797fa9e4066Sahrens vdev_sync(vd, txg); 2798fa9e4066Sahrens dirty_vdevs++; 2799fa9e4066Sahrens } 2800fa9e4066Sahrens 2801fa9e4066Sahrens bplist_sync(bpl, tx); 2802fa9e4066Sahrens } while (dirty_vdevs); 2803fa9e4066Sahrens 2804fa9e4066Sahrens bplist_close(bpl); 2805fa9e4066Sahrens 2806fa9e4066Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2807fa9e4066Sahrens 2808fa9e4066Sahrens /* 2809fa9e4066Sahrens * Rewrite the vdev configuration (which includes the uberblock) 2810fa9e4066Sahrens * to commit the transaction group. 28110373e76bSbonwick * 28120373e76bSbonwick * If there are any dirty vdevs, sync the uberblock to all vdevs. 28130373e76bSbonwick * Otherwise, pick a random top-level vdev that's known to be 28140373e76bSbonwick * visible in the config cache (see spa_vdev_add() for details). 28150373e76bSbonwick * If the write fails, try the next vdev until we're tried them all. 
28160373e76bSbonwick */ 28170373e76bSbonwick if (!list_is_empty(&spa->spa_dirty_list)) { 28180373e76bSbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 28190373e76bSbonwick } else { 28200373e76bSbonwick int children = rvd->vdev_children; 28210373e76bSbonwick int c0 = spa_get_random(children); 28220373e76bSbonwick int c; 28230373e76bSbonwick 28240373e76bSbonwick for (c = 0; c < children; c++) { 28250373e76bSbonwick vd = rvd->vdev_child[(c0 + c) % children]; 28260373e76bSbonwick if (vd->vdev_ms_array == 0) 28270373e76bSbonwick continue; 28280373e76bSbonwick if (vdev_config_sync(vd, txg) == 0) 28290373e76bSbonwick break; 28300373e76bSbonwick } 28310373e76bSbonwick if (c == children) 28320373e76bSbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 28330373e76bSbonwick } 28340373e76bSbonwick 2835*99653d4eSeschrock dmu_tx_commit(tx); 2836*99653d4eSeschrock 28370373e76bSbonwick /* 28380373e76bSbonwick * Clear the dirty config list. 2839fa9e4066Sahrens */ 28400373e76bSbonwick while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 28410373e76bSbonwick vdev_config_clean(vd); 28420373e76bSbonwick 28430373e76bSbonwick /* 28440373e76bSbonwick * Now that the new config has synced transactionally, 28450373e76bSbonwick * let it become visible to the config cache. 28460373e76bSbonwick */ 28470373e76bSbonwick if (spa->spa_config_syncing != NULL) { 28480373e76bSbonwick spa_config_set(spa, spa->spa_config_syncing); 28490373e76bSbonwick spa->spa_config_txg = txg; 28500373e76bSbonwick spa->spa_config_syncing = NULL; 28510373e76bSbonwick } 2852fa9e4066Sahrens 2853fa9e4066Sahrens /* 2854fa9e4066Sahrens * Make a stable copy of the fully synced uberblock. 2855fa9e4066Sahrens * We use this as the root for pool traversals. 
2856fa9e4066Sahrens */ 2857fa9e4066Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2858fa9e4066Sahrens 2859fa9e4066Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2860fa9e4066Sahrens 2861fa9e4066Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2862fa9e4066Sahrens spa->spa_traverse_wanted = 0; 2863fa9e4066Sahrens spa->spa_ubsync = spa->spa_uberblock; 2864fa9e4066Sahrens rw_exit(&spa->spa_traverse_lock); 2865fa9e4066Sahrens 2866fa9e4066Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2867fa9e4066Sahrens 2868fa9e4066Sahrens /* 2869fa9e4066Sahrens * Clean up the ZIL records for the synced txg. 2870fa9e4066Sahrens */ 2871fa9e4066Sahrens dsl_pool_zil_clean(dp); 2872fa9e4066Sahrens 2873fa9e4066Sahrens /* 2874fa9e4066Sahrens * Update usable space statistics. 2875fa9e4066Sahrens */ 2876fa9e4066Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2877fa9e4066Sahrens vdev_sync_done(vd, txg); 2878fa9e4066Sahrens 2879fa9e4066Sahrens /* 2880fa9e4066Sahrens * It had better be the case that we didn't dirty anything 2881*99653d4eSeschrock * since vdev_config_sync(). 2882fa9e4066Sahrens */ 2883fa9e4066Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2884fa9e4066Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2885fa9e4066Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2886fa9e4066Sahrens ASSERT(bpl->bpl_queue == NULL); 2887fa9e4066Sahrens 2888ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2889ea8dc4b6Seschrock 2890ea8dc4b6Seschrock /* 2891ea8dc4b6Seschrock * If any async tasks have been requested, kick them off. 2892ea8dc4b6Seschrock */ 2893ea8dc4b6Seschrock spa_async_dispatch(spa); 2894fa9e4066Sahrens } 2895fa9e4066Sahrens 2896fa9e4066Sahrens /* 2897fa9e4066Sahrens * Sync all pools. 
We don't want to hold the namespace lock across these 2898fa9e4066Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 2899fa9e4066Sahrens * sync. 2900fa9e4066Sahrens */ 2901fa9e4066Sahrens void 2902fa9e4066Sahrens spa_sync_allpools(void) 2903fa9e4066Sahrens { 2904fa9e4066Sahrens spa_t *spa = NULL; 2905fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 2906fa9e4066Sahrens while ((spa = spa_next(spa)) != NULL) { 2907fa9e4066Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 2908fa9e4066Sahrens continue; 2909fa9e4066Sahrens spa_open_ref(spa, FTAG); 2910fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 2911fa9e4066Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 2912fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 2913fa9e4066Sahrens spa_close(spa, FTAG); 2914fa9e4066Sahrens } 2915fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 2916fa9e4066Sahrens } 2917fa9e4066Sahrens 2918fa9e4066Sahrens /* 2919fa9e4066Sahrens * ========================================================================== 2920fa9e4066Sahrens * Miscellaneous routines 2921fa9e4066Sahrens * ========================================================================== 2922fa9e4066Sahrens */ 2923fa9e4066Sahrens 2924fa9e4066Sahrens /* 2925fa9e4066Sahrens * Remove all pools in the system. 2926fa9e4066Sahrens */ 2927fa9e4066Sahrens void 2928fa9e4066Sahrens spa_evict_all(void) 2929fa9e4066Sahrens { 2930fa9e4066Sahrens spa_t *spa; 2931fa9e4066Sahrens 2932fa9e4066Sahrens /* 2933fa9e4066Sahrens * Remove all cached state. All pools should be closed now, 2934fa9e4066Sahrens * so every spa in the AVL tree should be unreferenced. 2935fa9e4066Sahrens */ 2936fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 2937fa9e4066Sahrens while ((spa = spa_next(NULL)) != NULL) { 2938fa9e4066Sahrens /* 2939ea8dc4b6Seschrock * Stop async tasks. 
The async thread may need to detach 2940ea8dc4b6Seschrock * a device that's been replaced, which requires grabbing 2941ea8dc4b6Seschrock * spa_namespace_lock, so we must drop it here. 2942fa9e4066Sahrens */ 2943fa9e4066Sahrens spa_open_ref(spa, FTAG); 2944fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 2945ea8dc4b6Seschrock spa_async_suspend(spa); 2946fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2947fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 2948fa9e4066Sahrens spa_close(spa, FTAG); 2949fa9e4066Sahrens 2950fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2951fa9e4066Sahrens spa_unload(spa); 2952fa9e4066Sahrens spa_deactivate(spa); 2953fa9e4066Sahrens } 2954fa9e4066Sahrens spa_remove(spa); 2955fa9e4066Sahrens } 2956fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 2957fa9e4066Sahrens } 2958ea8dc4b6Seschrock 2959ea8dc4b6Seschrock vdev_t * 2960ea8dc4b6Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2961ea8dc4b6Seschrock { 2962ea8dc4b6Seschrock return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2963ea8dc4b6Seschrock } 2964eaca9bbdSeschrock 2965eaca9bbdSeschrock void 2966eaca9bbdSeschrock spa_upgrade(spa_t *spa) 2967eaca9bbdSeschrock { 2968eaca9bbdSeschrock spa_config_enter(spa, RW_WRITER, FTAG); 2969eaca9bbdSeschrock 2970eaca9bbdSeschrock /* 2971eaca9bbdSeschrock * This should only be called for a non-faulted pool, and since a 2972eaca9bbdSeschrock * future version would result in an unopenable pool, this shouldn't be 2973eaca9bbdSeschrock * possible. 
2974eaca9bbdSeschrock */ 2975eaca9bbdSeschrock ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 2976eaca9bbdSeschrock 2977eaca9bbdSeschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 2978eaca9bbdSeschrock vdev_config_dirty(spa->spa_root_vdev); 2979eaca9bbdSeschrock 2980eaca9bbdSeschrock spa_config_exit(spa, FTAG); 2981*99653d4eSeschrock 2982*99653d4eSeschrock txg_wait_synced(spa_get_dsl(spa), 0); 2983*99653d4eSeschrock } 2984*99653d4eSeschrock 2985*99653d4eSeschrock boolean_t 2986*99653d4eSeschrock spa_has_spare(spa_t *spa, uint64_t guid) 2987*99653d4eSeschrock { 2988*99653d4eSeschrock int i; 2989*99653d4eSeschrock 2990*99653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 2991*99653d4eSeschrock if (spa->spa_spares[i]->vdev_guid == guid) 2992*99653d4eSeschrock return (B_TRUE); 2993*99653d4eSeschrock 2994*99653d4eSeschrock return (B_FALSE); 2995eaca9bbdSeschrock } 2996