1*fa9e4066Sahrens /* 2*fa9e4066Sahrens * CDDL HEADER START 3*fa9e4066Sahrens * 4*fa9e4066Sahrens * The contents of this file are subject to the terms of the 5*fa9e4066Sahrens * Common Development and Distribution License, Version 1.0 only 6*fa9e4066Sahrens * (the "License"). You may not use this file except in compliance 7*fa9e4066Sahrens * with the License. 8*fa9e4066Sahrens * 9*fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 11*fa9e4066Sahrens * See the License for the specific language governing permissions 12*fa9e4066Sahrens * and limitations under the License. 13*fa9e4066Sahrens * 14*fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 15*fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 17*fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 18*fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 19*fa9e4066Sahrens * 20*fa9e4066Sahrens * CDDL HEADER END 21*fa9e4066Sahrens */ 22*fa9e4066Sahrens /* 23*fa9e4066Sahrens * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24*fa9e4066Sahrens * Use is subject to license terms. 25*fa9e4066Sahrens */ 26*fa9e4066Sahrens 27*fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28*fa9e4066Sahrens 29*fa9e4066Sahrens /* 30*fa9e4066Sahrens * This file contains all the routines used when modifying on-disk SPA state. 31*fa9e4066Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 32*fa9e4066Sahrens * pool. 33*fa9e4066Sahrens */ 34*fa9e4066Sahrens 35*fa9e4066Sahrens #include <sys/zfs_context.h> 36*fa9e4066Sahrens #include <sys/spa_impl.h> 37*fa9e4066Sahrens #include <sys/zio.h> 38*fa9e4066Sahrens #include <sys/zio_checksum.h> 39*fa9e4066Sahrens #include <sys/zio_compress.h> 40*fa9e4066Sahrens #include <sys/dmu.h> 41*fa9e4066Sahrens #include <sys/dmu_tx.h> 42*fa9e4066Sahrens #include <sys/zap.h> 43*fa9e4066Sahrens #include <sys/zil.h> 44*fa9e4066Sahrens #include <sys/vdev_impl.h> 45*fa9e4066Sahrens #include <sys/metaslab.h> 46*fa9e4066Sahrens #include <sys/uberblock_impl.h> 47*fa9e4066Sahrens #include <sys/txg.h> 48*fa9e4066Sahrens #include <sys/avl.h> 49*fa9e4066Sahrens #include <sys/dmu_traverse.h> 50*fa9e4066Sahrens #include <sys/unique.h> 51*fa9e4066Sahrens #include <sys/dsl_pool.h> 52*fa9e4066Sahrens #include <sys/dsl_dir.h> 53*fa9e4066Sahrens #include <sys/dsl_prop.h> 54*fa9e4066Sahrens #include <sys/fs/zfs.h> 55*fa9e4066Sahrens #include <sys/callb.h> 56*fa9e4066Sahrens 57*fa9e4066Sahrens static uint32_t spa_active_count; 58*fa9e4066Sahrens 59*fa9e4066Sahrens /* 60*fa9e4066Sahrens * ========================================================================== 61*fa9e4066Sahrens * SPA state manipulation (open/create/destroy/import/export) 62*fa9e4066Sahrens * ========================================================================== 63*fa9e4066Sahrens */ 64*fa9e4066Sahrens 65*fa9e4066Sahrens /* 66*fa9e4066Sahrens * Activate an uninitialized pool. 67*fa9e4066Sahrens */ 68*fa9e4066Sahrens static void 69*fa9e4066Sahrens spa_activate(spa_t *spa) 70*fa9e4066Sahrens { 71*fa9e4066Sahrens int t; 72*fa9e4066Sahrens 73*fa9e4066Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 74*fa9e4066Sahrens 75*fa9e4066Sahrens spa->spa_state = POOL_STATE_ACTIVE; 76*fa9e4066Sahrens 77*fa9e4066Sahrens spa->spa_normal_class = metaslab_class_create(); 78*fa9e4066Sahrens 79*fa9e4066Sahrens spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry", 80*fa9e4066Sahrens 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 81*fa9e4066Sahrens 82*fa9e4066Sahrens for (t = 0; t < ZIO_TYPES; t++) { 83*fa9e4066Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 84*fa9e4066Sahrens 8, maxclsyspri, 50, INT_MAX, 85*fa9e4066Sahrens TASKQ_PREPOPULATE); 86*fa9e4066Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 87*fa9e4066Sahrens 8, maxclsyspri, 50, INT_MAX, 88*fa9e4066Sahrens TASKQ_PREPOPULATE); 89*fa9e4066Sahrens } 90*fa9e4066Sahrens 91*fa9e4066Sahrens rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 92*fa9e4066Sahrens 93*fa9e4066Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 94*fa9e4066Sahrens offsetof(vdev_t, vdev_dirty_node)); 95*fa9e4066Sahrens 96*fa9e4066Sahrens txg_list_create(&spa->spa_vdev_txg_list, 97*fa9e4066Sahrens offsetof(struct vdev, vdev_txg_node)); 98*fa9e4066Sahrens } 99*fa9e4066Sahrens 100*fa9e4066Sahrens /* 101*fa9e4066Sahrens * Opposite of spa_activate(). 102*fa9e4066Sahrens */ 103*fa9e4066Sahrens static void 104*fa9e4066Sahrens spa_deactivate(spa_t *spa) 105*fa9e4066Sahrens { 106*fa9e4066Sahrens int t; 107*fa9e4066Sahrens 108*fa9e4066Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 109*fa9e4066Sahrens ASSERT(spa->spa_dsl_pool == NULL); 110*fa9e4066Sahrens ASSERT(spa->spa_root_vdev == NULL); 111*fa9e4066Sahrens 112*fa9e4066Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 113*fa9e4066Sahrens 114*fa9e4066Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 115*fa9e4066Sahrens 116*fa9e4066Sahrens list_destroy(&spa->spa_dirty_list); 117*fa9e4066Sahrens 118*fa9e4066Sahrens rw_destroy(&spa->spa_traverse_lock); 119*fa9e4066Sahrens 120*fa9e4066Sahrens for (t = 0; t < ZIO_TYPES; t++) { 121*fa9e4066Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 122*fa9e4066Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 123*fa9e4066Sahrens spa->spa_zio_issue_taskq[t] = NULL; 124*fa9e4066Sahrens spa->spa_zio_intr_taskq[t] = NULL; 125*fa9e4066Sahrens } 126*fa9e4066Sahrens 127*fa9e4066Sahrens taskq_destroy(spa->spa_vdev_retry_taskq); 128*fa9e4066Sahrens spa->spa_vdev_retry_taskq = NULL; 129*fa9e4066Sahrens 130*fa9e4066Sahrens metaslab_class_destroy(spa->spa_normal_class); 131*fa9e4066Sahrens spa->spa_normal_class = NULL; 132*fa9e4066Sahrens 133*fa9e4066Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 134*fa9e4066Sahrens } 135*fa9e4066Sahrens 136*fa9e4066Sahrens /* 137*fa9e4066Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. This 138*fa9e4066Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 139*fa9e4066Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 140*fa9e4066Sahrens * All vdev validation is done by the vdev_alloc() routine. 141*fa9e4066Sahrens */ 142*fa9e4066Sahrens static vdev_t * 143*fa9e4066Sahrens spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 144*fa9e4066Sahrens { 145*fa9e4066Sahrens nvlist_t **child; 146*fa9e4066Sahrens uint_t c, children; 147*fa9e4066Sahrens vdev_t *vd; 148*fa9e4066Sahrens 149*fa9e4066Sahrens if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 150*fa9e4066Sahrens return (NULL); 151*fa9e4066Sahrens 152*fa9e4066Sahrens if (vd->vdev_ops->vdev_op_leaf) 153*fa9e4066Sahrens return (vd); 154*fa9e4066Sahrens 155*fa9e4066Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 156*fa9e4066Sahrens &child, &children) != 0) { 157*fa9e4066Sahrens vdev_free(vd); 158*fa9e4066Sahrens return (NULL); 159*fa9e4066Sahrens } 160*fa9e4066Sahrens 161*fa9e4066Sahrens for (c = 0; c < children; c++) { 162*fa9e4066Sahrens if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 163*fa9e4066Sahrens vdev_free(vd); 164*fa9e4066Sahrens return (NULL); 165*fa9e4066Sahrens } 166*fa9e4066Sahrens } 167*fa9e4066Sahrens 168*fa9e4066Sahrens return (vd); 169*fa9e4066Sahrens } 170*fa9e4066Sahrens 171*fa9e4066Sahrens /* 172*fa9e4066Sahrens * Opposite of spa_load(). 173*fa9e4066Sahrens */ 174*fa9e4066Sahrens static void 175*fa9e4066Sahrens spa_unload(spa_t *spa) 176*fa9e4066Sahrens { 177*fa9e4066Sahrens /* 178*fa9e4066Sahrens * Stop syncing. 179*fa9e4066Sahrens */ 180*fa9e4066Sahrens if (spa->spa_sync_on) { 181*fa9e4066Sahrens txg_sync_stop(spa->spa_dsl_pool); 182*fa9e4066Sahrens spa->spa_sync_on = B_FALSE; 183*fa9e4066Sahrens } 184*fa9e4066Sahrens 185*fa9e4066Sahrens /* 186*fa9e4066Sahrens * Wait for any outstanding prefetch I/O to complete. 187*fa9e4066Sahrens */ 188*fa9e4066Sahrens spa_config_enter(spa, RW_WRITER); 189*fa9e4066Sahrens spa_config_exit(spa); 190*fa9e4066Sahrens 191*fa9e4066Sahrens /* 192*fa9e4066Sahrens * Close the dsl pool. 193*fa9e4066Sahrens */ 194*fa9e4066Sahrens if (spa->spa_dsl_pool) { 195*fa9e4066Sahrens dsl_pool_close(spa->spa_dsl_pool); 196*fa9e4066Sahrens spa->spa_dsl_pool = NULL; 197*fa9e4066Sahrens } 198*fa9e4066Sahrens 199*fa9e4066Sahrens /* 200*fa9e4066Sahrens * Close all vdevs. 201*fa9e4066Sahrens */ 202*fa9e4066Sahrens if (spa->spa_root_vdev) { 203*fa9e4066Sahrens vdev_free(spa->spa_root_vdev); 204*fa9e4066Sahrens spa->spa_root_vdev = NULL; 205*fa9e4066Sahrens } 206*fa9e4066Sahrens } 207*fa9e4066Sahrens 208*fa9e4066Sahrens /* 209*fa9e4066Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 210*fa9e4066Sahrens * source of configuration information. The 'readonly' flag will prevent us 211*fa9e4066Sahrens * from writing any updated state to disk, and can be use when testing a pool 212*fa9e4066Sahrens * for import. 213*fa9e4066Sahrens */ 214*fa9e4066Sahrens static int 215*fa9e4066Sahrens spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) 216*fa9e4066Sahrens { 217*fa9e4066Sahrens int error = 0; 218*fa9e4066Sahrens nvlist_t *nvroot = NULL; 219*fa9e4066Sahrens vdev_t *rvd; 220*fa9e4066Sahrens uberblock_t *ub = &spa->spa_uberblock; 221*fa9e4066Sahrens uint64_t pool_guid; 222*fa9e4066Sahrens zio_t *zio; 223*fa9e4066Sahrens 224*fa9e4066Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 225*fa9e4066Sahrens nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 226*fa9e4066Sahrens return (EINVAL); 227*fa9e4066Sahrens 228*fa9e4066Sahrens (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 229*fa9e4066Sahrens &spa->spa_config_txg); 230*fa9e4066Sahrens 231*fa9e4066Sahrens if (import && spa_guid_exists(pool_guid, 0)) 232*fa9e4066Sahrens return (EEXIST); 233*fa9e4066Sahrens 234*fa9e4066Sahrens /* 235*fa9e4066Sahrens * Parse the configuration into a vdev tree. 236*fa9e4066Sahrens */ 237*fa9e4066Sahrens spa_config_enter(spa, RW_WRITER); 238*fa9e4066Sahrens rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 239*fa9e4066Sahrens spa_config_exit(spa); 240*fa9e4066Sahrens 241*fa9e4066Sahrens if (rvd == NULL) 242*fa9e4066Sahrens return (EINVAL); 243*fa9e4066Sahrens 244*fa9e4066Sahrens spa->spa_root_vdev = rvd; 245*fa9e4066Sahrens ASSERT(spa_guid(spa) == pool_guid); 246*fa9e4066Sahrens 247*fa9e4066Sahrens /* 248*fa9e4066Sahrens * Try to open all vdevs, loading each label in the process. 249*fa9e4066Sahrens */ 250*fa9e4066Sahrens if (vdev_open(rvd) != 0) 251*fa9e4066Sahrens return (ENXIO); 252*fa9e4066Sahrens 253*fa9e4066Sahrens /* 254*fa9e4066Sahrens * Find the best uberblock. 255*fa9e4066Sahrens */ 256*fa9e4066Sahrens bzero(ub, sizeof (uberblock_t)); 257*fa9e4066Sahrens 258*fa9e4066Sahrens zio = zio_root(spa, NULL, NULL, 259*fa9e4066Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 260*fa9e4066Sahrens vdev_uberblock_load(zio, rvd, ub); 261*fa9e4066Sahrens error = zio_wait(zio); 262*fa9e4066Sahrens 263*fa9e4066Sahrens /* 264*fa9e4066Sahrens * If we weren't able to find a single valid uberblock, return failure. 265*fa9e4066Sahrens */ 266*fa9e4066Sahrens if (ub->ub_txg == 0) { 267*fa9e4066Sahrens dprintf("ub_txg is zero\n"); 268*fa9e4066Sahrens return (ENXIO); 269*fa9e4066Sahrens } 270*fa9e4066Sahrens 271*fa9e4066Sahrens /* 272*fa9e4066Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 273*fa9e4066Sahrens * incomplete configuration. 274*fa9e4066Sahrens */ 275*fa9e4066Sahrens if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 276*fa9e4066Sahrens rvd->vdev_state = VDEV_STATE_CANT_OPEN; 277*fa9e4066Sahrens rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM; 278*fa9e4066Sahrens dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n", 279*fa9e4066Sahrens rvd->vdev_guid_sum, ub->ub_guid_sum); 280*fa9e4066Sahrens return (ENXIO); 281*fa9e4066Sahrens } 282*fa9e4066Sahrens 283*fa9e4066Sahrens /* 284*fa9e4066Sahrens * Initialize internal SPA structures. 285*fa9e4066Sahrens */ 286*fa9e4066Sahrens spa->spa_state = POOL_STATE_ACTIVE; 287*fa9e4066Sahrens spa->spa_ubsync = spa->spa_uberblock; 288*fa9e4066Sahrens spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 289*fa9e4066Sahrens spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg); 290*fa9e4066Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 291*fa9e4066Sahrens 292*fa9e4066Sahrens VERIFY(zap_lookup(spa->spa_meta_objset, 293*fa9e4066Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 294*fa9e4066Sahrens sizeof (uint64_t), 1, &spa->spa_config_object) == 0); 295*fa9e4066Sahrens 296*fa9e4066Sahrens if (!mosconfig) { 297*fa9e4066Sahrens dmu_buf_t *db; 298*fa9e4066Sahrens char *packed = NULL; 299*fa9e4066Sahrens size_t nvsize = 0; 300*fa9e4066Sahrens nvlist_t *newconfig = NULL; 301*fa9e4066Sahrens 302*fa9e4066Sahrens db = dmu_bonus_hold(spa->spa_meta_objset, 303*fa9e4066Sahrens spa->spa_config_object); 304*fa9e4066Sahrens dmu_buf_read(db); 305*fa9e4066Sahrens nvsize = *(uint64_t *)db->db_data; 306*fa9e4066Sahrens dmu_buf_rele(db); 307*fa9e4066Sahrens 308*fa9e4066Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 309*fa9e4066Sahrens error = dmu_read_canfail(spa->spa_meta_objset, 310*fa9e4066Sahrens spa->spa_config_object, 0, nvsize, packed); 311*fa9e4066Sahrens if (error == 0) 312*fa9e4066Sahrens error = nvlist_unpack(packed, nvsize, &newconfig, 0); 313*fa9e4066Sahrens kmem_free(packed, nvsize); 314*fa9e4066Sahrens 315*fa9e4066Sahrens if (error) 316*fa9e4066Sahrens return (ENXIO); 317*fa9e4066Sahrens 318*fa9e4066Sahrens spa_config_set(spa, newconfig); 319*fa9e4066Sahrens 320*fa9e4066Sahrens spa_unload(spa); 321*fa9e4066Sahrens spa_deactivate(spa); 322*fa9e4066Sahrens spa_activate(spa); 323*fa9e4066Sahrens 324*fa9e4066Sahrens return (spa_load(spa, newconfig, readonly, import, B_TRUE)); 325*fa9e4066Sahrens } 326*fa9e4066Sahrens 327*fa9e4066Sahrens VERIFY(zap_lookup(spa->spa_meta_objset, 328*fa9e4066Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 329*fa9e4066Sahrens sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0); 330*fa9e4066Sahrens 331*fa9e4066Sahrens /* 332*fa9e4066Sahrens * Load the vdev state for all top level vdevs. 333*fa9e4066Sahrens */ 334*fa9e4066Sahrens if ((error = vdev_load(rvd, import)) != 0) 335*fa9e4066Sahrens return (error); 336*fa9e4066Sahrens 337*fa9e4066Sahrens /* 338*fa9e4066Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 339*fa9e4066Sahrens */ 340*fa9e4066Sahrens spa_config_enter(spa, RW_WRITER); 341*fa9e4066Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 342*fa9e4066Sahrens spa_config_exit(spa); 343*fa9e4066Sahrens 344*fa9e4066Sahrens /* 345*fa9e4066Sahrens * Check the state of the root vdev. If it can't be opened, it 346*fa9e4066Sahrens * indicates one or more toplevel vdevs are faulted. 347*fa9e4066Sahrens */ 348*fa9e4066Sahrens if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 349*fa9e4066Sahrens return (ENXIO); 350*fa9e4066Sahrens 351*fa9e4066Sahrens /* 352*fa9e4066Sahrens * Claim log blocks that haven't been committed yet, and update all 353*fa9e4066Sahrens * top-level vdevs to sync any config changes found in vdev_load(). 354*fa9e4066Sahrens * This must all happen in a single txg. 355*fa9e4066Sahrens */ 356*fa9e4066Sahrens if ((spa_mode & FWRITE) && !readonly) { 357*fa9e4066Sahrens dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), 358*fa9e4066Sahrens spa_first_txg(spa)); 359*fa9e4066Sahrens dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 360*fa9e4066Sahrens vdev_config_dirty(rvd); 361*fa9e4066Sahrens dmu_tx_commit(tx); 362*fa9e4066Sahrens 363*fa9e4066Sahrens spa->spa_sync_on = B_TRUE; 364*fa9e4066Sahrens txg_sync_start(spa->spa_dsl_pool); 365*fa9e4066Sahrens 366*fa9e4066Sahrens /* 367*fa9e4066Sahrens * Wait for all claims to sync. 368*fa9e4066Sahrens */ 369*fa9e4066Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 370*fa9e4066Sahrens } 371*fa9e4066Sahrens 372*fa9e4066Sahrens return (0); 373*fa9e4066Sahrens } 374*fa9e4066Sahrens 375*fa9e4066Sahrens /* 376*fa9e4066Sahrens * Pool Open/Import 377*fa9e4066Sahrens * 378*fa9e4066Sahrens * The import case is identical to an open except that the configuration is sent 379*fa9e4066Sahrens * down from userland, instead of grabbed from the configuration cache. For the 380*fa9e4066Sahrens * case of an open, the pool configuration will exist in the 381*fa9e4066Sahrens * POOL_STATE_UNITIALIZED state. 382*fa9e4066Sahrens * 383*fa9e4066Sahrens * The stats information (gen/count/ustats) is used to gather vdev statistics at 384*fa9e4066Sahrens * the same time open the pool, without having to keep around the spa_t in some 385*fa9e4066Sahrens * ambiguous state. 386*fa9e4066Sahrens */ 387*fa9e4066Sahrens static int 388*fa9e4066Sahrens spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 389*fa9e4066Sahrens { 390*fa9e4066Sahrens spa_t *spa; 391*fa9e4066Sahrens int error; 392*fa9e4066Sahrens int loaded = B_FALSE; 393*fa9e4066Sahrens int locked = B_FALSE; 394*fa9e4066Sahrens 395*fa9e4066Sahrens *spapp = NULL; 396*fa9e4066Sahrens 397*fa9e4066Sahrens /* 398*fa9e4066Sahrens * As disgusting as this is, we need to support recursive calls to this 399*fa9e4066Sahrens * function because dsl_dir_open() is called during spa_load(), and ends 400*fa9e4066Sahrens * up calling spa_open() again. The real fix is to figure out how to 401*fa9e4066Sahrens * avoid dsl_dir_open() calling this in the first place. 402*fa9e4066Sahrens */ 403*fa9e4066Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 404*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 405*fa9e4066Sahrens locked = B_TRUE; 406*fa9e4066Sahrens } 407*fa9e4066Sahrens 408*fa9e4066Sahrens if ((spa = spa_lookup(pool)) == NULL) { 409*fa9e4066Sahrens if (locked) 410*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 411*fa9e4066Sahrens return (ENOENT); 412*fa9e4066Sahrens } 413*fa9e4066Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 414*fa9e4066Sahrens 415*fa9e4066Sahrens spa_activate(spa); 416*fa9e4066Sahrens 417*fa9e4066Sahrens error = spa_load(spa, spa->spa_config, 418*fa9e4066Sahrens B_FALSE, B_FALSE, B_FALSE); 419*fa9e4066Sahrens 420*fa9e4066Sahrens if (error == EBADF) { 421*fa9e4066Sahrens /* 422*fa9e4066Sahrens * If vdev_load() returns EBADF, it indicates that one 423*fa9e4066Sahrens * of the vdevs indicates that the pool has been 424*fa9e4066Sahrens * exported or destroyed. If this is the case, the 425*fa9e4066Sahrens * config cache is out of sync and we should remove the 426*fa9e4066Sahrens * pool from the namespace. 427*fa9e4066Sahrens */ 428*fa9e4066Sahrens spa_unload(spa); 429*fa9e4066Sahrens spa_deactivate(spa); 430*fa9e4066Sahrens spa_remove(spa); 431*fa9e4066Sahrens spa_config_sync(); 432*fa9e4066Sahrens if (locked) 433*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 434*fa9e4066Sahrens return (ENOENT); 435*fa9e4066Sahrens } if (error) { 436*fa9e4066Sahrens /* 437*fa9e4066Sahrens * We can't open the pool, but we still have useful 438*fa9e4066Sahrens * information: the state of each vdev after the 439*fa9e4066Sahrens * attempted vdev_open(). Return this to the user. 440*fa9e4066Sahrens */ 441*fa9e4066Sahrens if (config != NULL && spa->spa_root_vdev != NULL) 442*fa9e4066Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 443*fa9e4066Sahrens B_TRUE); 444*fa9e4066Sahrens spa_unload(spa); 445*fa9e4066Sahrens spa_deactivate(spa); 446*fa9e4066Sahrens if (locked) 447*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 448*fa9e4066Sahrens *spapp = NULL; 449*fa9e4066Sahrens return (error); 450*fa9e4066Sahrens } 451*fa9e4066Sahrens 452*fa9e4066Sahrens loaded = B_TRUE; 453*fa9e4066Sahrens } 454*fa9e4066Sahrens 455*fa9e4066Sahrens spa_open_ref(spa, tag); 456*fa9e4066Sahrens if (locked) 457*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 458*fa9e4066Sahrens 459*fa9e4066Sahrens *spapp = spa; 460*fa9e4066Sahrens 461*fa9e4066Sahrens if (config != NULL) { 462*fa9e4066Sahrens spa_config_enter(spa, RW_READER); 463*fa9e4066Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 464*fa9e4066Sahrens spa_config_exit(spa); 465*fa9e4066Sahrens } 466*fa9e4066Sahrens 467*fa9e4066Sahrens /* 468*fa9e4066Sahrens * If we just loaded the pool, resilver anything that's out of date. 469*fa9e4066Sahrens */ 470*fa9e4066Sahrens if (loaded && (spa_mode & FWRITE)) 471*fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 472*fa9e4066Sahrens 473*fa9e4066Sahrens return (0); 474*fa9e4066Sahrens } 475*fa9e4066Sahrens 476*fa9e4066Sahrens int 477*fa9e4066Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 478*fa9e4066Sahrens { 479*fa9e4066Sahrens return (spa_open_common(name, spapp, tag, NULL)); 480*fa9e4066Sahrens } 481*fa9e4066Sahrens 482*fa9e4066Sahrens int 483*fa9e4066Sahrens spa_get_stats(const char *name, nvlist_t **config) 484*fa9e4066Sahrens { 485*fa9e4066Sahrens int error; 486*fa9e4066Sahrens spa_t *spa; 487*fa9e4066Sahrens 488*fa9e4066Sahrens *config = NULL; 489*fa9e4066Sahrens error = spa_open_common(name, &spa, FTAG, config); 490*fa9e4066Sahrens 491*fa9e4066Sahrens if (spa != NULL) 492*fa9e4066Sahrens spa_close(spa, FTAG); 493*fa9e4066Sahrens 494*fa9e4066Sahrens return (error); 495*fa9e4066Sahrens } 496*fa9e4066Sahrens 497*fa9e4066Sahrens /* 498*fa9e4066Sahrens * Pool Creation 499*fa9e4066Sahrens */ 500*fa9e4066Sahrens int 501*fa9e4066Sahrens spa_create(const char *pool, nvlist_t *nvroot, char *altroot) 502*fa9e4066Sahrens { 503*fa9e4066Sahrens spa_t *spa; 504*fa9e4066Sahrens dsl_pool_t *dp; 505*fa9e4066Sahrens dmu_tx_t *tx; 506*fa9e4066Sahrens int error; 507*fa9e4066Sahrens uint64_t txg = TXG_INITIAL; 508*fa9e4066Sahrens 509*fa9e4066Sahrens /* 510*fa9e4066Sahrens * If this pool already exists, return failure. 511*fa9e4066Sahrens */ 512*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 513*fa9e4066Sahrens if (spa_lookup(pool) != NULL) { 514*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 515*fa9e4066Sahrens return (EEXIST); 516*fa9e4066Sahrens } 517*fa9e4066Sahrens spa = spa_add(pool); 518*fa9e4066Sahrens 519*fa9e4066Sahrens /* 520*fa9e4066Sahrens * Allocate a new spa_t structure. 521*fa9e4066Sahrens */ 522*fa9e4066Sahrens spa_activate(spa); 523*fa9e4066Sahrens 524*fa9e4066Sahrens spa->spa_uberblock.ub_txg = txg - 1; 525*fa9e4066Sahrens spa->spa_ubsync = spa->spa_uberblock; 526*fa9e4066Sahrens 527*fa9e4066Sahrens error = spa_vdev_add(spa, nvroot); 528*fa9e4066Sahrens 529*fa9e4066Sahrens if (error) { 530*fa9e4066Sahrens spa_unload(spa); 531*fa9e4066Sahrens spa_deactivate(spa); 532*fa9e4066Sahrens spa_remove(spa); 533*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 534*fa9e4066Sahrens return (error); 535*fa9e4066Sahrens } 536*fa9e4066Sahrens 537*fa9e4066Sahrens if (altroot != NULL) { 538*fa9e4066Sahrens spa->spa_root = spa_strdup(altroot); 539*fa9e4066Sahrens atomic_add_32(&spa_active_count, 1); 540*fa9e4066Sahrens } 541*fa9e4066Sahrens 542*fa9e4066Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 543*fa9e4066Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 544*fa9e4066Sahrens 545*fa9e4066Sahrens tx = dmu_tx_create_assigned(dp, txg); 546*fa9e4066Sahrens 547*fa9e4066Sahrens /* 548*fa9e4066Sahrens * Create the pool config object. 549*fa9e4066Sahrens */ 550*fa9e4066Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 551*fa9e4066Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 552*fa9e4066Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 553*fa9e4066Sahrens 554*fa9e4066Sahrens VERIFY(zap_add(spa->spa_meta_objset, 555*fa9e4066Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 556*fa9e4066Sahrens sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0); 557*fa9e4066Sahrens 558*fa9e4066Sahrens /* 559*fa9e4066Sahrens * Create the deferred-free bplist object. Turn off compression 560*fa9e4066Sahrens * because sync-to-convergence takes longer if the blocksize 561*fa9e4066Sahrens * keeps changing. 562*fa9e4066Sahrens */ 563*fa9e4066Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 564*fa9e4066Sahrens 1 << 14, tx); 565*fa9e4066Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 566*fa9e4066Sahrens ZIO_COMPRESS_OFF, tx); 567*fa9e4066Sahrens 568*fa9e4066Sahrens VERIFY(zap_add(spa->spa_meta_objset, 569*fa9e4066Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 570*fa9e4066Sahrens sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0); 571*fa9e4066Sahrens 572*fa9e4066Sahrens dmu_tx_commit(tx); 573*fa9e4066Sahrens 574*fa9e4066Sahrens spa->spa_sync_on = B_TRUE; 575*fa9e4066Sahrens txg_sync_start(spa->spa_dsl_pool); 576*fa9e4066Sahrens 577*fa9e4066Sahrens /* 578*fa9e4066Sahrens * We explicitly wait for the first transaction to complete so that our 579*fa9e4066Sahrens * bean counters are appropriately updated. 580*fa9e4066Sahrens */ 581*fa9e4066Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 582*fa9e4066Sahrens 583*fa9e4066Sahrens spa_config_sync(); 584*fa9e4066Sahrens 585*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 586*fa9e4066Sahrens 587*fa9e4066Sahrens return (0); 588*fa9e4066Sahrens } 589*fa9e4066Sahrens 590*fa9e4066Sahrens /* 591*fa9e4066Sahrens * Import the given pool into the system. We set up the necessary spa_t and 592*fa9e4066Sahrens * then call spa_load() to do the dirty work. 593*fa9e4066Sahrens */ 594*fa9e4066Sahrens int 595*fa9e4066Sahrens spa_import(const char *pool, nvlist_t *config, char *altroot) 596*fa9e4066Sahrens { 597*fa9e4066Sahrens spa_t *spa; 598*fa9e4066Sahrens int error; 599*fa9e4066Sahrens 600*fa9e4066Sahrens if (!(spa_mode & FWRITE)) 601*fa9e4066Sahrens return (EROFS); 602*fa9e4066Sahrens 603*fa9e4066Sahrens /* 604*fa9e4066Sahrens * If a pool with this name exists, return failure. 605*fa9e4066Sahrens */ 606*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 607*fa9e4066Sahrens if (spa_lookup(pool) != NULL) { 608*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 609*fa9e4066Sahrens return (EEXIST); 610*fa9e4066Sahrens } 611*fa9e4066Sahrens 612*fa9e4066Sahrens /* 613*fa9e4066Sahrens * Create an initialize the spa structure 614*fa9e4066Sahrens */ 615*fa9e4066Sahrens spa = spa_add(pool); 616*fa9e4066Sahrens spa_activate(spa); 617*fa9e4066Sahrens 618*fa9e4066Sahrens /* 619*fa9e4066Sahrens * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 620*fa9e4066Sahrens * so that we don't try to open the pool if the config is damaged. 621*fa9e4066Sahrens */ 622*fa9e4066Sahrens error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE); 623*fa9e4066Sahrens 624*fa9e4066Sahrens if (error) { 625*fa9e4066Sahrens spa_unload(spa); 626*fa9e4066Sahrens spa_deactivate(spa); 627*fa9e4066Sahrens spa_remove(spa); 628*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 629*fa9e4066Sahrens return (error); 630*fa9e4066Sahrens } 631*fa9e4066Sahrens 632*fa9e4066Sahrens /* 633*fa9e4066Sahrens * Set the alternate root, if there is one. 634*fa9e4066Sahrens */ 635*fa9e4066Sahrens if (altroot != NULL) { 636*fa9e4066Sahrens atomic_add_32(&spa_active_count, 1); 637*fa9e4066Sahrens spa->spa_root = spa_strdup(altroot); 638*fa9e4066Sahrens } 639*fa9e4066Sahrens 640*fa9e4066Sahrens /* 641*fa9e4066Sahrens * Initialize the config based on the in-core state. 642*fa9e4066Sahrens */ 643*fa9e4066Sahrens config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0); 644*fa9e4066Sahrens 645*fa9e4066Sahrens spa_config_set(spa, config); 646*fa9e4066Sahrens 647*fa9e4066Sahrens /* 648*fa9e4066Sahrens * Sync the configuration cache. 649*fa9e4066Sahrens */ 650*fa9e4066Sahrens spa_config_sync(); 651*fa9e4066Sahrens 652*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 653*fa9e4066Sahrens 654*fa9e4066Sahrens /* 655*fa9e4066Sahrens * Resilver anything that's out of date. 656*fa9e4066Sahrens */ 657*fa9e4066Sahrens if (spa_mode & FWRITE) 658*fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 659*fa9e4066Sahrens 660*fa9e4066Sahrens return (0); 661*fa9e4066Sahrens } 662*fa9e4066Sahrens 663*fa9e4066Sahrens /* 664*fa9e4066Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 665*fa9e4066Sahrens * to get the vdev stats associated with the imported devices. 666*fa9e4066Sahrens */ 667*fa9e4066Sahrens #define TRYIMPORT_NAME "$import" 668*fa9e4066Sahrens 669*fa9e4066Sahrens nvlist_t * 670*fa9e4066Sahrens spa_tryimport(nvlist_t *tryconfig) 671*fa9e4066Sahrens { 672*fa9e4066Sahrens nvlist_t *config = NULL; 673*fa9e4066Sahrens char *poolname; 674*fa9e4066Sahrens spa_t *spa; 675*fa9e4066Sahrens uint64_t state; 676*fa9e4066Sahrens 677*fa9e4066Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 678*fa9e4066Sahrens return (NULL); 679*fa9e4066Sahrens 680*fa9e4066Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 681*fa9e4066Sahrens return (NULL); 682*fa9e4066Sahrens 683*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 684*fa9e4066Sahrens spa = spa_add(TRYIMPORT_NAME); 685*fa9e4066Sahrens 686*fa9e4066Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 687*fa9e4066Sahrens 688*fa9e4066Sahrens /* 689*fa9e4066Sahrens * Initialize the spa_t structure. 690*fa9e4066Sahrens */ 691*fa9e4066Sahrens spa_activate(spa); 692*fa9e4066Sahrens 693*fa9e4066Sahrens /* 694*fa9e4066Sahrens * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 695*fa9e4066Sahrens * so we don't try to open the pool if the config is damaged. 696*fa9e4066Sahrens */ 697*fa9e4066Sahrens (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE); 698*fa9e4066Sahrens 699*fa9e4066Sahrens /* 700*fa9e4066Sahrens * If 'tryconfig' was at least parsable, return the current config. 701*fa9e4066Sahrens */ 702*fa9e4066Sahrens if (spa->spa_root_vdev != NULL) { 703*fa9e4066Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 704*fa9e4066Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 705*fa9e4066Sahrens poolname) == 0); 706*fa9e4066Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 707*fa9e4066Sahrens state) == 0); 708*fa9e4066Sahrens } 709*fa9e4066Sahrens 710*fa9e4066Sahrens spa_unload(spa); 711*fa9e4066Sahrens spa_deactivate(spa); 712*fa9e4066Sahrens spa_remove(spa); 713*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 714*fa9e4066Sahrens 715*fa9e4066Sahrens return (config); 716*fa9e4066Sahrens } 717*fa9e4066Sahrens 718*fa9e4066Sahrens /* 719*fa9e4066Sahrens * Pool export/destroy 720*fa9e4066Sahrens * 721*fa9e4066Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 722*fa9e4066Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 723*fa9e4066Sahrens * update the pool state and sync all the labels to disk, removing the 724*fa9e4066Sahrens * configuration from the cache afterwards. 725*fa9e4066Sahrens */ 726*fa9e4066Sahrens static int 727*fa9e4066Sahrens spa_export_common(char *pool, int new_state) 728*fa9e4066Sahrens { 729*fa9e4066Sahrens spa_t *spa; 730*fa9e4066Sahrens 731*fa9e4066Sahrens if (!(spa_mode & FWRITE)) 732*fa9e4066Sahrens return (EROFS); 733*fa9e4066Sahrens 734*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 735*fa9e4066Sahrens if ((spa = spa_lookup(pool)) == NULL) { 736*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 737*fa9e4066Sahrens return (ENOENT); 738*fa9e4066Sahrens } 739*fa9e4066Sahrens 740*fa9e4066Sahrens /* 741*fa9e4066Sahrens * The pool will be in core if it's openable, 742*fa9e4066Sahrens * in which case we can modify its state. 743*fa9e4066Sahrens */ 744*fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 745*fa9e4066Sahrens /* 746*fa9e4066Sahrens * Objsets may be open only because they're dirty, so we 747*fa9e4066Sahrens * have to force it to sync before checking spa_refcnt. 748*fa9e4066Sahrens */ 749*fa9e4066Sahrens spa_scrub_suspend(spa); 750*fa9e4066Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 751*fa9e4066Sahrens 752*fa9e4066Sahrens if (!spa_refcount_zero(spa)) { 753*fa9e4066Sahrens spa_scrub_resume(spa); 754*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 755*fa9e4066Sahrens return (EBUSY); 756*fa9e4066Sahrens } 757*fa9e4066Sahrens 758*fa9e4066Sahrens /* 759*fa9e4066Sahrens * Update the pool state. 760*fa9e4066Sahrens */ 761*fa9e4066Sahrens spa->spa_state = new_state; 762*fa9e4066Sahrens 763*fa9e4066Sahrens spa_scrub_resume(spa); 764*fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 765*fa9e4066Sahrens 766*fa9e4066Sahrens if (spa->spa_root != NULL) 767*fa9e4066Sahrens atomic_add_32(&spa_active_count, -1); 768*fa9e4066Sahrens 769*fa9e4066Sahrens /* 770*fa9e4066Sahrens * We want this to be reflected on every label, 771*fa9e4066Sahrens * so mark them all dirty. spa_unload() will do the 772*fa9e4066Sahrens * final sync that pushes these changes out. 773*fa9e4066Sahrens */ 774*fa9e4066Sahrens vdev_config_dirty(spa->spa_root_vdev); 775*fa9e4066Sahrens } 776*fa9e4066Sahrens 777*fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 778*fa9e4066Sahrens spa_unload(spa); 779*fa9e4066Sahrens spa_deactivate(spa); 780*fa9e4066Sahrens } 781*fa9e4066Sahrens 782*fa9e4066Sahrens spa_remove(spa); 783*fa9e4066Sahrens spa_config_sync(); 784*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 785*fa9e4066Sahrens 786*fa9e4066Sahrens return (0); 787*fa9e4066Sahrens } 788*fa9e4066Sahrens 789*fa9e4066Sahrens /* 790*fa9e4066Sahrens * Destroy a storage pool. 791*fa9e4066Sahrens */ 792*fa9e4066Sahrens int 793*fa9e4066Sahrens spa_destroy(char *pool) 794*fa9e4066Sahrens { 795*fa9e4066Sahrens return (spa_export_common(pool, POOL_STATE_DESTROYED)); 796*fa9e4066Sahrens } 797*fa9e4066Sahrens 798*fa9e4066Sahrens /* 799*fa9e4066Sahrens * Export a storage pool. 800*fa9e4066Sahrens */ 801*fa9e4066Sahrens int 802*fa9e4066Sahrens spa_export(char *pool) 803*fa9e4066Sahrens { 804*fa9e4066Sahrens return (spa_export_common(pool, POOL_STATE_EXPORTED)); 805*fa9e4066Sahrens } 806*fa9e4066Sahrens 807*fa9e4066Sahrens /* 808*fa9e4066Sahrens * ========================================================================== 809*fa9e4066Sahrens * Device manipulation 810*fa9e4066Sahrens * ========================================================================== 811*fa9e4066Sahrens */ 812*fa9e4066Sahrens 813*fa9e4066Sahrens /* 814*fa9e4066Sahrens * Add capacity to a storage pool. 815*fa9e4066Sahrens */ 816*fa9e4066Sahrens int 817*fa9e4066Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 818*fa9e4066Sahrens { 819*fa9e4066Sahrens uint64_t txg; 820*fa9e4066Sahrens int c, error; 821*fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 822*fa9e4066Sahrens vdev_t *vd; 823*fa9e4066Sahrens 824*fa9e4066Sahrens txg = spa_vdev_enter(spa); 825*fa9e4066Sahrens 826*fa9e4066Sahrens vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 827*fa9e4066Sahrens 828*fa9e4066Sahrens if (vd == NULL) 829*fa9e4066Sahrens return (spa_vdev_exit(spa, vd, txg, EINVAL)); 830*fa9e4066Sahrens 831*fa9e4066Sahrens if (rvd == NULL) /* spa_create() */ 832*fa9e4066Sahrens spa->spa_root_vdev = rvd = vd; 833*fa9e4066Sahrens 834*fa9e4066Sahrens if ((error = vdev_create(vd, txg)) != 0) 835*fa9e4066Sahrens return (spa_vdev_exit(spa, vd, txg, error)); 836*fa9e4066Sahrens 837*fa9e4066Sahrens /* 838*fa9e4066Sahrens * Transfer each top-level vdev from the temporary root 839*fa9e4066Sahrens * to the spa's root and initialize its metaslabs. 840*fa9e4066Sahrens */ 841*fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) { 842*fa9e4066Sahrens vdev_t *tvd = vd->vdev_child[c]; 843*fa9e4066Sahrens if (vd != rvd) { 844*fa9e4066Sahrens vdev_remove_child(vd, tvd); 845*fa9e4066Sahrens tvd->vdev_id = rvd->vdev_children; 846*fa9e4066Sahrens vdev_add_child(rvd, tvd); 847*fa9e4066Sahrens } 848*fa9e4066Sahrens vdev_init(tvd, txg); 849*fa9e4066Sahrens vdev_config_dirty(tvd); 850*fa9e4066Sahrens } 851*fa9e4066Sahrens 852*fa9e4066Sahrens /* 853*fa9e4066Sahrens * Update the config based on the new in-core state. 854*fa9e4066Sahrens */ 855*fa9e4066Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 856*fa9e4066Sahrens 857*fa9e4066Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 858*fa9e4066Sahrens } 859*fa9e4066Sahrens 860*fa9e4066Sahrens /* 861*fa9e4066Sahrens * Attach a device to a mirror. The arguments are the path to any device 862*fa9e4066Sahrens * in the mirror, and the nvroot for the new device. If the path specifies 863*fa9e4066Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 864*fa9e4066Sahrens * 865*fa9e4066Sahrens * If 'replacing' is specified, the new device is intended to replace the 866*fa9e4066Sahrens * existing device; in this case the two devices are made into their own 867*fa9e4066Sahrens * mirror using the 'replacing' vdev, which is functionally idendical to 868*fa9e4066Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 869*fa9e4066Sahrens * extra rules: you can't attach to it after it's been created, and upon 870*fa9e4066Sahrens * completion of resilvering, the first disk (the one being replaced) 871*fa9e4066Sahrens * is automatically detached. 872*fa9e4066Sahrens */ 873*fa9e4066Sahrens int 874*fa9e4066Sahrens spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) 875*fa9e4066Sahrens { 876*fa9e4066Sahrens uint64_t txg, open_txg; 877*fa9e4066Sahrens int error; 878*fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 879*fa9e4066Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 880*fa9e4066Sahrens vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; 881*fa9e4066Sahrens 882*fa9e4066Sahrens txg = spa_vdev_enter(spa); 883*fa9e4066Sahrens 884*fa9e4066Sahrens oldvd = vdev_lookup_by_path(rvd, path); 885*fa9e4066Sahrens 886*fa9e4066Sahrens if (oldvd == NULL) 887*fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 888*fa9e4066Sahrens 889*fa9e4066Sahrens pvd = oldvd->vdev_parent; 890*fa9e4066Sahrens 891*fa9e4066Sahrens /* 892*fa9e4066Sahrens * The parent must be a mirror or the root, unless we're replacing; 893*fa9e4066Sahrens * in that case, the parent can be anything but another replacing vdev. 894*fa9e4066Sahrens */ 895*fa9e4066Sahrens if (pvd->vdev_ops != &vdev_mirror_ops && 896*fa9e4066Sahrens pvd->vdev_ops != &vdev_root_ops && 897*fa9e4066Sahrens (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 898*fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 899*fa9e4066Sahrens 900*fa9e4066Sahrens newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 901*fa9e4066Sahrens 902*fa9e4066Sahrens if (newrootvd == NULL || newrootvd->vdev_children != 1) 903*fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 904*fa9e4066Sahrens 905*fa9e4066Sahrens newvd = newrootvd->vdev_child[0]; 906*fa9e4066Sahrens 907*fa9e4066Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 908*fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 909*fa9e4066Sahrens 910*fa9e4066Sahrens if ((error = vdev_create(newrootvd, txg)) != 0) 911*fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 912*fa9e4066Sahrens 913*fa9e4066Sahrens if (newvd->vdev_psize < oldvd->vdev_psize) 914*fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 915*fa9e4066Sahrens 916*fa9e4066Sahrens if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) 917*fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 918*fa9e4066Sahrens 919*fa9e4066Sahrens /* 920*fa9e4066Sahrens * If this is an in-place replacement, update oldvd's path and devid 921*fa9e4066Sahrens * to make it distinguishable from newvd, and unopenable from now on. 922*fa9e4066Sahrens */ 923*fa9e4066Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 924*fa9e4066Sahrens spa_strfree(oldvd->vdev_path); 925*fa9e4066Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 926*fa9e4066Sahrens KM_SLEEP); 927*fa9e4066Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 928*fa9e4066Sahrens newvd->vdev_path, "old"); 929*fa9e4066Sahrens if (oldvd->vdev_devid != NULL) { 930*fa9e4066Sahrens spa_strfree(oldvd->vdev_devid); 931*fa9e4066Sahrens oldvd->vdev_devid = NULL; 932*fa9e4066Sahrens } 933*fa9e4066Sahrens } 934*fa9e4066Sahrens 935*fa9e4066Sahrens /* 936*fa9e4066Sahrens * If the parent is not a mirror, or if we're replacing, 937*fa9e4066Sahrens * insert the new mirror/replacing vdev above oldvd. 938*fa9e4066Sahrens */ 939*fa9e4066Sahrens if (pvd->vdev_ops != pvops) 940*fa9e4066Sahrens pvd = vdev_add_parent(oldvd, pvops); 941*fa9e4066Sahrens 942*fa9e4066Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 943*fa9e4066Sahrens ASSERT(pvd->vdev_ops == pvops); 944*fa9e4066Sahrens ASSERT(oldvd->vdev_parent == pvd); 945*fa9e4066Sahrens 946*fa9e4066Sahrens /* 947*fa9e4066Sahrens * Extract the new device from its root and add it to pvd. 948*fa9e4066Sahrens */ 949*fa9e4066Sahrens vdev_remove_child(newrootvd, newvd); 950*fa9e4066Sahrens newvd->vdev_id = pvd->vdev_children; 951*fa9e4066Sahrens vdev_add_child(pvd, newvd); 952*fa9e4066Sahrens 953*fa9e4066Sahrens tvd = newvd->vdev_top; 954*fa9e4066Sahrens ASSERT(pvd->vdev_top == tvd); 955*fa9e4066Sahrens ASSERT(tvd->vdev_parent == rvd); 956*fa9e4066Sahrens 957*fa9e4066Sahrens /* 958*fa9e4066Sahrens * Update the config based on the new in-core state. 959*fa9e4066Sahrens */ 960*fa9e4066Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 961*fa9e4066Sahrens 962*fa9e4066Sahrens vdev_config_dirty(tvd); 963*fa9e4066Sahrens 964*fa9e4066Sahrens /* 965*fa9e4066Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 966*fa9e4066Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 967*fa9e4066Sahrens */ 968*fa9e4066Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 969*fa9e4066Sahrens 970*fa9e4066Sahrens mutex_enter(&newvd->vdev_dtl_lock); 971*fa9e4066Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 972*fa9e4066Sahrens open_txg - TXG_INITIAL + 1); 973*fa9e4066Sahrens mutex_exit(&newvd->vdev_dtl_lock); 974*fa9e4066Sahrens 975*fa9e4066Sahrens /* 976*fa9e4066Sahrens * Mark newvd's DTL dirty in this txg. 977*fa9e4066Sahrens */ 978*fa9e4066Sahrens vdev_dirty(tvd, VDD_DTL, txg); 979*fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); 980*fa9e4066Sahrens 981*fa9e4066Sahrens dprintf("attached %s, replacing=%d\n", path, replacing); 982*fa9e4066Sahrens 983*fa9e4066Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 984*fa9e4066Sahrens 985*fa9e4066Sahrens /* 986*fa9e4066Sahrens * Kick off a resilver to update newvd. 987*fa9e4066Sahrens */ 988*fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 989*fa9e4066Sahrens 990*fa9e4066Sahrens return (0); 991*fa9e4066Sahrens } 992*fa9e4066Sahrens 993*fa9e4066Sahrens /* 994*fa9e4066Sahrens * Detach a device from a mirror or replacing vdev. 995*fa9e4066Sahrens * If 'replace_done' is specified, only detach if the parent 996*fa9e4066Sahrens * is a replacing vdev. 997*fa9e4066Sahrens */ 998*fa9e4066Sahrens int 999*fa9e4066Sahrens spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) 1000*fa9e4066Sahrens { 1001*fa9e4066Sahrens uint64_t txg; 1002*fa9e4066Sahrens int c, t, error; 1003*fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1004*fa9e4066Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 1005*fa9e4066Sahrens 1006*fa9e4066Sahrens txg = spa_vdev_enter(spa); 1007*fa9e4066Sahrens 1008*fa9e4066Sahrens vd = vdev_lookup_by_path(rvd, path); 1009*fa9e4066Sahrens 1010*fa9e4066Sahrens if (vd == NULL) 1011*fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1012*fa9e4066Sahrens 1013*fa9e4066Sahrens if (guid != 0 && vd->vdev_guid != guid) 1014*fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1015*fa9e4066Sahrens 1016*fa9e4066Sahrens pvd = vd->vdev_parent; 1017*fa9e4066Sahrens 1018*fa9e4066Sahrens /* 1019*fa9e4066Sahrens * If replace_done is specified, only remove this device if it's 1020*fa9e4066Sahrens * the first child of a replacing vdev. 1021*fa9e4066Sahrens */ 1022*fa9e4066Sahrens if (replace_done && 1023*fa9e4066Sahrens (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1024*fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1025*fa9e4066Sahrens 1026*fa9e4066Sahrens /* 1027*fa9e4066Sahrens * Only mirror and replacing vdevs support detach. 1028*fa9e4066Sahrens */ 1029*fa9e4066Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 1030*fa9e4066Sahrens pvd->vdev_ops != &vdev_mirror_ops) 1031*fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1032*fa9e4066Sahrens 1033*fa9e4066Sahrens /* 1034*fa9e4066Sahrens * If there's only one replica, you can't detach it. 1035*fa9e4066Sahrens */ 1036*fa9e4066Sahrens if (pvd->vdev_children <= 1) 1037*fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1038*fa9e4066Sahrens 1039*fa9e4066Sahrens /* 1040*fa9e4066Sahrens * If all siblings have non-empty DTLs, this device may have the only 1041*fa9e4066Sahrens * valid copy of the data, which means we cannot safely detach it. 1042*fa9e4066Sahrens * 1043*fa9e4066Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1044*fa9e4066Sahrens * precise DTL check. 1045*fa9e4066Sahrens */ 1046*fa9e4066Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1047*fa9e4066Sahrens uint64_t dirty; 1048*fa9e4066Sahrens 1049*fa9e4066Sahrens cvd = pvd->vdev_child[c]; 1050*fa9e4066Sahrens if (cvd == vd) 1051*fa9e4066Sahrens continue; 1052*fa9e4066Sahrens if (vdev_is_dead(cvd)) 1053*fa9e4066Sahrens continue; 1054*fa9e4066Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1055*fa9e4066Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1056*fa9e4066Sahrens cvd->vdev_dtl_scrub.sm_space; 1057*fa9e4066Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1058*fa9e4066Sahrens if (!dirty) 1059*fa9e4066Sahrens break; 1060*fa9e4066Sahrens } 1061*fa9e4066Sahrens if (c == pvd->vdev_children) 1062*fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1063*fa9e4066Sahrens 1064*fa9e4066Sahrens /* 1065*fa9e4066Sahrens * Erase the disk labels so the disk can be used for other things. 1066*fa9e4066Sahrens * This must be done after all other error cases are handled, 1067*fa9e4066Sahrens * but before we disembowel vd (so we can still do I/O to it). 1068*fa9e4066Sahrens * But if we can't do it, don't treat the error as fatal -- 1069*fa9e4066Sahrens * it may be that the unwritability of the disk is the reason 1070*fa9e4066Sahrens * it's being detached! 1071*fa9e4066Sahrens */ 1072*fa9e4066Sahrens error = vdev_label_init(vd, 0); 1073*fa9e4066Sahrens if (error) 1074*fa9e4066Sahrens dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1075*fa9e4066Sahrens 1076*fa9e4066Sahrens /* 1077*fa9e4066Sahrens * Remove vd from its parent and compact the parent's children. 1078*fa9e4066Sahrens */ 1079*fa9e4066Sahrens vdev_remove_child(pvd, vd); 1080*fa9e4066Sahrens vdev_compact_children(pvd); 1081*fa9e4066Sahrens 1082*fa9e4066Sahrens /* 1083*fa9e4066Sahrens * Remember one of the remaining children so we can get tvd below. 1084*fa9e4066Sahrens */ 1085*fa9e4066Sahrens cvd = pvd->vdev_child[0]; 1086*fa9e4066Sahrens 1087*fa9e4066Sahrens /* 1088*fa9e4066Sahrens * If the parent mirror/replacing vdev only has one child, 1089*fa9e4066Sahrens * the parent is no longer needed. Remove it from the tree. 1090*fa9e4066Sahrens */ 1091*fa9e4066Sahrens if (pvd->vdev_children == 1) 1092*fa9e4066Sahrens vdev_remove_parent(cvd); 1093*fa9e4066Sahrens 1094*fa9e4066Sahrens /* 1095*fa9e4066Sahrens * We don't set tvd until now because the parent we just removed 1096*fa9e4066Sahrens * may have been the previous top-level vdev. 1097*fa9e4066Sahrens */ 1098*fa9e4066Sahrens tvd = cvd->vdev_top; 1099*fa9e4066Sahrens ASSERT(tvd->vdev_parent == rvd); 1100*fa9e4066Sahrens 1101*fa9e4066Sahrens /* 1102*fa9e4066Sahrens * Reopen this top-level vdev to reassess health after detach. 1103*fa9e4066Sahrens */ 1104*fa9e4066Sahrens vdev_reopen(tvd, NULL); 1105*fa9e4066Sahrens 1106*fa9e4066Sahrens /* 1107*fa9e4066Sahrens * If the device we just detached was smaller than the others, 1108*fa9e4066Sahrens * it may be possible to add metaslabs (i.e. grow the pool). 1109*fa9e4066Sahrens */ 1110*fa9e4066Sahrens vdev_metaslab_init(tvd, txg); 1111*fa9e4066Sahrens 1112*fa9e4066Sahrens /* 1113*fa9e4066Sahrens * Update the config based on the new in-core state. 1114*fa9e4066Sahrens */ 1115*fa9e4066Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1116*fa9e4066Sahrens 1117*fa9e4066Sahrens vdev_config_dirty(tvd); 1118*fa9e4066Sahrens 1119*fa9e4066Sahrens /* 1120*fa9e4066Sahrens * Mark vd's DTL as dirty in this txg. 1121*fa9e4066Sahrens * vdev_dtl_sync() will see that vd->vdev_detached is set 1122*fa9e4066Sahrens * and free vd's DTL object in syncing context. 1123*fa9e4066Sahrens * But first make sure we're not on any *other* txg's DTL list, 1124*fa9e4066Sahrens * to prevent vd from being accessed after it's freed. 1125*fa9e4066Sahrens */ 1126*fa9e4066Sahrens vdev_dirty(tvd, VDD_DTL, txg); 1127*fa9e4066Sahrens vd->vdev_detached = B_TRUE; 1128*fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 1129*fa9e4066Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1130*fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); 1131*fa9e4066Sahrens 1132*fa9e4066Sahrens dprintf("detached %s\n", path); 1133*fa9e4066Sahrens 1134*fa9e4066Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 1135*fa9e4066Sahrens } 1136*fa9e4066Sahrens 1137*fa9e4066Sahrens /* 1138*fa9e4066Sahrens * If there are any replacing vdevs that have finished replacing, detach them. 1139*fa9e4066Sahrens * We can't hold the config lock across detaches, so we lock the config, 1140*fa9e4066Sahrens * build a list of candidates, unlock the config, and try each candidate. 1141*fa9e4066Sahrens */ 1142*fa9e4066Sahrens typedef struct vdev_detach_link { 1143*fa9e4066Sahrens char *vdl_path; 1144*fa9e4066Sahrens uint64_t vdl_guid; 1145*fa9e4066Sahrens list_node_t vdl_node; 1146*fa9e4066Sahrens } vdev_detach_link_t; 1147*fa9e4066Sahrens 1148*fa9e4066Sahrens static void 1149*fa9e4066Sahrens spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd) 1150*fa9e4066Sahrens { 1151*fa9e4066Sahrens int c; 1152*fa9e4066Sahrens 1153*fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 1154*fa9e4066Sahrens spa_vdev_replace_done_make_list(l, vd->vdev_child[c]); 1155*fa9e4066Sahrens 1156*fa9e4066Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1157*fa9e4066Sahrens vdev_t *cvd0 = vd->vdev_child[0]; 1158*fa9e4066Sahrens vdev_t *cvd1 = vd->vdev_child[1]; 1159*fa9e4066Sahrens vdev_detach_link_t *vdl; 1160*fa9e4066Sahrens int dirty1; 1161*fa9e4066Sahrens 1162*fa9e4066Sahrens mutex_enter(&cvd1->vdev_dtl_lock); 1163*fa9e4066Sahrens dirty1 = cvd1->vdev_dtl_map.sm_space | 1164*fa9e4066Sahrens cvd1->vdev_dtl_scrub.sm_space; 1165*fa9e4066Sahrens mutex_exit(&cvd1->vdev_dtl_lock); 1166*fa9e4066Sahrens 1167*fa9e4066Sahrens if (!dirty1) { 1168*fa9e4066Sahrens vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP); 1169*fa9e4066Sahrens vdl->vdl_path = spa_strdup(cvd0->vdev_path); 1170*fa9e4066Sahrens vdl->vdl_guid = cvd0->vdev_guid; 1171*fa9e4066Sahrens list_insert_tail(l, vdl); 1172*fa9e4066Sahrens } 1173*fa9e4066Sahrens } 1174*fa9e4066Sahrens } 1175*fa9e4066Sahrens 1176*fa9e4066Sahrens void 1177*fa9e4066Sahrens spa_vdev_replace_done(spa_t *spa) 1178*fa9e4066Sahrens { 1179*fa9e4066Sahrens vdev_detach_link_t *vdl; 1180*fa9e4066Sahrens list_t vdlist; 1181*fa9e4066Sahrens 1182*fa9e4066Sahrens list_create(&vdlist, sizeof (vdev_detach_link_t), 1183*fa9e4066Sahrens offsetof(vdev_detach_link_t, vdl_node)); 1184*fa9e4066Sahrens 1185*fa9e4066Sahrens spa_config_enter(spa, RW_READER); 1186*fa9e4066Sahrens spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev); 1187*fa9e4066Sahrens spa_config_exit(spa); 1188*fa9e4066Sahrens 1189*fa9e4066Sahrens while ((vdl = list_head(&vdlist)) != NULL) { 1190*fa9e4066Sahrens list_remove(&vdlist, vdl); 1191*fa9e4066Sahrens (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid, 1192*fa9e4066Sahrens B_TRUE); 1193*fa9e4066Sahrens spa_strfree(vdl->vdl_path); 1194*fa9e4066Sahrens kmem_free(vdl, sizeof (*vdl)); 1195*fa9e4066Sahrens } 1196*fa9e4066Sahrens 1197*fa9e4066Sahrens list_destroy(&vdlist); 1198*fa9e4066Sahrens } 1199*fa9e4066Sahrens 1200*fa9e4066Sahrens /* 1201*fa9e4066Sahrens * ========================================================================== 1202*fa9e4066Sahrens * SPA Scrubbing 1203*fa9e4066Sahrens * ========================================================================== 1204*fa9e4066Sahrens */ 1205*fa9e4066Sahrens 1206*fa9e4066Sahrens static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t); 1207*fa9e4066Sahrens 1208*fa9e4066Sahrens static void 1209*fa9e4066Sahrens spa_scrub_io_done(zio_t *zio) 1210*fa9e4066Sahrens { 1211*fa9e4066Sahrens spa_t *spa = zio->io_spa; 1212*fa9e4066Sahrens 1213*fa9e4066Sahrens zio_buf_free(zio->io_data, zio->io_size); 1214*fa9e4066Sahrens 1215*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1216*fa9e4066Sahrens if (zio->io_error) 1217*fa9e4066Sahrens spa->spa_scrub_errors++; 1218*fa9e4066Sahrens if (--spa->spa_scrub_inflight == 0) 1219*fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_io_cv); 1220*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1221*fa9e4066Sahrens 1222*fa9e4066Sahrens if (zio->io_error) { 1223*fa9e4066Sahrens vdev_t *vd = zio->io_vd; 1224*fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1225*fa9e4066Sahrens vd->vdev_stat.vs_scrub_errors++; 1226*fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1227*fa9e4066Sahrens } 1228*fa9e4066Sahrens } 1229*fa9e4066Sahrens 1230*fa9e4066Sahrens static void 1231*fa9e4066Sahrens spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags) 1232*fa9e4066Sahrens { 1233*fa9e4066Sahrens size_t size = BP_GET_LSIZE(bp); 1234*fa9e4066Sahrens void *data = zio_buf_alloc(size); 1235*fa9e4066Sahrens 1236*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1237*fa9e4066Sahrens spa->spa_scrub_inflight++; 1238*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1239*fa9e4066Sahrens 1240*fa9e4066Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 1241*fa9e4066Sahrens spa_scrub_io_done, NULL, priority, flags)); 1242*fa9e4066Sahrens } 1243*fa9e4066Sahrens 1244*fa9e4066Sahrens /* ARGSUSED */ 1245*fa9e4066Sahrens static int 1246*fa9e4066Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1247*fa9e4066Sahrens { 1248*fa9e4066Sahrens blkptr_t *bp = &bc->bc_blkptr; 1249*fa9e4066Sahrens vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); 1250*fa9e4066Sahrens 1251*fa9e4066Sahrens if (bc->bc_errno || vd == NULL) { 1252*fa9e4066Sahrens /* 1253*fa9e4066Sahrens * We can't scrub this block, but we can continue to scrub 1254*fa9e4066Sahrens * the rest of the pool. Note the error and move along. 1255*fa9e4066Sahrens */ 1256*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1257*fa9e4066Sahrens spa->spa_scrub_errors++; 1258*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1259*fa9e4066Sahrens 1260*fa9e4066Sahrens if (vd != NULL) { 1261*fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1262*fa9e4066Sahrens vd->vdev_stat.vs_scrub_errors++; 1263*fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1264*fa9e4066Sahrens } 1265*fa9e4066Sahrens 1266*fa9e4066Sahrens return (ERESTART); 1267*fa9e4066Sahrens } 1268*fa9e4066Sahrens 1269*fa9e4066Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1270*fa9e4066Sahrens 1271*fa9e4066Sahrens /* 1272*fa9e4066Sahrens * Keep track of how much data we've examined so that 1273*fa9e4066Sahrens * zpool(1M) status can make useful progress reports. 1274*fa9e4066Sahrens */ 1275*fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1276*fa9e4066Sahrens vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); 1277*fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1278*fa9e4066Sahrens 1279*fa9e4066Sahrens if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1280*fa9e4066Sahrens if (DVA_GET_GANG(&bp->blk_dva[0])) { 1281*fa9e4066Sahrens /* 1282*fa9e4066Sahrens * Gang members may be spread across multiple vdevs, 1283*fa9e4066Sahrens * so the best we can do is look at the pool-wide DTL. 1284*fa9e4066Sahrens * XXX -- it would be better to change our allocation 1285*fa9e4066Sahrens * policy to ensure that this can't happen. 1286*fa9e4066Sahrens */ 1287*fa9e4066Sahrens vd = spa->spa_root_vdev; 1288*fa9e4066Sahrens } 1289*fa9e4066Sahrens if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { 1290*fa9e4066Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1291*fa9e4066Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | 1292*fa9e4066Sahrens ZIO_FLAG_RESILVER); 1293*fa9e4066Sahrens } 1294*fa9e4066Sahrens } else { 1295*fa9e4066Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 1296*fa9e4066Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB); 1297*fa9e4066Sahrens } 1298*fa9e4066Sahrens 1299*fa9e4066Sahrens return (0); 1300*fa9e4066Sahrens } 1301*fa9e4066Sahrens 1302*fa9e4066Sahrens static void 1303*fa9e4066Sahrens spa_scrub_thread(spa_t *spa) 1304*fa9e4066Sahrens { 1305*fa9e4066Sahrens callb_cpr_t cprinfo; 1306*fa9e4066Sahrens traverse_handle_t *th = spa->spa_scrub_th; 1307*fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1308*fa9e4066Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1309*fa9e4066Sahrens int error = 0; 1310*fa9e4066Sahrens boolean_t complete; 1311*fa9e4066Sahrens 1312*fa9e4066Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1313*fa9e4066Sahrens 1314*fa9e4066Sahrens spa_config_enter(spa, RW_WRITER); 1315*fa9e4066Sahrens vdev_reopen(rvd, NULL); /* purge all vdev caches */ 1316*fa9e4066Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 1317*fa9e4066Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1318*fa9e4066Sahrens spa_config_exit(spa); 1319*fa9e4066Sahrens 1320*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1321*fa9e4066Sahrens spa->spa_scrub_errors = 0; 1322*fa9e4066Sahrens spa->spa_scrub_active = 1; 1323*fa9e4066Sahrens 1324*fa9e4066Sahrens while (!spa->spa_scrub_stop) { 1325*fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 1326*fa9e4066Sahrens while (spa->spa_scrub_suspend) { 1327*fa9e4066Sahrens spa->spa_scrub_active = 0; 1328*fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 1329*fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1330*fa9e4066Sahrens spa->spa_scrub_active = 1; 1331*fa9e4066Sahrens } 1332*fa9e4066Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1333*fa9e4066Sahrens 1334*fa9e4066Sahrens if (spa->spa_scrub_restart_txg != 0) 1335*fa9e4066Sahrens break; 1336*fa9e4066Sahrens 1337*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1338*fa9e4066Sahrens error = traverse_more(th); 1339*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1340*fa9e4066Sahrens if (error != EAGAIN) 1341*fa9e4066Sahrens break; 1342*fa9e4066Sahrens } 1343*fa9e4066Sahrens 1344*fa9e4066Sahrens while (spa->spa_scrub_inflight) 1345*fa9e4066Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1346*fa9e4066Sahrens 1347*fa9e4066Sahrens if (spa->spa_scrub_restart_txg != 0) 1348*fa9e4066Sahrens error = ERESTART; 1349*fa9e4066Sahrens 1350*fa9e4066Sahrens spa->spa_scrub_active = 0; 1351*fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 1352*fa9e4066Sahrens 1353*fa9e4066Sahrens /* 1354*fa9e4066Sahrens * If the traverse completed, and there were no errors, 1355*fa9e4066Sahrens * then the scrub was completely successful. 1356*fa9e4066Sahrens */ 1357*fa9e4066Sahrens complete = (error == 0 && spa->spa_scrub_errors == 0); 1358*fa9e4066Sahrens 1359*fa9e4066Sahrens dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1360*fa9e4066Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1361*fa9e4066Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1362*fa9e4066Sahrens 1363*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1364*fa9e4066Sahrens 1365*fa9e4066Sahrens /* 1366*fa9e4066Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 1367*fa9e4066Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 1368*fa9e4066Sahrens */ 1369*fa9e4066Sahrens spa_config_enter(spa, RW_WRITER); 1370*fa9e4066Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1371*fa9e4066Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1372*fa9e4066Sahrens spa_config_exit(spa); 1373*fa9e4066Sahrens 1374*fa9e4066Sahrens spa_vdev_replace_done(spa); 1375*fa9e4066Sahrens 1376*fa9e4066Sahrens spa_config_enter(spa, RW_READER); 1377*fa9e4066Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1378*fa9e4066Sahrens spa_config_exit(spa); 1379*fa9e4066Sahrens 1380*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1381*fa9e4066Sahrens 1382*fa9e4066Sahrens spa->spa_scrub_type = POOL_SCRUB_NONE; 1383*fa9e4066Sahrens spa->spa_scrub_active = 0; 1384*fa9e4066Sahrens spa->spa_scrub_thread = NULL; 1385*fa9e4066Sahrens 1386*fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 1387*fa9e4066Sahrens 1388*fa9e4066Sahrens /* 1389*fa9e4066Sahrens * If we were told to restart, our final act is to start a new scrub. 1390*fa9e4066Sahrens */ 1391*fa9e4066Sahrens if (error == ERESTART) 1392*fa9e4066Sahrens VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0); 1393*fa9e4066Sahrens 1394*fa9e4066Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1395*fa9e4066Sahrens thread_exit(); 1396*fa9e4066Sahrens } 1397*fa9e4066Sahrens 1398*fa9e4066Sahrens void 1399*fa9e4066Sahrens spa_scrub_suspend(spa_t *spa) 1400*fa9e4066Sahrens { 1401*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1402*fa9e4066Sahrens spa->spa_scrub_suspend++; 1403*fa9e4066Sahrens while (spa->spa_scrub_active) { 1404*fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 1405*fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1406*fa9e4066Sahrens } 1407*fa9e4066Sahrens while (spa->spa_scrub_inflight) 1408*fa9e4066Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1409*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1410*fa9e4066Sahrens } 1411*fa9e4066Sahrens 1412*fa9e4066Sahrens void 1413*fa9e4066Sahrens spa_scrub_resume(spa_t *spa) 1414*fa9e4066Sahrens { 1415*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1416*fa9e4066Sahrens ASSERT(spa->spa_scrub_suspend != 0); 1417*fa9e4066Sahrens if (--spa->spa_scrub_suspend == 0) 1418*fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 1419*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1420*fa9e4066Sahrens } 1421*fa9e4066Sahrens 1422*fa9e4066Sahrens void 1423*fa9e4066Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 1424*fa9e4066Sahrens { 1425*fa9e4066Sahrens /* 1426*fa9e4066Sahrens * Something happened (e.g. snapshot create/delete) that means 1427*fa9e4066Sahrens * we must restart any in-progress scrubs. The itinerary will 1428*fa9e4066Sahrens * fix this properly. 1429*fa9e4066Sahrens */ 1430*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1431*fa9e4066Sahrens spa->spa_scrub_restart_txg = txg; 1432*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1433*fa9e4066Sahrens } 1434*fa9e4066Sahrens 1435*fa9e4066Sahrens static int 1436*fa9e4066Sahrens spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1437*fa9e4066Sahrens { 1438*fa9e4066Sahrens space_seg_t *ss; 1439*fa9e4066Sahrens uint64_t mintxg, maxtxg; 1440*fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1441*fa9e4066Sahrens int advance = 0; 1442*fa9e4066Sahrens 1443*fa9e4066Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 1444*fa9e4066Sahrens return (ENOTSUP); 1445*fa9e4066Sahrens 1446*fa9e4066Sahrens /* 1447*fa9e4066Sahrens * If there's a scrub or resilver already in progress, stop it. 1448*fa9e4066Sahrens */ 1449*fa9e4066Sahrens while (spa->spa_scrub_thread != NULL) { 1450*fa9e4066Sahrens /* 1451*fa9e4066Sahrens * Don't stop a resilver unless forced. 1452*fa9e4066Sahrens */ 1453*fa9e4066Sahrens if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) 1454*fa9e4066Sahrens return (EBUSY); 1455*fa9e4066Sahrens 1456*fa9e4066Sahrens spa->spa_scrub_stop = 1; 1457*fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 1458*fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1459*fa9e4066Sahrens } 1460*fa9e4066Sahrens 1461*fa9e4066Sahrens /* 1462*fa9e4066Sahrens * Terminate the previous traverse. 1463*fa9e4066Sahrens */ 1464*fa9e4066Sahrens if (spa->spa_scrub_th != NULL) { 1465*fa9e4066Sahrens traverse_fini(spa->spa_scrub_th); 1466*fa9e4066Sahrens spa->spa_scrub_th = NULL; 1467*fa9e4066Sahrens } 1468*fa9e4066Sahrens 1469*fa9e4066Sahrens spa->spa_scrub_stop = 0; 1470*fa9e4066Sahrens spa->spa_scrub_type = type; 1471*fa9e4066Sahrens spa->spa_scrub_restart_txg = 0; 1472*fa9e4066Sahrens 1473*fa9e4066Sahrens mintxg = TXG_INITIAL - 1; 1474*fa9e4066Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 1475*fa9e4066Sahrens 1476*fa9e4066Sahrens switch (type) { 1477*fa9e4066Sahrens 1478*fa9e4066Sahrens case POOL_SCRUB_NONE: 1479*fa9e4066Sahrens break; 1480*fa9e4066Sahrens 1481*fa9e4066Sahrens case POOL_SCRUB_RESILVER: 1482*fa9e4066Sahrens /* 1483*fa9e4066Sahrens * Determine the resilvering boundaries. 1484*fa9e4066Sahrens * 1485*fa9e4066Sahrens * Note: (mintxg, maxtxg) is an open interval, 1486*fa9e4066Sahrens * i.e. mintxg and maxtxg themselves are not included. 1487*fa9e4066Sahrens * 1488*fa9e4066Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1489*fa9e4066Sahrens * so we don't claim to resilver a txg that's still changing. 1490*fa9e4066Sahrens */ 1491*fa9e4066Sahrens mutex_enter(&rvd->vdev_dtl_lock); 1492*fa9e4066Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 1493*fa9e4066Sahrens mintxg = ss ? ss->ss_start - 1 : 0; 1494*fa9e4066Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 1495*fa9e4066Sahrens maxtxg = ss ? ss->ss_end : 0; 1496*fa9e4066Sahrens maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1); 1497*fa9e4066Sahrens mutex_exit(&rvd->vdev_dtl_lock); 1498*fa9e4066Sahrens 1499*fa9e4066Sahrens advance = ADVANCE_PRE | ADVANCE_PRUNE; 1500*fa9e4066Sahrens break; 1501*fa9e4066Sahrens 1502*fa9e4066Sahrens case POOL_SCRUB_EVERYTHING: 1503*fa9e4066Sahrens /* 1504*fa9e4066Sahrens * A scrub is like a resilver, but not pruned by DTL. 1505*fa9e4066Sahrens */ 1506*fa9e4066Sahrens advance = ADVANCE_PRE; 1507*fa9e4066Sahrens break; 1508*fa9e4066Sahrens } 1509*fa9e4066Sahrens 1510*fa9e4066Sahrens if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) { 1511*fa9e4066Sahrens spa->spa_scrub_maxtxg = maxtxg; 1512*fa9e4066Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1513*fa9e4066Sahrens advance, ZIO_FLAG_CANFAIL); 1514*fa9e4066Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1515*fa9e4066Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 1516*fa9e4066Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1517*fa9e4066Sahrens } 1518*fa9e4066Sahrens 1519*fa9e4066Sahrens return (0); 1520*fa9e4066Sahrens } 1521*fa9e4066Sahrens 1522*fa9e4066Sahrens int 1523*fa9e4066Sahrens spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1524*fa9e4066Sahrens { 1525*fa9e4066Sahrens int error; 1526*fa9e4066Sahrens traverse_handle_t *th; 1527*fa9e4066Sahrens 1528*fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 1529*fa9e4066Sahrens error = spa_scrub_locked(spa, type, force); 1530*fa9e4066Sahrens th = spa->spa_scrub_th; 1531*fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 1532*fa9e4066Sahrens 1533*fa9e4066Sahrens if (th == NULL && type != POOL_SCRUB_NONE) 1534*fa9e4066Sahrens spa_vdev_replace_done(spa); 1535*fa9e4066Sahrens 1536*fa9e4066Sahrens return (error); 1537*fa9e4066Sahrens } 1538*fa9e4066Sahrens 1539*fa9e4066Sahrens /* 1540*fa9e4066Sahrens * ========================================================================== 1541*fa9e4066Sahrens * SPA syncing routines 1542*fa9e4066Sahrens * ========================================================================== 1543*fa9e4066Sahrens */ 1544*fa9e4066Sahrens 1545*fa9e4066Sahrens static void 1546*fa9e4066Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 1547*fa9e4066Sahrens { 1548*fa9e4066Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 1549*fa9e4066Sahrens dmu_tx_t *tx; 1550*fa9e4066Sahrens blkptr_t blk; 1551*fa9e4066Sahrens uint64_t itor = 0; 1552*fa9e4066Sahrens zio_t *zio; 1553*fa9e4066Sahrens int error; 1554*fa9e4066Sahrens uint8_t c = 1; 1555*fa9e4066Sahrens 1556*fa9e4066Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 1557*fa9e4066Sahrens 1558*fa9e4066Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 1559*fa9e4066Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 1560*fa9e4066Sahrens 1561*fa9e4066Sahrens error = zio_wait(zio); 1562*fa9e4066Sahrens ASSERT3U(error, ==, 0); 1563*fa9e4066Sahrens 1564*fa9e4066Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1565*fa9e4066Sahrens bplist_vacate(bpl, tx); 1566*fa9e4066Sahrens 1567*fa9e4066Sahrens /* 1568*fa9e4066Sahrens * Pre-dirty the first block so we sync to convergence faster. 1569*fa9e4066Sahrens * (Usually only the first block is needed.) 1570*fa9e4066Sahrens */ 1571*fa9e4066Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 1572*fa9e4066Sahrens dmu_tx_commit(tx); 1573*fa9e4066Sahrens } 1574*fa9e4066Sahrens 1575*fa9e4066Sahrens static void 1576*fa9e4066Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 1577*fa9e4066Sahrens { 1578*fa9e4066Sahrens nvlist_t *config; 1579*fa9e4066Sahrens char *packed = NULL; 1580*fa9e4066Sahrens size_t nvsize = 0; 1581*fa9e4066Sahrens dmu_buf_t *db; 1582*fa9e4066Sahrens 1583*fa9e4066Sahrens if (list_is_empty(&spa->spa_dirty_list)) 1584*fa9e4066Sahrens return; 1585*fa9e4066Sahrens 1586*fa9e4066Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 1587*fa9e4066Sahrens 1588*fa9e4066Sahrens spa_config_set(spa, config); 1589*fa9e4066Sahrens 1590*fa9e4066Sahrens VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 1591*fa9e4066Sahrens 1592*fa9e4066Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 1593*fa9e4066Sahrens 1594*fa9e4066Sahrens VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0); 1595*fa9e4066Sahrens 1596*fa9e4066Sahrens dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 1597*fa9e4066Sahrens packed, tx); 1598*fa9e4066Sahrens 1599*fa9e4066Sahrens kmem_free(packed, nvsize); 1600*fa9e4066Sahrens 1601*fa9e4066Sahrens db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object); 1602*fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 1603*fa9e4066Sahrens *(uint64_t *)db->db_data = nvsize; 1604*fa9e4066Sahrens dmu_buf_rele(db); 1605*fa9e4066Sahrens } 1606*fa9e4066Sahrens 1607*fa9e4066Sahrens /* 1608*fa9e4066Sahrens * Sync the specified transaction group. New blocks may be dirtied as 1609*fa9e4066Sahrens * part of the process, so we iterate until it converges. 1610*fa9e4066Sahrens */ 1611*fa9e4066Sahrens void 1612*fa9e4066Sahrens spa_sync(spa_t *spa, uint64_t txg) 1613*fa9e4066Sahrens { 1614*fa9e4066Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 1615*fa9e4066Sahrens objset_t *mos = spa->spa_meta_objset; 1616*fa9e4066Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 1617*fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1618*fa9e4066Sahrens vdev_t *vd; 1619*fa9e4066Sahrens dmu_tx_t *tx; 1620*fa9e4066Sahrens int dirty_vdevs; 1621*fa9e4066Sahrens 1622*fa9e4066Sahrens /* 1623*fa9e4066Sahrens * Lock out configuration changes. 1624*fa9e4066Sahrens */ 1625*fa9e4066Sahrens spa_config_enter(spa, RW_READER); 1626*fa9e4066Sahrens 1627*fa9e4066Sahrens spa->spa_syncing_txg = txg; 1628*fa9e4066Sahrens spa->spa_sync_pass = 0; 1629*fa9e4066Sahrens 1630*fa9e4066Sahrens bplist_open(bpl, mos, spa->spa_sync_bplist_obj); 1631*fa9e4066Sahrens 1632*fa9e4066Sahrens /* 1633*fa9e4066Sahrens * If anything has changed in this txg, push the deferred frees 1634*fa9e4066Sahrens * from the previous txg. If not, leave them alone so that we 1635*fa9e4066Sahrens * don't generate work on an otherwise idle system. 1636*fa9e4066Sahrens */ 1637*fa9e4066Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 1638*fa9e4066Sahrens !txg_list_empty(&dp->dp_dirty_dirs, txg)) 1639*fa9e4066Sahrens spa_sync_deferred_frees(spa, txg); 1640*fa9e4066Sahrens 1641*fa9e4066Sahrens /* 1642*fa9e4066Sahrens * Iterate to convergence. 1643*fa9e4066Sahrens */ 1644*fa9e4066Sahrens do { 1645*fa9e4066Sahrens spa->spa_sync_pass++; 1646*fa9e4066Sahrens 1647*fa9e4066Sahrens tx = dmu_tx_create_assigned(dp, txg); 1648*fa9e4066Sahrens spa_sync_config_object(spa, tx); 1649*fa9e4066Sahrens dmu_tx_commit(tx); 1650*fa9e4066Sahrens 1651*fa9e4066Sahrens dsl_pool_sync(dp, txg); 1652*fa9e4066Sahrens 1653*fa9e4066Sahrens dirty_vdevs = 0; 1654*fa9e4066Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 1655*fa9e4066Sahrens vdev_sync(vd, txg); 1656*fa9e4066Sahrens dirty_vdevs++; 1657*fa9e4066Sahrens } 1658*fa9e4066Sahrens 1659*fa9e4066Sahrens tx = dmu_tx_create_assigned(dp, txg); 1660*fa9e4066Sahrens bplist_sync(bpl, tx); 1661*fa9e4066Sahrens dmu_tx_commit(tx); 1662*fa9e4066Sahrens 1663*fa9e4066Sahrens } while (dirty_vdevs); 1664*fa9e4066Sahrens 1665*fa9e4066Sahrens bplist_close(bpl); 1666*fa9e4066Sahrens 1667*fa9e4066Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 1668*fa9e4066Sahrens 1669*fa9e4066Sahrens /* 1670*fa9e4066Sahrens * Rewrite the vdev configuration (which includes the uberblock) 1671*fa9e4066Sahrens * to commit the transaction group. 1672*fa9e4066Sahrens */ 1673*fa9e4066Sahrens while (spa_sync_labels(spa, txg)) { 1674*fa9e4066Sahrens dprintf("waiting for devices to heal\n"); 1675*fa9e4066Sahrens delay(hz); 1676*fa9e4066Sahrens vdev_reopen(rvd, NULL); 1677*fa9e4066Sahrens } 1678*fa9e4066Sahrens 1679*fa9e4066Sahrens /* 1680*fa9e4066Sahrens * Make a stable copy of the fully synced uberblock. 1681*fa9e4066Sahrens * We use this as the root for pool traversals. 1682*fa9e4066Sahrens */ 1683*fa9e4066Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 1684*fa9e4066Sahrens 1685*fa9e4066Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 1686*fa9e4066Sahrens 1687*fa9e4066Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 1688*fa9e4066Sahrens spa->spa_traverse_wanted = 0; 1689*fa9e4066Sahrens spa->spa_ubsync = spa->spa_uberblock; 1690*fa9e4066Sahrens rw_exit(&spa->spa_traverse_lock); 1691*fa9e4066Sahrens 1692*fa9e4066Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 1693*fa9e4066Sahrens 1694*fa9e4066Sahrens /* 1695*fa9e4066Sahrens * Clean up the ZIL records for the synced txg. 1696*fa9e4066Sahrens */ 1697*fa9e4066Sahrens dsl_pool_zil_clean(dp); 1698*fa9e4066Sahrens 1699*fa9e4066Sahrens /* 1700*fa9e4066Sahrens * Update usable space statistics. 1701*fa9e4066Sahrens */ 1702*fa9e4066Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 1703*fa9e4066Sahrens vdev_sync_done(vd, txg); 1704*fa9e4066Sahrens 1705*fa9e4066Sahrens /* 1706*fa9e4066Sahrens * It had better be the case that we didn't dirty anything 1707*fa9e4066Sahrens * since spa_sync_labels(). 1708*fa9e4066Sahrens */ 1709*fa9e4066Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 1710*fa9e4066Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 1711*fa9e4066Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 1712*fa9e4066Sahrens ASSERT(bpl->bpl_queue == NULL); 1713*fa9e4066Sahrens 1714*fa9e4066Sahrens spa_config_exit(spa); 1715*fa9e4066Sahrens } 1716*fa9e4066Sahrens 1717*fa9e4066Sahrens /* 1718*fa9e4066Sahrens * Sync all pools. We don't want to hold the namespace lock across these 1719*fa9e4066Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 1720*fa9e4066Sahrens * sync. 1721*fa9e4066Sahrens */ 1722*fa9e4066Sahrens void 1723*fa9e4066Sahrens spa_sync_allpools(void) 1724*fa9e4066Sahrens { 1725*fa9e4066Sahrens spa_t *spa = NULL; 1726*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 1727*fa9e4066Sahrens while ((spa = spa_next(spa)) != NULL) { 1728*fa9e4066Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 1729*fa9e4066Sahrens continue; 1730*fa9e4066Sahrens spa_open_ref(spa, FTAG); 1731*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1732*fa9e4066Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 1733*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 1734*fa9e4066Sahrens spa_close(spa, FTAG); 1735*fa9e4066Sahrens } 1736*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1737*fa9e4066Sahrens } 1738*fa9e4066Sahrens 1739*fa9e4066Sahrens /* 1740*fa9e4066Sahrens * ========================================================================== 1741*fa9e4066Sahrens * Miscellaneous routines 1742*fa9e4066Sahrens * ========================================================================== 1743*fa9e4066Sahrens */ 1744*fa9e4066Sahrens 1745*fa9e4066Sahrens int 1746*fa9e4066Sahrens spa_busy(void) 1747*fa9e4066Sahrens { 1748*fa9e4066Sahrens return (spa_active_count != 0); 1749*fa9e4066Sahrens } 1750*fa9e4066Sahrens 1751*fa9e4066Sahrens /* 1752*fa9e4066Sahrens * Remove all pools in the system. 1753*fa9e4066Sahrens */ 1754*fa9e4066Sahrens void 1755*fa9e4066Sahrens spa_evict_all(void) 1756*fa9e4066Sahrens { 1757*fa9e4066Sahrens spa_t *spa; 1758*fa9e4066Sahrens 1759*fa9e4066Sahrens /* 1760*fa9e4066Sahrens * Remove all cached state. All pools should be closed now, 1761*fa9e4066Sahrens * so every spa in the AVL tree should be unreferenced. 1762*fa9e4066Sahrens */ 1763*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 1764*fa9e4066Sahrens while ((spa = spa_next(NULL)) != NULL) { 1765*fa9e4066Sahrens /* 1766*fa9e4066Sahrens * Stop all scrub and resilver activity. spa_scrub() needs to 1767*fa9e4066Sahrens * wait for the scrub thread, which may do a detach and sync the 1768*fa9e4066Sahrens * configs, which needs spa_namespace_lock. Drop the lock while 1769*fa9e4066Sahrens * maintaining a hold on the spa_t. 1770*fa9e4066Sahrens */ 1771*fa9e4066Sahrens spa_open_ref(spa, FTAG); 1772*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1773*fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1774*fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 1775*fa9e4066Sahrens spa_close(spa, FTAG); 1776*fa9e4066Sahrens 1777*fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1778*fa9e4066Sahrens spa_unload(spa); 1779*fa9e4066Sahrens spa_deactivate(spa); 1780*fa9e4066Sahrens } 1781*fa9e4066Sahrens spa_remove(spa); 1782*fa9e4066Sahrens } 1783*fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1784*fa9e4066Sahrens } 1785