/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * Comparison function for the error-list AVL trees; orders spa_error_entry_t
 * entries by their bookmark.
 */
static int
spa_error_entry_compare(const void *a, const void *b)
{
    spa_error_entry_t *sa = (spa_error_entry_t *)a;
    spa_error_entry_t *sb = (spa_error_entry_t *)b;
    int ret;

    ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
        sizeof (zbookmark_t));

    if (ret < 0)
        return (-1);
    else if (ret > 0)
        return (1);
    else
        return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
    ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

    bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
    bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

    avl_create(&spa->spa_errlist_scrub,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
    avl_create(&spa->spa_errlist_last,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
    int t;

    ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

    spa->spa_state = POOL_STATE_ACTIVE;

    spa->spa_normal_class = metaslab_class_create();

    for (t = 0; t < ZIO_TYPES; t++) {
        spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
            8, maxclsyspri, 50, INT_MAX,
            TASKQ_PREPOPULATE);
        spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
            8, maxclsyspri, 50, INT_MAX,
            TASKQ_PREPOPULATE);
    }

    rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

    list_create(&spa->spa_dirty_list, sizeof (vdev_t),
        offsetof(vdev_t, vdev_dirty_node));

    txg_list_create(&spa->spa_vdev_txg_list,
        offsetof(struct vdev, vdev_txg_node));

    avl_create(&spa->spa_errlist_scrub,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
    avl_create(&spa->spa_errlist_last,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
    int t;

    ASSERT(spa->spa_sync_on == B_FALSE);
    ASSERT(spa->spa_dsl_pool == NULL);
    ASSERT(spa->spa_root_vdev == NULL);

    ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

    txg_list_destroy(&spa->spa_vdev_txg_list);

    list_destroy(&spa->spa_dirty_list);

    rw_destroy(&spa->spa_traverse_lock);

    for (t = 0; t < ZIO_TYPES; t++) {
        taskq_destroy(spa->spa_zio_issue_taskq[t]);
        taskq_destroy(spa->spa_zio_intr_taskq[t]);
        spa->spa_zio_issue_taskq[t] = NULL;
        spa->spa_zio_intr_taskq[t] = NULL;
    }

    metaslab_class_destroy(spa->spa_normal_class);
    spa->spa_normal_class = NULL;

    /*
     * If this was part of an import or the open otherwise failed, we may
     * still have errors left in the queues. Empty them just in case.
     */
    spa_errlog_drain(spa);

    avl_destroy(&spa->spa_errlist_scrub);
    avl_destroy(&spa->spa_errlist_last);

    spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
    nvlist_t **child;
    uint_t c, children;
    int error;

    if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
        return (error);

    if ((*vdp)->vdev_ops->vdev_op_leaf)
        return (0);

    if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
        &child, &children) != 0) {
        vdev_free(*vdp);
        *vdp = NULL;
        return (EINVAL);
    }

    for (c = 0; c < children; c++) {
        vdev_t *vd;
        if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
            atype)) != 0) {
            vdev_free(*vdp);
            *vdp = NULL;
            return (error);
        }
    }

    ASSERT(*vdp != NULL);

    return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
    int i;

    /*
     * Stop async tasks.
     */
    spa_async_suspend(spa);

    /*
     * Stop syncing.
     */
    if (spa->spa_sync_on) {
        txg_sync_stop(spa->spa_dsl_pool);
        spa->spa_sync_on = B_FALSE;
    }

    /*
     * Wait for any outstanding prefetch I/O to complete.
     */
    spa_config_enter(spa, RW_WRITER, FTAG);
    spa_config_exit(spa, FTAG);

    /*
     * Close the dsl pool.
     */
    if (spa->spa_dsl_pool) {
        dsl_pool_close(spa->spa_dsl_pool);
        spa->spa_dsl_pool = NULL;
    }

    /*
     * Close all vdevs.
     */
    if (spa->spa_root_vdev)
        vdev_free(spa->spa_root_vdev);
    ASSERT(spa->spa_root_vdev == NULL);

    for (i = 0; i < spa->spa_nspares; i++)
        vdev_free(spa->spa_spares[i]);
    if (spa->spa_spares) {
        kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
        spa->spa_spares = NULL;
    }
    if (spa->spa_sparelist) {
        nvlist_free(spa->spa_sparelist);
        spa->spa_sparelist = NULL;
    }

    spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
    nvlist_t **spares;
    uint_t nspares;
    int i;

    /*
     * First, close and free any existing spare vdevs.
     */
    for (i = 0; i < spa->spa_nspares; i++) {
        vdev_close(spa->spa_spares[i]);
        vdev_free(spa->spa_spares[i]);
    }
    if (spa->spa_spares)
        kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

    if (spa->spa_sparelist == NULL)
        nspares = 0;
    else
        VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
            ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

    spa->spa_nspares = (int)nspares;
    spa->spa_spares = NULL;

    if (nspares == 0)
        return;

    /*
     * Construct the array of vdevs, opening them to get status in the
     * process.
     */
    spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
    for (i = 0; i < spa->spa_nspares; i++) {
        vdev_t *vd;

        VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
            VDEV_ALLOC_SPARE) == 0);
        ASSERT(vd != NULL);

        spa->spa_spares[i] = vd;

        if (vdev_open(vd) != 0)
            continue;

        vd->vdev_top = vd;
        (void) vdev_validate_spare(vd);
    }

    /*
     * Recompute the stashed list of spares, with status information
     * this time.
     */
    VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
        DATA_TYPE_NVLIST_ARRAY) == 0);

    spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
    for (i = 0; i < spa->spa_nspares; i++)
        spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
            B_TRUE, B_TRUE);
    VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
        spares, spa->spa_nspares) == 0);
    for (i = 0; i < spa->spa_nspares; i++)
        nvlist_free(spares[i]);
    kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

/*
 * Read a packed nvlist out of the given MOS object and unpack it into 'value'.
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
    dmu_buf_t *db;
    char *packed = NULL;
    size_t nvsize = 0;
    int error;
    *value = NULL;

    VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
    nvsize = *(uint64_t *)db->db_data;
    dmu_buf_rele(db, FTAG);

    packed = kmem_alloc(nvsize, KM_SLEEP);
    error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
    if (error == 0)
        error = nvlist_unpack(packed, nvsize, value, 0);
    kmem_free(packed, nvsize);

    return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
    int error = 0;
    nvlist_t *nvroot = NULL;
    vdev_t *rvd;
    uberblock_t *ub = &spa->spa_uberblock;
    uint64_t config_cache_txg = spa->spa_config_txg;
    uint64_t pool_guid;
    uint64_t version;
    zio_t *zio;

    spa->spa_load_state = state;

    if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
        nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
        error = EINVAL;
        goto out;
    }

    /*
     * Versioning wasn't explicitly added to the label until later, so if
     * it's not present, treat it as the initial version.
     */
    if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
        version = ZFS_VERSION_INITIAL;

    (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
        &spa->spa_config_txg);

    if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
        spa_guid_exists(pool_guid, 0)) {
        error = EEXIST;
        goto out;
    }

    spa->spa_load_guid = pool_guid;

    /*
     * Parse the configuration into a vdev tree. We explicitly set the
     * value that will be returned by spa_version() since parsing the
     * configuration requires knowing the version number.
     */
    spa_config_enter(spa, RW_WRITER, FTAG);
    spa->spa_ubsync.ub_version = version;
    error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
    spa_config_exit(spa, FTAG);

    if (error != 0)
        goto out;

    ASSERT(spa->spa_root_vdev == rvd);
    ASSERT(spa_guid(spa) == pool_guid);

    /*
     * Try to open all vdevs, loading each label in the process.
     */
    if (vdev_open(rvd) != 0) {
        error = ENXIO;
        goto out;
    }

    /*
     * Validate the labels for all leaf vdevs. We need to grab the config
     * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
     * flag.
     */
    spa_config_enter(spa, RW_READER, FTAG);
    error = vdev_validate(rvd);
    spa_config_exit(spa, FTAG);

    if (error != 0) {
        error = EBADF;
        goto out;
    }

    if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
        error = ENXIO;
        goto out;
    }

    /*
     * Find the best uberblock.
     */
    bzero(ub, sizeof (uberblock_t));

    zio = zio_root(spa, NULL, NULL,
        ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
    vdev_uberblock_load(zio, rvd, ub);
    error = zio_wait(zio);

    /*
     * If we weren't able to find a single valid uberblock, return failure.
     */
    if (ub->ub_txg == 0) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = ENXIO;
        goto out;
    }

    /*
     * If the pool is newer than the code, we can't open it.
     */
    if (ub->ub_version > ZFS_VERSION) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_VERSION_NEWER);
        error = ENOTSUP;
        goto out;
    }

    /*
     * If the vdev guid sum doesn't match the uberblock, we have an
     * incomplete configuration.
     */
    if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_BAD_GUID_SUM);
        error = ENXIO;
        goto out;
    }

    /*
     * Initialize internal SPA structures.
     */
    spa->spa_state = POOL_STATE_ACTIVE;
    spa->spa_ubsync = spa->spa_uberblock;
    spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
    error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
    if (error) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        goto out;
    }
    spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

    if (zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
        sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    if (!mosconfig) {
        nvlist_t *newconfig;

        if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
            vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                VDEV_AUX_CORRUPT_DATA);
            error = EIO;
            goto out;
        }

        spa_config_set(spa, newconfig);
        spa_unload(spa);
        spa_deactivate(spa);
        spa_activate(spa);

        return (spa_load(spa, newconfig, state, B_TRUE));
    }

    if (zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
        sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    /*
     * Load the bit that tells us to use the new accounting function
     * (raid-z deflation). If we have an older pool, this will not
     * be present.
     */
    error = zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
        sizeof (uint64_t), 1, &spa->spa_deflate);
    if (error != 0 && error != ENOENT) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    /*
     * Load the persistent error log. If we have an older pool, this will
     * not be present.
     */
    error = zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
        sizeof (uint64_t), 1, &spa->spa_errlog_last);
    if (error != 0 && error != ENOENT) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    error = zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
        sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
    if (error != 0 && error != ENOENT) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    /*
     * Load any hot spares for this pool.
     */
    error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
        DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
    if (error != 0 && error != ENOENT) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }
    if (error == 0) {
        ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
        if (load_nvlist(spa, spa->spa_spares_object,
            &spa->spa_sparelist) != 0) {
            vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                VDEV_AUX_CORRUPT_DATA);
            error = EIO;
            goto out;
        }

        spa_config_enter(spa, RW_WRITER, FTAG);
        spa_load_spares(spa);
        spa_config_exit(spa, FTAG);
    }

    /*
     * Load the vdev state for all toplevel vdevs.
     */
    vdev_load(rvd);

    /*
     * Propagate the leaf DTLs we just loaded all the way up the tree.
     */
    spa_config_enter(spa, RW_WRITER, FTAG);
    vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
    spa_config_exit(spa, FTAG);

    /*
     * Check the state of the root vdev. If it can't be opened, it
     * indicates one or more toplevel vdevs are faulted.
     */
    if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
        error = ENXIO;
        goto out;
    }

    if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
        dmu_tx_t *tx;
        int need_update = B_FALSE;
        int c;

        /*
         * Claim log blocks that haven't been committed yet.
         * This must all happen in a single txg.
         */
        tx = dmu_tx_create_assigned(spa_get_dsl(spa),
            spa_first_txg(spa));
        dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
        dmu_tx_commit(tx);

        spa->spa_sync_on = B_TRUE;
        txg_sync_start(spa->spa_dsl_pool);

        /*
         * Wait for all claims to sync.
         */
        txg_wait_synced(spa->spa_dsl_pool, 0);

        /*
         * If the config cache is stale, or we have uninitialized
         * metaslabs (see spa_vdev_add()), then update the config.
         */
        if (config_cache_txg != spa->spa_config_txg ||
            state == SPA_LOAD_IMPORT)
            need_update = B_TRUE;

        for (c = 0; c < rvd->vdev_children; c++)
            if (rvd->vdev_child[c]->vdev_ms_array == 0)
                need_update = B_TRUE;

        /*
         * Update the config cache asynchronously in case we're the
         * root pool, in which case the config cache isn't writable yet.
         */
        if (need_update)
            spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
    }

    error = 0;
out:
    if (error && error != EBADF)
        zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
    spa->spa_load_state = SPA_LOAD_NONE;
    spa->spa_ena = 0;

    return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
    spa_t *spa;
    int error;
    int loaded = B_FALSE;
    int locked = B_FALSE;

    *spapp = NULL;

    /*
     * As disgusting as this is, we need to support recursive calls to this
     * function because dsl_dir_open() is called during spa_load(), and ends
     * up calling spa_open() again. The real fix is to figure out how to
     * avoid dsl_dir_open() calling this in the first place.
     */
    if (mutex_owner(&spa_namespace_lock) != curthread) {
        mutex_enter(&spa_namespace_lock);
        locked = B_TRUE;
    }

    if ((spa = spa_lookup(pool)) == NULL) {
        if (locked)
            mutex_exit(&spa_namespace_lock);
        return (ENOENT);
    }
    if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

        spa_activate(spa);

        error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

        if (error == EBADF) {
            /*
             * If vdev_validate() returns failure (indicated by
             * EBADF), it means that one of the vdevs indicates
             * that the pool has been exported or destroyed. If
             * this is the case, the config cache is out of sync
             * and we should remove the pool from the namespace.
             */
            zfs_post_ok(spa, NULL);
            spa_unload(spa);
            spa_deactivate(spa);
            spa_remove(spa);
            spa_config_sync();
            if (locked)
                mutex_exit(&spa_namespace_lock);
            return (ENOENT);
        }

        if (error) {
            /*
             * We can't open the pool, but we still have useful
             * information: the state of each vdev after the
             * attempted vdev_open(). Return this to the user.
             */
            if (config != NULL && spa->spa_root_vdev != NULL) {
                spa_config_enter(spa, RW_READER, FTAG);
                *config = spa_config_generate(spa, NULL, -1ULL,
                    B_TRUE);
                spa_config_exit(spa, FTAG);
            }
            spa_unload(spa);
            spa_deactivate(spa);
            spa->spa_last_open_failed = B_TRUE;
            if (locked)
                mutex_exit(&spa_namespace_lock);
            *spapp = NULL;
            return (error);
        } else {
            zfs_post_ok(spa, NULL);
            spa->spa_last_open_failed = B_FALSE;
        }

        loaded = B_TRUE;
    }

    spa_open_ref(spa, tag);
    if (locked)
        mutex_exit(&spa_namespace_lock);

    *spapp = spa;

    if (config != NULL) {
        spa_config_enter(spa, RW_READER, FTAG);
        *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
        spa_config_exit(spa, FTAG);
    }

    /*
     * If we just loaded the pool, resilver anything that's out of date.
     */
    if (loaded && (spa_mode & FWRITE))
        VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

    return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
    return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
    spa_t *spa;

    mutex_enter(&spa_namespace_lock);
    if ((spa = spa_lookup(name)) == NULL) {
        mutex_exit(&spa_namespace_lock);
        return (NULL);
    }
    spa->spa_inject_ref++;
    mutex_exit(&spa_namespace_lock);

    return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
    mutex_enter(&spa_namespace_lock);
    spa->spa_inject_ref--;
    mutex_exit(&spa_namespace_lock);
}

/*
 * Add the list of hot spares, with their current status, to the given pool
 * config.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
    nvlist_t **spares;
    uint_t i, nspares;
    nvlist_t *nvroot;
    uint64_t guid;
    vdev_stat_t *vs;
    uint_t vsc;

    if (spa->spa_nspares == 0)
        return;

    VERIFY(nvlist_lookup_nvlist(config,
        ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
    VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
        ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
    if (nspares != 0) {
        VERIFY(nvlist_add_nvlist_array(nvroot,
            ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
        VERIFY(nvlist_lookup_nvlist_array(nvroot,
            ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

        /*
         * Go through and find any spares which have since been
         * repurposed as an active spare. If this is the case, update
         * their status appropriately.
         */
        for (i = 0; i < nspares; i++) {
            VERIFY(nvlist_lookup_uint64(spares[i],
                ZPOOL_CONFIG_GUID, &guid) == 0);
            if (spa_spare_inuse(guid)) {
                VERIFY(nvlist_lookup_uint64_array(
                    spares[i], ZPOOL_CONFIG_STATS,
                    (uint64_t **)&vs, &vsc) == 0);
                vs->vs_state = VDEV_STATE_CANT_OPEN;
                vs->vs_aux = VDEV_AUX_SPARED;
            }
        }
    }
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
    int error;
    spa_t *spa;

    *config = NULL;
    error = spa_open_common(name, &spa, FTAG, config);

    if (spa && *config != NULL) {
        VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
            spa_get_errlog_size(spa)) == 0);

        spa_add_spares(spa, *config);
    }

    /*
     * We want to get the alternate root even for faulted pools, so we cheat
     * and call spa_lookup() directly.
     */
    if (altroot) {
        if (spa == NULL) {
            mutex_enter(&spa_namespace_lock);
            spa = spa_lookup(name);
            if (spa)
                spa_altroot(spa, altroot, buflen);
            else
                altroot[0] = '\0';
            spa = NULL;
            mutex_exit(&spa_namespace_lock);
        } else {
            spa_altroot(spa, altroot, buflen);
        }
    }

    if (spa != NULL)
        spa_close(spa, FTAG);

    return (error);
}

/*
 * Validate that the 'spares' array is well formed. We must have an array of
 * nvlists, each of which describes a valid leaf vdev.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
    nvlist_t **spares;
    uint_t i, nspares;
    vdev_t *vd;
    int error;

    /*
     * It's acceptable to have no spares specified.
     */
    if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
        &spares, &nspares) != 0)
        return (0);

    if (nspares == 0)
        return (EINVAL);

    /*
     * Make sure the pool is formatted with a version that supports hot
     * spares.
     */
    if (spa_version(spa) < ZFS_VERSION_SPARES)
        return (ENOTSUP);

    for (i = 0; i < nspares; i++) {
        if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
            mode)) != 0)
            return (error);

        if (!vd->vdev_ops->vdev_op_leaf) {
            vdev_free(vd);
            return (EINVAL);
        }

        if ((error = vdev_open(vd)) != 0) {
            vdev_free(vd);
            return (error);
        }

        vd->vdev_top = vd;
        if ((error = vdev_label_spare(vd, crtxg)) != 0) {
            vdev_free(vd);
            return (error);
        }

        VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
            vd->vdev_guid) == 0);

        vdev_free(vd);
    }

    return (0);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
    spa_t *spa;
    vdev_t *rvd;
    dsl_pool_t *dp;
    dmu_tx_t *tx;
    int c, error = 0;
    uint64_t txg = TXG_INITIAL;
    nvlist_t **spares;
    uint_t nspares;

    /*
     * If this pool already exists, return failure.
     */
    mutex_enter(&spa_namespace_lock);
    if (spa_lookup(pool) != NULL) {
        mutex_exit(&spa_namespace_lock);
        return (EEXIST);
    }

    /*
     * Allocate a new spa_t structure.
     */
    spa = spa_add(pool, altroot);
    spa_activate(spa);

    spa->spa_uberblock.ub_txg = txg - 1;
    spa->spa_uberblock.ub_version = ZFS_VERSION;
    spa->spa_ubsync = spa->spa_uberblock;

    /*
     * Create the root vdev.
     */
    spa_config_enter(spa, RW_WRITER, FTAG);

    error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

    ASSERT(error != 0 || rvd != NULL);
    ASSERT(error != 0 || spa->spa_root_vdev == rvd);

    if (error == 0 && rvd->vdev_children == 0)
        error = EINVAL;

    if (error == 0 &&
        (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
        (error = spa_validate_spares(spa, nvroot, txg,
        VDEV_ALLOC_ADD)) == 0) {
        for (c = 0; c < rvd->vdev_children; c++)
            vdev_init(rvd->vdev_child[c], txg);
        vdev_config_dirty(rvd);
    }

    spa_config_exit(spa, FTAG);

    if (error != 0) {
        spa_unload(spa);
        spa_deactivate(spa);
        spa_remove(spa);
        mutex_exit(&spa_namespace_lock);
        return (error);
    }

    /*
     * Get the list of spares, if specified.
     */
    if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
        &spares, &nspares) == 0) {
        VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
            KM_SLEEP) == 0);
        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
            ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
        spa_config_enter(spa, RW_WRITER, FTAG);
        spa_load_spares(spa);
        spa_config_exit(spa, FTAG);
        spa->spa_sync_spares = B_TRUE;
    }

    spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
    spa->spa_meta_objset = dp->dp_meta_objset;

    tx = dmu_tx_create_assigned(dp, txg);

    /*
     * Create the pool config object.
     */
    spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
        DMU_OT_PACKED_NVLIST, 1 << 14,
        DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

    if (zap_add(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
        sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
        cmn_err(CE_PANIC, "failed to add pool config");
    }

    /* Newly created pools are always deflated. */
    spa->spa_deflate = TRUE;
    if (zap_add(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
        sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
        cmn_err(CE_PANIC, "failed to add deflate");
    }

    /*
     * Create the deferred-free bplist object. Turn off compression
     * because sync-to-convergence takes longer if the blocksize
     * keeps changing.
     */
    spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
        1 << 14, tx);
    dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
        ZIO_COMPRESS_OFF, tx);

    if (zap_add(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
        sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
        cmn_err(CE_PANIC, "failed to add bplist");
    }

    dmu_tx_commit(tx);

    spa->spa_sync_on = B_TRUE;
    txg_sync_start(spa->spa_dsl_pool);

    /*
     * We explicitly wait for the first transaction to complete so that our
     * bean counters are appropriately updated.
     */
    txg_wait_synced(spa->spa_dsl_pool, txg);

    spa_config_sync();

    mutex_exit(&spa_namespace_lock);

    return (0);
}

/*
 * Import the given pool into the system. We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
    spa_t *spa;
    int error;
    nvlist_t *nvroot;
    nvlist_t **spares;
    uint_t nspares;

    if (!(spa_mode & FWRITE))
        return (EROFS);

    /*
     * If a pool with this name exists, return failure.
     */
    mutex_enter(&spa_namespace_lock);
    if (spa_lookup(pool) != NULL) {
        mutex_exit(&spa_namespace_lock);
        return (EEXIST);
    }

    /*
     * Create and initialize the spa structure.
     */
    spa = spa_add(pool, altroot);
    spa_activate(spa);

    /*
     * Pass off the heavy lifting to spa_load().
     * Pass TRUE for mosconfig because the user-supplied config
     * is actually the one to trust when doing an import.
     */
    error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

    spa_config_enter(spa, RW_WRITER, FTAG);
    /*
     * Toss any existing sparelist, as it doesn't have any validity anymore,
     * and conflicts with spa_has_spare().
     */
    if (spa->spa_sparelist) {
        nvlist_free(spa->spa_sparelist);
        spa->spa_sparelist = NULL;
        spa_load_spares(spa);
    }

    VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
        &nvroot) == 0);
    if (error == 0)
        error = spa_validate_spares(spa, nvroot, -1ULL,
            VDEV_ALLOC_SPARE);
    spa_config_exit(spa, FTAG);

    if (error != 0) {
        spa_unload(spa);
        spa_deactivate(spa);
        spa_remove(spa);
        mutex_exit(&spa_namespace_lock);
        return (error);
    }

    /*
     * Override any spares as specified by the user, as these may have
     * correct device names/devids, etc.
     */
    if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
        &spares, &nspares) == 0) {
        if (spa->spa_sparelist)
            VERIFY(nvlist_remove(spa->spa_sparelist,
                ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
        else
            VERIFY(nvlist_alloc(&spa->spa_sparelist,
                NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
            ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
        spa_config_enter(spa, RW_WRITER, FTAG);
        spa_load_spares(spa);
        spa_config_exit(spa, FTAG);
        spa->spa_sync_spares = B_TRUE;
    }

    /*
     * Update the config cache to include the newly-imported pool.
     */
    spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

    mutex_exit(&spa_namespace_lock);

    /*
     * Resilver anything that's out of date.
     */
    if (spa_mode & FWRITE)
        VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

    return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define TRYIMPORT_NAME "$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
    nvlist_t *config = NULL;
    char *poolname;
    spa_t *spa;
    uint64_t state;

    if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
        return (NULL);

    if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
        return (NULL);

    /*
     * Create and initialize the spa structure.
     */
    mutex_enter(&spa_namespace_lock);
    spa = spa_add(TRYIMPORT_NAME, NULL);
    spa_activate(spa);

    /*
     * Pass off the heavy lifting to spa_load().
     * Pass TRUE for mosconfig because the user-supplied config
     * is actually the one to trust when doing an import.
     */
    (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

    /*
     * If 'tryconfig' was at least parsable, return the current config.
     */
    if (spa->spa_root_vdev != NULL) {
        spa_config_enter(spa, RW_READER, FTAG);
        config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
        spa_config_exit(spa, FTAG);
        VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
            poolname) == 0);
        VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
            state) == 0);

        /*
         * Add the list of hot spares.
         */
        spa_add_spares(spa, config);
    }

    spa_unload(spa);
    spa_deactivate(spa);
    spa_remove(spa);
    mutex_exit(&spa_namespace_lock);

    return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
    spa_t *spa;

    if (oldconfig)
        *oldconfig = NULL;

    if (!(spa_mode & FWRITE))
        return (EROFS);

    mutex_enter(&spa_namespace_lock);
    if ((spa = spa_lookup(pool)) == NULL) {
        mutex_exit(&spa_namespace_lock);
        return (ENOENT);
    }

    /*
     * Put a hold on the pool, drop the namespace lock, stop async tasks,
     * reacquire the namespace lock, and see if we can export.
     */
    spa_open_ref(spa, FTAG);
    mutex_exit(&spa_namespace_lock);
    spa_async_suspend(spa);
    mutex_enter(&spa_namespace_lock);
    spa_close(spa, FTAG);

    /*
     * The pool will be in core if it's openable,
     * in which case we can modify its state.
1316fa9e4066Sahrens */ 1317fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1318fa9e4066Sahrens /* 1319fa9e4066Sahrens * Objsets may be open only because they're dirty, so we 1320fa9e4066Sahrens * have to force it to sync before checking spa_refcnt. 1321fa9e4066Sahrens */ 1322fa9e4066Sahrens spa_scrub_suspend(spa); 1323fa9e4066Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 1324fa9e4066Sahrens 1325ea8dc4b6Seschrock /* 1326ea8dc4b6Seschrock * A pool cannot be exported or destroyed if there are active 1327ea8dc4b6Seschrock * references. If we are resetting a pool, allow references by 1328ea8dc4b6Seschrock * fault injection handlers. 1329ea8dc4b6Seschrock */ 1330ea8dc4b6Seschrock if (!spa_refcount_zero(spa) || 1331ea8dc4b6Seschrock (spa->spa_inject_ref != 0 && 1332ea8dc4b6Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 1333fa9e4066Sahrens spa_scrub_resume(spa); 1334ea8dc4b6Seschrock spa_async_resume(spa); 1335fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1336fa9e4066Sahrens return (EBUSY); 1337fa9e4066Sahrens } 1338fa9e4066Sahrens 1339fa9e4066Sahrens spa_scrub_resume(spa); 1340fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1341fa9e4066Sahrens 1342fa9e4066Sahrens /* 1343fa9e4066Sahrens * We want this to be reflected on every label, 1344fa9e4066Sahrens * so mark them all dirty. spa_unload() will do the 1345fa9e4066Sahrens * final sync that pushes these changes out. 1346fa9e4066Sahrens */ 1347ea8dc4b6Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 13485dabedeeSbonwick spa_config_enter(spa, RW_WRITER, FTAG); 1349ea8dc4b6Seschrock spa->spa_state = new_state; 13500373e76bSbonwick spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1351ea8dc4b6Seschrock vdev_config_dirty(spa->spa_root_vdev); 13525dabedeeSbonwick spa_config_exit(spa, FTAG); 1353ea8dc4b6Seschrock } 1354fa9e4066Sahrens } 1355fa9e4066Sahrens 1356fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1357fa9e4066Sahrens spa_unload(spa); 1358fa9e4066Sahrens spa_deactivate(spa); 1359fa9e4066Sahrens } 1360fa9e4066Sahrens 136144cd46caSbillm if (oldconfig && spa->spa_config) 136244cd46caSbillm VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 136344cd46caSbillm 1364ea8dc4b6Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 1365ea8dc4b6Seschrock spa_remove(spa); 1366ea8dc4b6Seschrock spa_config_sync(); 1367ea8dc4b6Seschrock } 1368fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 1369fa9e4066Sahrens 1370fa9e4066Sahrens return (0); 1371fa9e4066Sahrens } 1372fa9e4066Sahrens 1373fa9e4066Sahrens /* 1374fa9e4066Sahrens * Destroy a storage pool. 1375fa9e4066Sahrens */ 1376fa9e4066Sahrens int 1377fa9e4066Sahrens spa_destroy(char *pool) 1378fa9e4066Sahrens { 137944cd46caSbillm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1380fa9e4066Sahrens } 1381fa9e4066Sahrens 1382fa9e4066Sahrens /* 1383fa9e4066Sahrens * Export a storage pool. 1384fa9e4066Sahrens */ 1385fa9e4066Sahrens int 138644cd46caSbillm spa_export(char *pool, nvlist_t **oldconfig) 1387fa9e4066Sahrens { 138844cd46caSbillm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1389fa9e4066Sahrens } 1390fa9e4066Sahrens 1391ea8dc4b6Seschrock /* 1392ea8dc4b6Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 1393ea8dc4b6Seschrock * from the namespace in any way. 
1394ea8dc4b6Seschrock */ 1395ea8dc4b6Seschrock int 1396ea8dc4b6Seschrock spa_reset(char *pool) 1397ea8dc4b6Seschrock { 139844cd46caSbillm return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1399ea8dc4b6Seschrock } 1400ea8dc4b6Seschrock 1401ea8dc4b6Seschrock 1402fa9e4066Sahrens /* 1403fa9e4066Sahrens * ========================================================================== 1404fa9e4066Sahrens * Device manipulation 1405fa9e4066Sahrens * ========================================================================== 1406fa9e4066Sahrens */ 1407fa9e4066Sahrens 1408fa9e4066Sahrens /* 1409fa9e4066Sahrens * Add capacity to a storage pool. 1410fa9e4066Sahrens */ 1411fa9e4066Sahrens int 1412fa9e4066Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1413fa9e4066Sahrens { 1414fa9e4066Sahrens uint64_t txg; 14150373e76bSbonwick int c, error; 1416fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 14170e34b6a7Sbonwick vdev_t *vd, *tvd; 141899653d4eSeschrock nvlist_t **spares; 141999653d4eSeschrock uint_t i, nspares; 1420fa9e4066Sahrens 1421fa9e4066Sahrens txg = spa_vdev_enter(spa); 1422fa9e4066Sahrens 142399653d4eSeschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 142499653d4eSeschrock VDEV_ALLOC_ADD)) != 0) 142599653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, error)); 1426fa9e4066Sahrens 142799653d4eSeschrock if ((error = spa_validate_spares(spa, nvroot, txg, 142899653d4eSeschrock VDEV_ALLOC_ADD)) != 0) 142999653d4eSeschrock return (spa_vdev_exit(spa, vd, txg, error)); 143099653d4eSeschrock 143199653d4eSeschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 143299653d4eSeschrock &spares, &nspares) != 0) 143399653d4eSeschrock nspares = 0; 143499653d4eSeschrock 143599653d4eSeschrock if (vd->vdev_children == 0 && nspares == 0) 1436fa9e4066Sahrens return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1437fa9e4066Sahrens 143899653d4eSeschrock if (vd->vdev_children != 0) { 143999653d4eSeschrock if ((error = vdev_create(vd, txg, B_FALSE)) != 0) 144099653d4eSeschrock return (spa_vdev_exit(spa, vd, txg, error)); 1441fa9e4066Sahrens 144299653d4eSeschrock /* 144399653d4eSeschrock * Transfer each new top-level vdev from vd to rvd. 
144499653d4eSeschrock */ 144599653d4eSeschrock for (c = 0; c < vd->vdev_children; c++) { 144699653d4eSeschrock tvd = vd->vdev_child[c]; 144799653d4eSeschrock vdev_remove_child(vd, tvd); 144899653d4eSeschrock tvd->vdev_id = rvd->vdev_children; 144999653d4eSeschrock vdev_add_child(rvd, tvd); 145099653d4eSeschrock vdev_config_dirty(tvd); 145199653d4eSeschrock } 145299653d4eSeschrock } 145399653d4eSeschrock 145499653d4eSeschrock if (nspares != 0) { 145599653d4eSeschrock if (spa->spa_sparelist != NULL) { 145699653d4eSeschrock nvlist_t **oldspares; 145799653d4eSeschrock uint_t oldnspares; 145899653d4eSeschrock nvlist_t **newspares; 145999653d4eSeschrock 146099653d4eSeschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 146199653d4eSeschrock ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 146299653d4eSeschrock 146399653d4eSeschrock newspares = kmem_alloc(sizeof (void *) * 146499653d4eSeschrock (nspares + oldnspares), KM_SLEEP); 146599653d4eSeschrock for (i = 0; i < oldnspares; i++) 146699653d4eSeschrock VERIFY(nvlist_dup(oldspares[i], 146799653d4eSeschrock &newspares[i], KM_SLEEP) == 0); 146899653d4eSeschrock for (i = 0; i < nspares; i++) 146999653d4eSeschrock VERIFY(nvlist_dup(spares[i], 147099653d4eSeschrock &newspares[i + oldnspares], 147199653d4eSeschrock KM_SLEEP) == 0); 147299653d4eSeschrock 147399653d4eSeschrock VERIFY(nvlist_remove(spa->spa_sparelist, 147499653d4eSeschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 147599653d4eSeschrock 147699653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 147799653d4eSeschrock ZPOOL_CONFIG_SPARES, newspares, 147899653d4eSeschrock nspares + oldnspares) == 0); 147999653d4eSeschrock for (i = 0; i < oldnspares + nspares; i++) 148099653d4eSeschrock nvlist_free(newspares[i]); 148199653d4eSeschrock kmem_free(newspares, (oldnspares + nspares) * 148299653d4eSeschrock sizeof (void *)); 148399653d4eSeschrock } else { 148499653d4eSeschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 148599653d4eSeschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 148699653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 148799653d4eSeschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 148899653d4eSeschrock } 148999653d4eSeschrock 149099653d4eSeschrock spa_load_spares(spa); 149199653d4eSeschrock spa->spa_sync_spares = B_TRUE; 1492fa9e4066Sahrens } 1493fa9e4066Sahrens 1494fa9e4066Sahrens /* 14950e34b6a7Sbonwick * We have to be careful when adding new vdevs to an existing pool. 14960e34b6a7Sbonwick * If other threads start allocating from these vdevs before we 14970e34b6a7Sbonwick * sync the config cache, and we lose power, then upon reboot we may 14980e34b6a7Sbonwick * fail to open the pool because there are DVAs that the config cache 14990e34b6a7Sbonwick * can't translate. Therefore, we first add the vdevs without 15000e34b6a7Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 15010373e76bSbonwick * and then let spa_config_update() initialize the new metaslabs. 15020e34b6a7Sbonwick * 15030e34b6a7Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 15040e34b6a7Sbonwick * if we lose power at any point in this sequence, the remaining 15050e34b6a7Sbonwick * steps will be completed the next time we load the pool. 
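 *
 * (Concretely, in the code below spa_vdev_exit() performs the config-cache
 * sync with the new, metaslab-less vdevs, and spa_config_update(), called
 * with the namespace lock held, then initializes the metaslabs and writes
 * out the final configuration.)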
15060e34b6a7Sbonwick */
15070373e76bSbonwick (void) spa_vdev_exit(spa, vd, txg, 0);
15080e34b6a7Sbonwick
15090373e76bSbonwick mutex_enter(&spa_namespace_lock);
15100373e76bSbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
15110373e76bSbonwick mutex_exit(&spa_namespace_lock);
1512fa9e4066Sahrens
15130373e76bSbonwick return (0);
1514fa9e4066Sahrens }
1515fa9e4066Sahrens
1516fa9e4066Sahrens /*
1517fa9e4066Sahrens * Attach a device to a mirror. The arguments are the path to any device
1518fa9e4066Sahrens * in the mirror, and the nvroot for the new device. If the path specifies
1519fa9e4066Sahrens * a device that is not mirrored, we automatically insert the mirror vdev.
1520fa9e4066Sahrens *
1521fa9e4066Sahrens * If 'replacing' is specified, the new device is intended to replace the
1522fa9e4066Sahrens * existing device; in this case the two devices are made into their own
1523fa9e4066Sahrens * mirror using the 'replacing' vdev, which is functionally identical to
1524fa9e4066Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few
1525fa9e4066Sahrens * extra rules: you can't attach to it after it's been created, and upon
1526fa9e4066Sahrens * completion of resilvering, the first disk (the one being replaced)
1527fa9e4066Sahrens * is automatically detached.
1528fa9e4066Sahrens */
1529fa9e4066Sahrens int
1530ea8dc4b6Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
1531fa9e4066Sahrens {
1532fa9e4066Sahrens uint64_t txg, open_txg;
1533fa9e4066Sahrens int error;
1534fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev;
1535fa9e4066Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
153699653d4eSeschrock vdev_ops_t *pvops;
1537fa9e4066Sahrens
1538fa9e4066Sahrens txg = spa_vdev_enter(spa);
1539fa9e4066Sahrens
1540ea8dc4b6Seschrock oldvd = vdev_lookup_by_guid(rvd, guid);
1541fa9e4066Sahrens
1542fa9e4066Sahrens if (oldvd == NULL)
1543fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1544fa9e4066Sahrens
15450e34b6a7Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf)
15460e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
15470e34b6a7Sbonwick
1548fa9e4066Sahrens pvd = oldvd->vdev_parent;
1549fa9e4066Sahrens
155099653d4eSeschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
155199653d4eSeschrock VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
1552fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1553fa9e4066Sahrens
1554fa9e4066Sahrens newvd = newrootvd->vdev_child[0];
1555fa9e4066Sahrens
1556fa9e4066Sahrens if (!newvd->vdev_ops->vdev_op_leaf)
1557fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1558fa9e4066Sahrens
155999653d4eSeschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
1560fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error));
1561fa9e4066Sahrens
156299653d4eSeschrock if (!replacing) {
156399653d4eSeschrock /*
156499653d4eSeschrock * For attach, the only allowable parent is a mirror or the root
156599653d4eSeschrock * vdev.
156699653d4eSeschrock */
156799653d4eSeschrock if (pvd->vdev_ops != &vdev_mirror_ops &&
156899653d4eSeschrock pvd->vdev_ops != &vdev_root_ops)
156999653d4eSeschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
157099653d4eSeschrock
157199653d4eSeschrock pvops = &vdev_mirror_ops;
157299653d4eSeschrock } else {
157399653d4eSeschrock /*
157499653d4eSeschrock * Active hot spares can only be replaced by inactive hot
157599653d4eSeschrock * spares.
157699653d4eSeschrock */ 157799653d4eSeschrock if (pvd->vdev_ops == &vdev_spare_ops && 157899653d4eSeschrock pvd->vdev_child[1] == oldvd && 157999653d4eSeschrock !spa_has_spare(spa, newvd->vdev_guid)) 158099653d4eSeschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 158199653d4eSeschrock 158299653d4eSeschrock /* 158399653d4eSeschrock * If the source is a hot spare, and the parent isn't already a 158499653d4eSeschrock * spare, then we want to create a new hot spare. Otherwise, we 158599653d4eSeschrock * want to create a replacing vdev. 158699653d4eSeschrock */ 158799653d4eSeschrock if (pvd->vdev_ops == &vdev_replacing_ops) 158899653d4eSeschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 158999653d4eSeschrock else if (pvd->vdev_ops != &vdev_spare_ops && 159099653d4eSeschrock newvd->vdev_isspare) 159199653d4eSeschrock pvops = &vdev_spare_ops; 159299653d4eSeschrock else 159399653d4eSeschrock pvops = &vdev_replacing_ops; 159499653d4eSeschrock } 159599653d4eSeschrock 15962a79c5feSlling /* 15972a79c5feSlling * Compare the new device size with the replaceable/attachable 15982a79c5feSlling * device size. 15992a79c5feSlling */ 16002a79c5feSlling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1601fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1602fa9e4066Sahrens 1603ecc2d604Sbonwick /* 1604ecc2d604Sbonwick * The new device cannot have a higher alignment requirement 1605ecc2d604Sbonwick * than the top-level vdev. 1606ecc2d604Sbonwick */ 1607ecc2d604Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1608fa9e4066Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1609fa9e4066Sahrens 1610fa9e4066Sahrens /* 1611fa9e4066Sahrens * If this is an in-place replacement, update oldvd's path and devid 1612fa9e4066Sahrens * to make it distinguishable from newvd, and unopenable from now on. 1613fa9e4066Sahrens */ 1614fa9e4066Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1615fa9e4066Sahrens spa_strfree(oldvd->vdev_path); 1616fa9e4066Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1617fa9e4066Sahrens KM_SLEEP); 1618fa9e4066Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1619fa9e4066Sahrens newvd->vdev_path, "old"); 1620fa9e4066Sahrens if (oldvd->vdev_devid != NULL) { 1621fa9e4066Sahrens spa_strfree(oldvd->vdev_devid); 1622fa9e4066Sahrens oldvd->vdev_devid = NULL; 1623fa9e4066Sahrens } 1624fa9e4066Sahrens } 1625fa9e4066Sahrens 1626fa9e4066Sahrens /* 162799653d4eSeschrock * If the parent is not a mirror, or if we're replacing, insert the new 162899653d4eSeschrock * mirror/replacing/spare vdev above oldvd. 1629fa9e4066Sahrens */ 1630fa9e4066Sahrens if (pvd->vdev_ops != pvops) 1631fa9e4066Sahrens pvd = vdev_add_parent(oldvd, pvops); 1632fa9e4066Sahrens 1633fa9e4066Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1634fa9e4066Sahrens ASSERT(pvd->vdev_ops == pvops); 1635fa9e4066Sahrens ASSERT(oldvd->vdev_parent == pvd); 1636fa9e4066Sahrens 1637fa9e4066Sahrens /* 1638fa9e4066Sahrens * Extract the new device from its root and add it to pvd. 1639fa9e4066Sahrens */ 1640fa9e4066Sahrens vdev_remove_child(newrootvd, newvd); 1641fa9e4066Sahrens newvd->vdev_id = pvd->vdev_children; 1642fa9e4066Sahrens vdev_add_child(pvd, newvd); 1643fa9e4066Sahrens 1644ea8dc4b6Seschrock /* 1645ea8dc4b6Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 1646ea8dc4b6Seschrock * the addition of newvd may have decreased our parent's asize. 
1647ea8dc4b6Seschrock */ 1648ea8dc4b6Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1649ea8dc4b6Seschrock 1650fa9e4066Sahrens tvd = newvd->vdev_top; 1651fa9e4066Sahrens ASSERT(pvd->vdev_top == tvd); 1652fa9e4066Sahrens ASSERT(tvd->vdev_parent == rvd); 1653fa9e4066Sahrens 1654fa9e4066Sahrens vdev_config_dirty(tvd); 1655fa9e4066Sahrens 1656fa9e4066Sahrens /* 1657fa9e4066Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1658fa9e4066Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1659fa9e4066Sahrens */ 1660fa9e4066Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1661fa9e4066Sahrens 1662fa9e4066Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1663fa9e4066Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1664fa9e4066Sahrens open_txg - TXG_INITIAL + 1); 1665fa9e4066Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1666fa9e4066Sahrens 1667ea8dc4b6Seschrock dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1668ea8dc4b6Seschrock 1669fa9e4066Sahrens /* 1670fa9e4066Sahrens * Mark newvd's DTL dirty in this txg. 1671fa9e4066Sahrens */ 1672ecc2d604Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 1673fa9e4066Sahrens 1674fa9e4066Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1675fa9e4066Sahrens 1676fa9e4066Sahrens /* 1677fa9e4066Sahrens * Kick off a resilver to update newvd. 1678fa9e4066Sahrens */ 1679fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1680fa9e4066Sahrens 1681fa9e4066Sahrens return (0); 1682fa9e4066Sahrens } 1683fa9e4066Sahrens 1684fa9e4066Sahrens /* 1685fa9e4066Sahrens * Detach a device from a mirror or replacing vdev. 1686fa9e4066Sahrens * If 'replace_done' is specified, only detach if the parent 1687fa9e4066Sahrens * is a replacing vdev. 1688fa9e4066Sahrens */ 1689fa9e4066Sahrens int 1690ea8dc4b6Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1691fa9e4066Sahrens { 1692fa9e4066Sahrens uint64_t txg; 1693fa9e4066Sahrens int c, t, error; 1694fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1695fa9e4066Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 169699653d4eSeschrock boolean_t unspare = B_FALSE; 169799653d4eSeschrock uint64_t unspare_guid; 1698fa9e4066Sahrens 1699fa9e4066Sahrens txg = spa_vdev_enter(spa); 1700fa9e4066Sahrens 1701ea8dc4b6Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1702fa9e4066Sahrens 1703fa9e4066Sahrens if (vd == NULL) 1704fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1705fa9e4066Sahrens 17060e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 17070e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 17080e34b6a7Sbonwick 1709fa9e4066Sahrens pvd = vd->vdev_parent; 1710fa9e4066Sahrens 1711fa9e4066Sahrens /* 1712fa9e4066Sahrens * If replace_done is specified, only remove this device if it's 171399653d4eSeschrock * the first child of a replacing vdev. For the 'spare' vdev, either 171499653d4eSeschrock * disk can be removed. 
171599653d4eSeschrock */ 171699653d4eSeschrock if (replace_done) { 171799653d4eSeschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 171899653d4eSeschrock if (vd->vdev_id != 0) 171999653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 172099653d4eSeschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 172199653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 172299653d4eSeschrock } 172399653d4eSeschrock } 172499653d4eSeschrock 172599653d4eSeschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 172699653d4eSeschrock spa_version(spa) >= ZFS_VERSION_SPARES); 1727fa9e4066Sahrens 1728fa9e4066Sahrens /* 172999653d4eSeschrock * Only mirror, replacing, and spare vdevs support detach. 1730fa9e4066Sahrens */ 1731fa9e4066Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 173299653d4eSeschrock pvd->vdev_ops != &vdev_mirror_ops && 173399653d4eSeschrock pvd->vdev_ops != &vdev_spare_ops) 1734fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1735fa9e4066Sahrens 1736fa9e4066Sahrens /* 1737fa9e4066Sahrens * If there's only one replica, you can't detach it. 1738fa9e4066Sahrens */ 1739fa9e4066Sahrens if (pvd->vdev_children <= 1) 1740fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1741fa9e4066Sahrens 1742fa9e4066Sahrens /* 1743fa9e4066Sahrens * If all siblings have non-empty DTLs, this device may have the only 1744fa9e4066Sahrens * valid copy of the data, which means we cannot safely detach it. 1745fa9e4066Sahrens * 1746fa9e4066Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1747fa9e4066Sahrens * precise DTL check. 1748fa9e4066Sahrens */ 1749fa9e4066Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1750fa9e4066Sahrens uint64_t dirty; 1751fa9e4066Sahrens 1752fa9e4066Sahrens cvd = pvd->vdev_child[c]; 1753fa9e4066Sahrens if (cvd == vd) 1754fa9e4066Sahrens continue; 1755fa9e4066Sahrens if (vdev_is_dead(cvd)) 1756fa9e4066Sahrens continue; 1757fa9e4066Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1758fa9e4066Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1759fa9e4066Sahrens cvd->vdev_dtl_scrub.sm_space; 1760fa9e4066Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1761fa9e4066Sahrens if (!dirty) 1762fa9e4066Sahrens break; 1763fa9e4066Sahrens } 176499653d4eSeschrock 176599653d4eSeschrock /* 176699653d4eSeschrock * If we are a replacing or spare vdev, then we can always detach the 176799653d4eSeschrock * latter child, as that is how one cancels the operation. 176899653d4eSeschrock */ 176999653d4eSeschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 177099653d4eSeschrock c == pvd->vdev_children) 1771fa9e4066Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1772fa9e4066Sahrens 177399653d4eSeschrock /* 177499653d4eSeschrock * If we are detaching the original disk from a spare, then it implies 177599653d4eSeschrock * that the spare should become a real disk, and be removed from the 177699653d4eSeschrock * active spare list for the pool. 177799653d4eSeschrock */ 177899653d4eSeschrock if (pvd->vdev_ops == &vdev_spare_ops && 177999653d4eSeschrock vd->vdev_id == 0) 178099653d4eSeschrock unspare = B_TRUE; 178199653d4eSeschrock 1782fa9e4066Sahrens /* 1783fa9e4066Sahrens * Erase the disk labels so the disk can be used for other things. 1784fa9e4066Sahrens * This must be done after all other error cases are handled, 1785fa9e4066Sahrens * but before we disembowel vd (so we can still do I/O to it). 
1786fa9e4066Sahrens * But if we can't do it, don't treat the error as fatal -- 1787fa9e4066Sahrens * it may be that the unwritability of the disk is the reason 1788fa9e4066Sahrens * it's being detached! 1789fa9e4066Sahrens */ 179099653d4eSeschrock error = vdev_label_init(vd, 0, B_FALSE); 1791fa9e4066Sahrens if (error) 1792fa9e4066Sahrens dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1793fa9e4066Sahrens 1794fa9e4066Sahrens /* 1795fa9e4066Sahrens * Remove vd from its parent and compact the parent's children. 1796fa9e4066Sahrens */ 1797fa9e4066Sahrens vdev_remove_child(pvd, vd); 1798fa9e4066Sahrens vdev_compact_children(pvd); 1799fa9e4066Sahrens 1800fa9e4066Sahrens /* 1801fa9e4066Sahrens * Remember one of the remaining children so we can get tvd below. 1802fa9e4066Sahrens */ 1803fa9e4066Sahrens cvd = pvd->vdev_child[0]; 1804fa9e4066Sahrens 180599653d4eSeschrock /* 180699653d4eSeschrock * If we need to remove the remaining child from the list of hot spares, 180799653d4eSeschrock * do it now, marking the vdev as no longer a spare in the process. We 180899653d4eSeschrock * must do this before vdev_remove_parent(), because that can change the 180999653d4eSeschrock * GUID if it creates a new toplevel GUID. 181099653d4eSeschrock */ 181199653d4eSeschrock if (unspare) { 181299653d4eSeschrock ASSERT(cvd->vdev_isspare); 181399653d4eSeschrock spa_spare_remove(cvd->vdev_guid); 181499653d4eSeschrock cvd->vdev_isspare = B_FALSE; 181599653d4eSeschrock unspare_guid = cvd->vdev_guid; 181699653d4eSeschrock } 181799653d4eSeschrock 1818fa9e4066Sahrens /* 1819fa9e4066Sahrens * If the parent mirror/replacing vdev only has one child, 1820fa9e4066Sahrens * the parent is no longer needed. Remove it from the tree. 1821fa9e4066Sahrens */ 1822fa9e4066Sahrens if (pvd->vdev_children == 1) 1823fa9e4066Sahrens vdev_remove_parent(cvd); 1824fa9e4066Sahrens 1825fa9e4066Sahrens /* 1826fa9e4066Sahrens * We don't set tvd until now because the parent we just removed 1827fa9e4066Sahrens * may have been the previous top-level vdev. 1828fa9e4066Sahrens */ 1829fa9e4066Sahrens tvd = cvd->vdev_top; 1830fa9e4066Sahrens ASSERT(tvd->vdev_parent == rvd); 1831fa9e4066Sahrens 1832fa9e4066Sahrens /* 1833fa9e4066Sahrens * Reopen this top-level vdev to reassess health after detach. 1834fa9e4066Sahrens */ 1835ea8dc4b6Seschrock vdev_reopen(tvd); 1836fa9e4066Sahrens 1837fa9e4066Sahrens /* 1838fa9e4066Sahrens * If the device we just detached was smaller than the others, 1839ecc2d604Sbonwick * it may be possible to add metaslabs (i.e. grow the pool). 1840ecc2d604Sbonwick * vdev_metaslab_init() can't fail because the existing metaslabs 1841ecc2d604Sbonwick * are already in core, so there's nothing to read from disk. 1842fa9e4066Sahrens */ 1843ecc2d604Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1844fa9e4066Sahrens 1845fa9e4066Sahrens vdev_config_dirty(tvd); 1846fa9e4066Sahrens 1847fa9e4066Sahrens /* 1848fa9e4066Sahrens * Mark vd's DTL as dirty in this txg. 1849fa9e4066Sahrens * vdev_dtl_sync() will see that vd->vdev_detached is set 1850fa9e4066Sahrens * and free vd's DTL object in syncing context. 1851fa9e4066Sahrens * But first make sure we're not on any *other* txg's DTL list, 1852fa9e4066Sahrens * to prevent vd from being accessed after it's freed. 
1853fa9e4066Sahrens */ 1854fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) 1855fa9e4066Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1856ecc2d604Sbonwick vd->vdev_detached = B_TRUE; 1857ecc2d604Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 1858fa9e4066Sahrens 1859ea8dc4b6Seschrock dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1860fa9e4066Sahrens 186199653d4eSeschrock error = spa_vdev_exit(spa, vd, txg, 0); 186299653d4eSeschrock 186399653d4eSeschrock /* 186499653d4eSeschrock * If we are supposed to remove the given vdev from the list of spares, 186599653d4eSeschrock * iterate over all pools in the system and replace it if it's present. 186699653d4eSeschrock */ 186799653d4eSeschrock if (unspare) { 186899653d4eSeschrock spa = NULL; 186999653d4eSeschrock mutex_enter(&spa_namespace_lock); 187099653d4eSeschrock while ((spa = spa_next(spa)) != NULL) { 187199653d4eSeschrock if (spa->spa_state != POOL_STATE_ACTIVE) 187299653d4eSeschrock continue; 187399653d4eSeschrock 187499653d4eSeschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 187599653d4eSeschrock } 187699653d4eSeschrock mutex_exit(&spa_namespace_lock); 187799653d4eSeschrock } 187899653d4eSeschrock 187999653d4eSeschrock return (error); 188099653d4eSeschrock } 188199653d4eSeschrock 188299653d4eSeschrock /* 188399653d4eSeschrock * Remove a device from the pool. Currently, this supports removing only hot 188499653d4eSeschrock * spares. 188599653d4eSeschrock */ 188699653d4eSeschrock int 188799653d4eSeschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 188899653d4eSeschrock { 188999653d4eSeschrock vdev_t *vd; 189099653d4eSeschrock nvlist_t **spares, *nv, **newspares; 189199653d4eSeschrock uint_t i, j, nspares; 189299653d4eSeschrock int ret = 0; 189399653d4eSeschrock 189499653d4eSeschrock spa_config_enter(spa, RW_WRITER, FTAG); 189599653d4eSeschrock 189699653d4eSeschrock vd = spa_lookup_by_guid(spa, guid); 189799653d4eSeschrock 189899653d4eSeschrock nv = NULL; 189999653d4eSeschrock if (spa->spa_spares != NULL && 190099653d4eSeschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 190199653d4eSeschrock &spares, &nspares) == 0) { 190299653d4eSeschrock for (i = 0; i < nspares; i++) { 190399653d4eSeschrock uint64_t theguid; 190499653d4eSeschrock 190599653d4eSeschrock VERIFY(nvlist_lookup_uint64(spares[i], 190699653d4eSeschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 190799653d4eSeschrock if (theguid == guid) { 190899653d4eSeschrock nv = spares[i]; 190999653d4eSeschrock break; 191099653d4eSeschrock } 191199653d4eSeschrock } 191299653d4eSeschrock } 191399653d4eSeschrock 191499653d4eSeschrock /* 191599653d4eSeschrock * We only support removing a hot spare, and only if it's not currently 191699653d4eSeschrock * in use in this pool. 
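 *
 * The checks below encode that policy: a guid that matches neither the
 * spare list nor an in-core vdev is ENOENT; a guid that names a regular
 * (non-spare) vdev is ENOTSUP; and a spare that is currently attached to
 * this pool returns EBUSY unless 'unspare' is set.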
191799653d4eSeschrock */ 191899653d4eSeschrock if (nv == NULL && vd == NULL) { 191999653d4eSeschrock ret = ENOENT; 192099653d4eSeschrock goto out; 192199653d4eSeschrock } 192299653d4eSeschrock 192399653d4eSeschrock if (nv == NULL && vd != NULL) { 192499653d4eSeschrock ret = ENOTSUP; 192599653d4eSeschrock goto out; 192699653d4eSeschrock } 192799653d4eSeschrock 192899653d4eSeschrock if (!unspare && nv != NULL && vd != NULL) { 192999653d4eSeschrock ret = EBUSY; 193099653d4eSeschrock goto out; 193199653d4eSeschrock } 193299653d4eSeschrock 193399653d4eSeschrock if (nspares == 1) { 193499653d4eSeschrock newspares = NULL; 193599653d4eSeschrock } else { 193699653d4eSeschrock newspares = kmem_alloc((nspares - 1) * sizeof (void *), 193799653d4eSeschrock KM_SLEEP); 193899653d4eSeschrock for (i = 0, j = 0; i < nspares; i++) { 193999653d4eSeschrock if (spares[i] != nv) 194099653d4eSeschrock VERIFY(nvlist_dup(spares[i], 194199653d4eSeschrock &newspares[j++], KM_SLEEP) == 0); 194299653d4eSeschrock } 194399653d4eSeschrock } 194499653d4eSeschrock 194599653d4eSeschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 194699653d4eSeschrock DATA_TYPE_NVLIST_ARRAY) == 0); 194799653d4eSeschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 194899653d4eSeschrock newspares, nspares - 1) == 0); 194999653d4eSeschrock for (i = 0; i < nspares - 1; i++) 195099653d4eSeschrock nvlist_free(newspares[i]); 195199653d4eSeschrock kmem_free(newspares, (nspares - 1) * sizeof (void *)); 195299653d4eSeschrock spa_load_spares(spa); 195399653d4eSeschrock spa->spa_sync_spares = B_TRUE; 195499653d4eSeschrock 195599653d4eSeschrock out: 195699653d4eSeschrock spa_config_exit(spa, FTAG); 195799653d4eSeschrock 195899653d4eSeschrock return (ret); 1959fa9e4066Sahrens } 1960fa9e4066Sahrens 1961fa9e4066Sahrens /* 1962ea8dc4b6Seschrock * Find any device that's done replacing, so we can detach it. 
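 *
 * A 'replacing' vdev with two children is considered done once the new
 * child (child 1) has both an empty DTL and an empty scrub DTL; in that
 * case the old child (child 0) is returned so the caller can detach it.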
1963fa9e4066Sahrens */ 1964ea8dc4b6Seschrock static vdev_t * 1965ea8dc4b6Seschrock spa_vdev_replace_done_hunt(vdev_t *vd) 1966fa9e4066Sahrens { 1967ea8dc4b6Seschrock vdev_t *newvd, *oldvd; 1968fa9e4066Sahrens int c; 1969fa9e4066Sahrens 1970ea8dc4b6Seschrock for (c = 0; c < vd->vdev_children; c++) { 1971ea8dc4b6Seschrock oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1972ea8dc4b6Seschrock if (oldvd != NULL) 1973ea8dc4b6Seschrock return (oldvd); 1974ea8dc4b6Seschrock } 1975fa9e4066Sahrens 1976fa9e4066Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1977ea8dc4b6Seschrock oldvd = vd->vdev_child[0]; 1978ea8dc4b6Seschrock newvd = vd->vdev_child[1]; 1979ea8dc4b6Seschrock 1980ea8dc4b6Seschrock mutex_enter(&newvd->vdev_dtl_lock); 1981ea8dc4b6Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 1982ea8dc4b6Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 1983ea8dc4b6Seschrock mutex_exit(&newvd->vdev_dtl_lock); 1984ea8dc4b6Seschrock return (oldvd); 1985fa9e4066Sahrens } 1986ea8dc4b6Seschrock mutex_exit(&newvd->vdev_dtl_lock); 1987fa9e4066Sahrens } 1988ea8dc4b6Seschrock 1989ea8dc4b6Seschrock return (NULL); 1990fa9e4066Sahrens } 1991fa9e4066Sahrens 1992ea8dc4b6Seschrock static void 1993fa9e4066Sahrens spa_vdev_replace_done(spa_t *spa) 1994fa9e4066Sahrens { 1995ea8dc4b6Seschrock vdev_t *vd; 199699653d4eSeschrock vdev_t *pvd; 1997ea8dc4b6Seschrock uint64_t guid; 199899653d4eSeschrock uint64_t pguid = 0; 1999ea8dc4b6Seschrock 2000ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 2001ea8dc4b6Seschrock 2002ea8dc4b6Seschrock while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2003ea8dc4b6Seschrock guid = vd->vdev_guid; 200499653d4eSeschrock /* 200599653d4eSeschrock * If we have just finished replacing a hot spared device, then 200699653d4eSeschrock * we need to detach the parent's first child (the original hot 200799653d4eSeschrock * spare) as well. 200899653d4eSeschrock */ 200999653d4eSeschrock pvd = vd->vdev_parent; 201099653d4eSeschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 201199653d4eSeschrock pvd->vdev_id == 0) { 201299653d4eSeschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 201399653d4eSeschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 201499653d4eSeschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 201599653d4eSeschrock } 2016ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2017ea8dc4b6Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2018ea8dc4b6Seschrock return; 201999653d4eSeschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 202099653d4eSeschrock return; 2021ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 2022fa9e4066Sahrens } 2023fa9e4066Sahrens 2024ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2025fa9e4066Sahrens } 2026fa9e4066Sahrens 2027c67d9675Seschrock /* 2028c67d9675Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 2029c67d9675Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 
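 *
 * If the guid does not name a vdev in the tree but does match an entry in
 * the spare list, only the path recorded in that spare nvlist is updated.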
2030c67d9675Seschrock */ 2031c67d9675Seschrock int 2032c67d9675Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2033c67d9675Seschrock { 2034c67d9675Seschrock vdev_t *rvd, *vd; 2035c67d9675Seschrock uint64_t txg; 2036c67d9675Seschrock 2037c67d9675Seschrock rvd = spa->spa_root_vdev; 2038c67d9675Seschrock 2039c67d9675Seschrock txg = spa_vdev_enter(spa); 2040c67d9675Seschrock 204199653d4eSeschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 204299653d4eSeschrock /* 204399653d4eSeschrock * Determine if this is a reference to a hot spare. In that 204499653d4eSeschrock * case, update the path as stored in the spare list. 204599653d4eSeschrock */ 204699653d4eSeschrock nvlist_t **spares; 204799653d4eSeschrock uint_t i, nspares; 204899653d4eSeschrock if (spa->spa_sparelist != NULL) { 204999653d4eSeschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 205099653d4eSeschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 205199653d4eSeschrock for (i = 0; i < nspares; i++) { 205299653d4eSeschrock uint64_t theguid; 205399653d4eSeschrock VERIFY(nvlist_lookup_uint64(spares[i], 205499653d4eSeschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 205599653d4eSeschrock if (theguid == guid) 205699653d4eSeschrock break; 205799653d4eSeschrock } 205899653d4eSeschrock 205999653d4eSeschrock if (i == nspares) 206099653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 206199653d4eSeschrock 206299653d4eSeschrock VERIFY(nvlist_add_string(spares[i], 206399653d4eSeschrock ZPOOL_CONFIG_PATH, newpath) == 0); 206499653d4eSeschrock spa_load_spares(spa); 206599653d4eSeschrock spa->spa_sync_spares = B_TRUE; 206699653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 206799653d4eSeschrock } else { 206899653d4eSeschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 206999653d4eSeschrock } 207099653d4eSeschrock } 2071c67d9675Seschrock 20720e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 20730e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 20740e34b6a7Sbonwick 2075c67d9675Seschrock spa_strfree(vd->vdev_path); 2076c67d9675Seschrock vd->vdev_path = spa_strdup(newpath); 2077c67d9675Seschrock 2078c67d9675Seschrock vdev_config_dirty(vd->vdev_top); 2079c67d9675Seschrock 2080c67d9675Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 2081c67d9675Seschrock } 2082c67d9675Seschrock 2083fa9e4066Sahrens /* 2084fa9e4066Sahrens * ========================================================================== 2085fa9e4066Sahrens * SPA Scrubbing 2086fa9e4066Sahrens * ========================================================================== 2087fa9e4066Sahrens */ 2088fa9e4066Sahrens 2089ea8dc4b6Seschrock void 2090ea8dc4b6Seschrock spa_scrub_throttle(spa_t *spa, int direction) 2091ea8dc4b6Seschrock { 2092ea8dc4b6Seschrock mutex_enter(&spa->spa_scrub_lock); 2093ea8dc4b6Seschrock spa->spa_scrub_throttled += direction; 2094ea8dc4b6Seschrock ASSERT(spa->spa_scrub_throttled >= 0); 2095ea8dc4b6Seschrock if (spa->spa_scrub_throttled == 0) 2096ea8dc4b6Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 2097ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2098ea8dc4b6Seschrock } 2099fa9e4066Sahrens 2100fa9e4066Sahrens static void 2101fa9e4066Sahrens spa_scrub_io_done(zio_t *zio) 2102fa9e4066Sahrens { 2103fa9e4066Sahrens spa_t *spa = zio->io_spa; 2104fa9e4066Sahrens 2105fa9e4066Sahrens zio_buf_free(zio->io_data, zio->io_size); 2106fa9e4066Sahrens 2107fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2108ea8dc4b6Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 
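		/*
		 * A real (non-speculative) read error: count it against the
		 * pool-wide scrub error total and against the vdev that
		 * issued the I/O, falling back to the root vdev when the
		 * faulting vdev is unknown.
		 */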
210944cd46caSbillm vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2110ea8dc4b6Seschrock spa->spa_scrub_errors++; 2111fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 2112fa9e4066Sahrens vd->vdev_stat.vs_scrub_errors++; 2113fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 2114fa9e4066Sahrens } 2115ea8dc4b6Seschrock if (--spa->spa_scrub_inflight == 0) { 2116ea8dc4b6Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 2117ea8dc4b6Seschrock ASSERT(spa->spa_scrub_throttled == 0); 2118ea8dc4b6Seschrock } 2119ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2120fa9e4066Sahrens } 2121fa9e4066Sahrens 2122fa9e4066Sahrens static void 2123ea8dc4b6Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2124ea8dc4b6Seschrock zbookmark_t *zb) 2125fa9e4066Sahrens { 2126fa9e4066Sahrens size_t size = BP_GET_LSIZE(bp); 2127fa9e4066Sahrens void *data = zio_buf_alloc(size); 2128fa9e4066Sahrens 2129fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2130fa9e4066Sahrens spa->spa_scrub_inflight++; 2131fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2132fa9e4066Sahrens 2133ea8dc4b6Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2134ea8dc4b6Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2135ea8dc4b6Seschrock 2136d80c45e0Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2137ea8dc4b6Seschrock 2138fa9e4066Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 2139ea8dc4b6Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2140fa9e4066Sahrens } 2141fa9e4066Sahrens 2142fa9e4066Sahrens /* ARGSUSED */ 2143fa9e4066Sahrens static int 2144fa9e4066Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2145fa9e4066Sahrens { 2146fa9e4066Sahrens blkptr_t *bp = &bc->bc_blkptr; 214744cd46caSbillm vdev_t *vd = spa->spa_root_vdev; 214844cd46caSbillm dva_t *dva = bp->blk_dva; 214944cd46caSbillm int needs_resilver = B_FALSE; 215044cd46caSbillm int d; 2151fa9e4066Sahrens 215244cd46caSbillm if (bc->bc_errno) { 2153fa9e4066Sahrens /* 2154fa9e4066Sahrens * We can't scrub this block, but we can continue to scrub 2155fa9e4066Sahrens * the rest of the pool. Note the error and move along. 2156fa9e4066Sahrens */ 2157fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2158fa9e4066Sahrens spa->spa_scrub_errors++; 2159fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2160fa9e4066Sahrens 216144cd46caSbillm mutex_enter(&vd->vdev_stat_lock); 216244cd46caSbillm vd->vdev_stat.vs_scrub_errors++; 216344cd46caSbillm mutex_exit(&vd->vdev_stat_lock); 2164fa9e4066Sahrens 2165fa9e4066Sahrens return (ERESTART); 2166fa9e4066Sahrens } 2167fa9e4066Sahrens 2168fa9e4066Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2169fa9e4066Sahrens 217044cd46caSbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 217144cd46caSbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2172fa9e4066Sahrens 217344cd46caSbillm ASSERT(vd != NULL); 217444cd46caSbillm 217544cd46caSbillm /* 217644cd46caSbillm * Keep track of how much data we've examined so that 217744cd46caSbillm * zpool(1M) status can make useful progress reports. 
217844cd46caSbillm */ 217944cd46caSbillm mutex_enter(&vd->vdev_stat_lock); 218044cd46caSbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 218144cd46caSbillm mutex_exit(&vd->vdev_stat_lock); 218244cd46caSbillm 218344cd46caSbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 218444cd46caSbillm if (DVA_GET_GANG(&dva[d])) { 218544cd46caSbillm /* 218644cd46caSbillm * Gang members may be spread across multiple 218744cd46caSbillm * vdevs, so the best we can do is look at the 218844cd46caSbillm * pool-wide DTL. 218944cd46caSbillm * XXX -- it would be better to change our 219044cd46caSbillm * allocation policy to ensure that this can't 219144cd46caSbillm * happen. 219244cd46caSbillm */ 219344cd46caSbillm vd = spa->spa_root_vdev; 219444cd46caSbillm } 219544cd46caSbillm if (vdev_dtl_contains(&vd->vdev_dtl_map, 219644cd46caSbillm bp->blk_birth, 1)) 219744cd46caSbillm needs_resilver = B_TRUE; 2198fa9e4066Sahrens } 219944cd46caSbillm } 220044cd46caSbillm 220144cd46caSbillm if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2202fa9e4066Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2203ea8dc4b6Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 220444cd46caSbillm else if (needs_resilver) 220544cd46caSbillm spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 220644cd46caSbillm ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2207fa9e4066Sahrens 2208fa9e4066Sahrens return (0); 2209fa9e4066Sahrens } 2210fa9e4066Sahrens 2211fa9e4066Sahrens static void 2212fa9e4066Sahrens spa_scrub_thread(spa_t *spa) 2213fa9e4066Sahrens { 2214fa9e4066Sahrens callb_cpr_t cprinfo; 2215fa9e4066Sahrens traverse_handle_t *th = spa->spa_scrub_th; 2216fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 2217fa9e4066Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2218fa9e4066Sahrens int error = 0; 2219fa9e4066Sahrens boolean_t complete; 2220fa9e4066Sahrens 2221fa9e4066Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2222fa9e4066Sahrens 2223f0aa80d4Sbonwick /* 2224f0aa80d4Sbonwick * If we're restarting due to a snapshot create/delete, 2225f0aa80d4Sbonwick * wait for that to complete. 2226f0aa80d4Sbonwick */ 2227f0aa80d4Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 2228f0aa80d4Sbonwick 2229ea8dc4b6Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2230ea8dc4b6Seschrock scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2231ea8dc4b6Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2232ea8dc4b6Seschrock 2233ea8dc4b6Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 2234ea8dc4b6Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 2235fa9e4066Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 2236fa9e4066Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2237ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2238fa9e4066Sahrens 2239fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2240fa9e4066Sahrens spa->spa_scrub_errors = 0; 2241fa9e4066Sahrens spa->spa_scrub_active = 1; 2242ea8dc4b6Seschrock ASSERT(spa->spa_scrub_inflight == 0); 2243ea8dc4b6Seschrock ASSERT(spa->spa_scrub_throttled == 0); 2244fa9e4066Sahrens 2245fa9e4066Sahrens while (!spa->spa_scrub_stop) { 2246fa9e4066Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 2247ea8dc4b6Seschrock while (spa->spa_scrub_suspended) { 2248fa9e4066Sahrens spa->spa_scrub_active = 0; 2249fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 2250fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2251fa9e4066Sahrens spa->spa_scrub_active = 1; 2252fa9e4066Sahrens } 2253fa9e4066Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2254fa9e4066Sahrens 2255fa9e4066Sahrens if (spa->spa_scrub_restart_txg != 0) 2256fa9e4066Sahrens break; 2257fa9e4066Sahrens 2258fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2259fa9e4066Sahrens error = traverse_more(th); 2260fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2261fa9e4066Sahrens if (error != EAGAIN) 2262fa9e4066Sahrens break; 2263ea8dc4b6Seschrock 2264ea8dc4b6Seschrock while (spa->spa_scrub_throttled > 0) 2265ea8dc4b6Seschrock cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2266fa9e4066Sahrens } 2267fa9e4066Sahrens 2268fa9e4066Sahrens while (spa->spa_scrub_inflight) 2269fa9e4066Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2270fa9e4066Sahrens 22715dabedeeSbonwick spa->spa_scrub_active = 0; 22725dabedeeSbonwick cv_broadcast(&spa->spa_scrub_cv); 22735dabedeeSbonwick 22745dabedeeSbonwick mutex_exit(&spa->spa_scrub_lock); 22755dabedeeSbonwick 22765dabedeeSbonwick spa_config_enter(spa, RW_WRITER, FTAG); 22775dabedeeSbonwick 22785dabedeeSbonwick mutex_enter(&spa->spa_scrub_lock); 22795dabedeeSbonwick 22805dabedeeSbonwick /* 22815dabedeeSbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 22825dabedeeSbonwick * AND the spa config lock to synchronize with any config changes 22835dabedeeSbonwick * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 22845dabedeeSbonwick */ 2285fa9e4066Sahrens if (spa->spa_scrub_restart_txg != 0) 2286fa9e4066Sahrens error = ERESTART; 2287fa9e4066Sahrens 2288ea8dc4b6Seschrock if (spa->spa_scrub_stop) 2289ea8dc4b6Seschrock error = EINTR; 2290ea8dc4b6Seschrock 2291fa9e4066Sahrens /* 2292ea8dc4b6Seschrock * Even if there were uncorrectable errors, we consider the scrub 2293ea8dc4b6Seschrock * completed. The downside is that if there is a transient error during 2294ea8dc4b6Seschrock * a resilver, we won't resilver the data properly to the target. But 2295ea8dc4b6Seschrock * if the damage is permanent (more likely) we will resilver forever, 2296ea8dc4b6Seschrock * which isn't really acceptable. Since there is enough information for 2297ea8dc4b6Seschrock * the user to know what has failed and why, this seems like a more 2298ea8dc4b6Seschrock * tractable approach. 
2299fa9e4066Sahrens */ 2300ea8dc4b6Seschrock complete = (error == 0); 2301fa9e4066Sahrens 2302ea8dc4b6Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2303ea8dc4b6Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2304fa9e4066Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2305fa9e4066Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2306fa9e4066Sahrens 2307fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2308fa9e4066Sahrens 2309fa9e4066Sahrens /* 2310fa9e4066Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 2311fa9e4066Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 2312fa9e4066Sahrens */ 2313fa9e4066Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2314fa9e4066Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2315fa9e4066Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2316ea8dc4b6Seschrock spa_errlog_rotate(spa); 23175dabedeeSbonwick 2318ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2319fa9e4066Sahrens 2320fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2321fa9e4066Sahrens 2322ea8dc4b6Seschrock /* 2323ea8dc4b6Seschrock * We may have finished replacing a device. 2324ea8dc4b6Seschrock * Let the async thread assess this and handle the detach. 2325ea8dc4b6Seschrock */ 2326ea8dc4b6Seschrock spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2327fa9e4066Sahrens 2328fa9e4066Sahrens /* 2329fa9e4066Sahrens * If we were told to restart, our final act is to start a new scrub. 2330fa9e4066Sahrens */ 2331fa9e4066Sahrens if (error == ERESTART) 2332ea8dc4b6Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 2333ea8dc4b6Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2334fa9e4066Sahrens 2335ea8dc4b6Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 2336ea8dc4b6Seschrock spa->spa_scrub_active = 0; 2337ea8dc4b6Seschrock spa->spa_scrub_thread = NULL; 2338ea8dc4b6Seschrock cv_broadcast(&spa->spa_scrub_cv); 2339fa9e4066Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2340fa9e4066Sahrens thread_exit(); 2341fa9e4066Sahrens } 2342fa9e4066Sahrens 2343fa9e4066Sahrens void 2344fa9e4066Sahrens spa_scrub_suspend(spa_t *spa) 2345fa9e4066Sahrens { 2346fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2347ea8dc4b6Seschrock spa->spa_scrub_suspended++; 2348fa9e4066Sahrens while (spa->spa_scrub_active) { 2349fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 2350fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2351fa9e4066Sahrens } 2352fa9e4066Sahrens while (spa->spa_scrub_inflight) 2353fa9e4066Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2354fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2355fa9e4066Sahrens } 2356fa9e4066Sahrens 2357fa9e4066Sahrens void 2358fa9e4066Sahrens spa_scrub_resume(spa_t *spa) 2359fa9e4066Sahrens { 2360fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2361ea8dc4b6Seschrock ASSERT(spa->spa_scrub_suspended != 0); 2362ea8dc4b6Seschrock if (--spa->spa_scrub_suspended == 0) 2363fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 2364fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2365fa9e4066Sahrens } 2366fa9e4066Sahrens 2367fa9e4066Sahrens void 2368fa9e4066Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 2369fa9e4066Sahrens { 2370fa9e4066Sahrens /* 2371fa9e4066Sahrens * Something happened (e.g. snapshot create/delete) that means 2372fa9e4066Sahrens * we must restart any in-progress scrubs. The itinerary will 2373fa9e4066Sahrens * fix this properly. 
2374fa9e4066Sahrens */ 2375fa9e4066Sahrens mutex_enter(&spa->spa_scrub_lock); 2376fa9e4066Sahrens spa->spa_scrub_restart_txg = txg; 2377fa9e4066Sahrens mutex_exit(&spa->spa_scrub_lock); 2378fa9e4066Sahrens } 2379fa9e4066Sahrens 2380ea8dc4b6Seschrock int 2381ea8dc4b6Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2382fa9e4066Sahrens { 2383fa9e4066Sahrens space_seg_t *ss; 2384fa9e4066Sahrens uint64_t mintxg, maxtxg; 2385fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 2386fa9e4066Sahrens 2387fa9e4066Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 2388fa9e4066Sahrens return (ENOTSUP); 2389fa9e4066Sahrens 2390ea8dc4b6Seschrock mutex_enter(&spa->spa_scrub_lock); 2391ea8dc4b6Seschrock 2392fa9e4066Sahrens /* 2393fa9e4066Sahrens * If there's a scrub or resilver already in progress, stop it. 2394fa9e4066Sahrens */ 2395fa9e4066Sahrens while (spa->spa_scrub_thread != NULL) { 2396fa9e4066Sahrens /* 2397fa9e4066Sahrens * Don't stop a resilver unless forced. 2398fa9e4066Sahrens */ 2399ea8dc4b6Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2400ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2401fa9e4066Sahrens return (EBUSY); 2402ea8dc4b6Seschrock } 2403fa9e4066Sahrens spa->spa_scrub_stop = 1; 2404fa9e4066Sahrens cv_broadcast(&spa->spa_scrub_cv); 2405fa9e4066Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2406fa9e4066Sahrens } 2407fa9e4066Sahrens 2408fa9e4066Sahrens /* 2409fa9e4066Sahrens * Terminate the previous traverse. 2410fa9e4066Sahrens */ 2411fa9e4066Sahrens if (spa->spa_scrub_th != NULL) { 2412fa9e4066Sahrens traverse_fini(spa->spa_scrub_th); 2413fa9e4066Sahrens spa->spa_scrub_th = NULL; 2414fa9e4066Sahrens } 2415fa9e4066Sahrens 2416ea8dc4b6Seschrock if (rvd == NULL) { 2417ea8dc4b6Seschrock ASSERT(spa->spa_scrub_stop == 0); 2418ea8dc4b6Seschrock ASSERT(spa->spa_scrub_type == type); 2419ea8dc4b6Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 2420ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2421ea8dc4b6Seschrock return (0); 2422ea8dc4b6Seschrock } 2423fa9e4066Sahrens 2424fa9e4066Sahrens mintxg = TXG_INITIAL - 1; 2425fa9e4066Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 2426fa9e4066Sahrens 2427ea8dc4b6Seschrock mutex_enter(&rvd->vdev_dtl_lock); 2428fa9e4066Sahrens 2429ea8dc4b6Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 2430ea8dc4b6Seschrock /* 2431ea8dc4b6Seschrock * The pool-wide DTL is empty. 2432ecc2d604Sbonwick * If this is a resilver, there's nothing to do except 2433ecc2d604Sbonwick * check whether any in-progress replacements have completed. 2434ea8dc4b6Seschrock */ 2435ecc2d604Sbonwick if (type == POOL_SCRUB_RESILVER) { 2436ea8dc4b6Seschrock type = POOL_SCRUB_NONE; 2437ecc2d604Sbonwick spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2438ecc2d604Sbonwick } 2439ea8dc4b6Seschrock } else { 2440ea8dc4b6Seschrock /* 2441ea8dc4b6Seschrock * The pool-wide DTL is non-empty. 2442ea8dc4b6Seschrock * If this is a normal scrub, upgrade to a resilver instead. 2443ea8dc4b6Seschrock */ 2444ea8dc4b6Seschrock if (type == POOL_SCRUB_EVERYTHING) 2445ea8dc4b6Seschrock type = POOL_SCRUB_RESILVER; 2446ea8dc4b6Seschrock } 2447fa9e4066Sahrens 2448ea8dc4b6Seschrock if (type == POOL_SCRUB_RESILVER) { 2449fa9e4066Sahrens /* 2450fa9e4066Sahrens * Determine the resilvering boundaries. 2451fa9e4066Sahrens * 2452fa9e4066Sahrens * Note: (mintxg, maxtxg) is an open interval, 2453fa9e4066Sahrens * i.e. mintxg and maxtxg themselves are not included. 
2454fa9e4066Sahrens * 2455fa9e4066Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2456fa9e4066Sahrens * so we don't claim to resilver a txg that's still changing. 2457fa9e4066Sahrens */ 2458fa9e4066Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2459ea8dc4b6Seschrock mintxg = ss->ss_start - 1; 2460fa9e4066Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2461ea8dc4b6Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 2462fa9e4066Sahrens } 2463fa9e4066Sahrens 2464ea8dc4b6Seschrock mutex_exit(&rvd->vdev_dtl_lock); 2465ea8dc4b6Seschrock 2466ea8dc4b6Seschrock spa->spa_scrub_stop = 0; 2467ea8dc4b6Seschrock spa->spa_scrub_type = type; 2468ea8dc4b6Seschrock spa->spa_scrub_restart_txg = 0; 2469ea8dc4b6Seschrock 2470ea8dc4b6Seschrock if (type != POOL_SCRUB_NONE) { 2471ea8dc4b6Seschrock spa->spa_scrub_mintxg = mintxg; 2472fa9e4066Sahrens spa->spa_scrub_maxtxg = maxtxg; 2473fa9e4066Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 24740373e76bSbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 24750373e76bSbonwick ZIO_FLAG_CANFAIL); 2476fa9e4066Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2477fa9e4066Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 2478fa9e4066Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2479fa9e4066Sahrens } 2480fa9e4066Sahrens 2481ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 2482ea8dc4b6Seschrock 2483fa9e4066Sahrens return (0); 2484fa9e4066Sahrens } 2485fa9e4066Sahrens 2486ea8dc4b6Seschrock /* 2487ea8dc4b6Seschrock * ========================================================================== 2488ea8dc4b6Seschrock * SPA async task processing 2489ea8dc4b6Seschrock * ========================================================================== 2490ea8dc4b6Seschrock */ 2491ea8dc4b6Seschrock 2492ea8dc4b6Seschrock static void 2493ea8dc4b6Seschrock spa_async_reopen(spa_t *spa) 2494fa9e4066Sahrens { 2495ea8dc4b6Seschrock vdev_t *rvd = spa->spa_root_vdev; 2496ea8dc4b6Seschrock vdev_t *tvd; 2497ea8dc4b6Seschrock int c; 2498fa9e4066Sahrens 2499ea8dc4b6Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 2500ea8dc4b6Seschrock 2501ea8dc4b6Seschrock for (c = 0; c < rvd->vdev_children; c++) { 2502ea8dc4b6Seschrock tvd = rvd->vdev_child[c]; 2503ea8dc4b6Seschrock if (tvd->vdev_reopen_wanted) { 2504ea8dc4b6Seschrock tvd->vdev_reopen_wanted = 0; 2505ea8dc4b6Seschrock vdev_reopen(tvd); 2506ea8dc4b6Seschrock } 2507ea8dc4b6Seschrock } 2508ea8dc4b6Seschrock 2509ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2510ea8dc4b6Seschrock } 2511fa9e4066Sahrens 2512ea8dc4b6Seschrock static void 2513ea8dc4b6Seschrock spa_async_thread(spa_t *spa) 2514ea8dc4b6Seschrock { 2515ea8dc4b6Seschrock int tasks; 2516ea8dc4b6Seschrock 2517ea8dc4b6Seschrock ASSERT(spa->spa_sync_on); 2518ea8dc4b6Seschrock 2519ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2520ea8dc4b6Seschrock tasks = spa->spa_async_tasks; 2521ea8dc4b6Seschrock spa->spa_async_tasks = 0; 2522ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2523ea8dc4b6Seschrock 25240373e76bSbonwick /* 25250373e76bSbonwick * See if the config needs to be updated. 25260373e76bSbonwick */ 25270373e76bSbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 25280373e76bSbonwick mutex_enter(&spa_namespace_lock); 25290373e76bSbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 25300373e76bSbonwick mutex_exit(&spa_namespace_lock); 25310373e76bSbonwick } 25320373e76bSbonwick 2533ea8dc4b6Seschrock /* 2534ea8dc4b6Seschrock * See if any devices need to be reopened. 
2535ea8dc4b6Seschrock */ 2536ea8dc4b6Seschrock if (tasks & SPA_ASYNC_REOPEN) 2537ea8dc4b6Seschrock spa_async_reopen(spa); 2538ea8dc4b6Seschrock 2539ea8dc4b6Seschrock /* 2540ea8dc4b6Seschrock * If any devices are done replacing, detach them. 2541ea8dc4b6Seschrock */ 2542ea8dc4b6Seschrock if (tasks & SPA_ASYNC_REPLACE_DONE) 2543fa9e4066Sahrens spa_vdev_replace_done(spa); 2544fa9e4066Sahrens 2545ea8dc4b6Seschrock /* 2546ea8dc4b6Seschrock * Kick off a scrub. 2547ea8dc4b6Seschrock */ 2548ea8dc4b6Seschrock if (tasks & SPA_ASYNC_SCRUB) 2549ea8dc4b6Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2550ea8dc4b6Seschrock 2551ea8dc4b6Seschrock /* 2552ea8dc4b6Seschrock * Kick off a resilver. 2553ea8dc4b6Seschrock */ 2554ea8dc4b6Seschrock if (tasks & SPA_ASYNC_RESILVER) 2555ea8dc4b6Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2556ea8dc4b6Seschrock 2557ea8dc4b6Seschrock /* 2558ea8dc4b6Seschrock * Let the world know that we're done. 2559ea8dc4b6Seschrock */ 2560ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2561ea8dc4b6Seschrock spa->spa_async_thread = NULL; 2562ea8dc4b6Seschrock cv_broadcast(&spa->spa_async_cv); 2563ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2564ea8dc4b6Seschrock thread_exit(); 2565ea8dc4b6Seschrock } 2566ea8dc4b6Seschrock 2567ea8dc4b6Seschrock void 2568ea8dc4b6Seschrock spa_async_suspend(spa_t *spa) 2569ea8dc4b6Seschrock { 2570ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2571ea8dc4b6Seschrock spa->spa_async_suspended++; 2572ea8dc4b6Seschrock while (spa->spa_async_thread != NULL) 2573ea8dc4b6Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2574ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2575ea8dc4b6Seschrock } 2576ea8dc4b6Seschrock 2577ea8dc4b6Seschrock void 2578ea8dc4b6Seschrock spa_async_resume(spa_t *spa) 2579ea8dc4b6Seschrock { 2580ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2581ea8dc4b6Seschrock ASSERT(spa->spa_async_suspended != 0); 2582ea8dc4b6Seschrock spa->spa_async_suspended--; 2583ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2584ea8dc4b6Seschrock } 2585ea8dc4b6Seschrock 2586ea8dc4b6Seschrock static void 2587ea8dc4b6Seschrock spa_async_dispatch(spa_t *spa) 2588ea8dc4b6Seschrock { 2589ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2590ea8dc4b6Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 25910373e76bSbonwick spa->spa_async_thread == NULL && 25920373e76bSbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 2593ea8dc4b6Seschrock spa->spa_async_thread = thread_create(NULL, 0, 2594ea8dc4b6Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2595ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2596ea8dc4b6Seschrock } 2597ea8dc4b6Seschrock 2598ea8dc4b6Seschrock void 2599ea8dc4b6Seschrock spa_async_request(spa_t *spa, int task) 2600ea8dc4b6Seschrock { 2601ea8dc4b6Seschrock mutex_enter(&spa->spa_async_lock); 2602ea8dc4b6Seschrock spa->spa_async_tasks |= task; 2603ea8dc4b6Seschrock mutex_exit(&spa->spa_async_lock); 2604fa9e4066Sahrens } 2605fa9e4066Sahrens 2606fa9e4066Sahrens /* 2607fa9e4066Sahrens * ========================================================================== 2608fa9e4066Sahrens * SPA syncing routines 2609fa9e4066Sahrens * ========================================================================== 2610fa9e4066Sahrens */ 2611fa9e4066Sahrens 2612fa9e4066Sahrens static void 2613fa9e4066Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2614fa9e4066Sahrens { 2615fa9e4066Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 
2616fa9e4066Sahrens dmu_tx_t *tx; 2617fa9e4066Sahrens blkptr_t blk; 2618fa9e4066Sahrens uint64_t itor = 0; 2619fa9e4066Sahrens zio_t *zio; 2620fa9e4066Sahrens int error; 2621fa9e4066Sahrens uint8_t c = 1; 2622fa9e4066Sahrens 2623fa9e4066Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2624fa9e4066Sahrens 2625fa9e4066Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 2626fa9e4066Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2627fa9e4066Sahrens 2628fa9e4066Sahrens error = zio_wait(zio); 2629fa9e4066Sahrens ASSERT3U(error, ==, 0); 2630fa9e4066Sahrens 2631fa9e4066Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2632fa9e4066Sahrens bplist_vacate(bpl, tx); 2633fa9e4066Sahrens 2634fa9e4066Sahrens /* 2635fa9e4066Sahrens * Pre-dirty the first block so we sync to convergence faster. 2636fa9e4066Sahrens * (Usually only the first block is needed.) 2637fa9e4066Sahrens */ 2638fa9e4066Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2639fa9e4066Sahrens dmu_tx_commit(tx); 2640fa9e4066Sahrens } 2641fa9e4066Sahrens 2642fa9e4066Sahrens static void 264399653d4eSeschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2644fa9e4066Sahrens { 2645fa9e4066Sahrens char *packed = NULL; 2646fa9e4066Sahrens size_t nvsize = 0; 2647fa9e4066Sahrens dmu_buf_t *db; 2648fa9e4066Sahrens 264999653d4eSeschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2650fa9e4066Sahrens 2651fa9e4066Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 2652fa9e4066Sahrens 265399653d4eSeschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2654ea8dc4b6Seschrock KM_SLEEP) == 0); 2655fa9e4066Sahrens 265699653d4eSeschrock dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2657fa9e4066Sahrens 2658fa9e4066Sahrens kmem_free(packed, nvsize); 2659fa9e4066Sahrens 266099653d4eSeschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2661fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 2662fa9e4066Sahrens *(uint64_t *)db->db_data = nvsize; 2663ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 2664fa9e4066Sahrens } 2665fa9e4066Sahrens 266699653d4eSeschrock static void 266799653d4eSeschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 266899653d4eSeschrock { 266999653d4eSeschrock nvlist_t *nvroot; 267099653d4eSeschrock nvlist_t **spares; 267199653d4eSeschrock int i; 267299653d4eSeschrock 267399653d4eSeschrock if (!spa->spa_sync_spares) 267499653d4eSeschrock return; 267599653d4eSeschrock 267699653d4eSeschrock /* 267799653d4eSeschrock * Update the MOS nvlist describing the list of available spares. 267899653d4eSeschrock * spa_validate_spares() will have already made sure this nvlist is 267999653d4eSeschrock * valid and the vdevs are labelled appropriately. 
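	 * All that remains here is to allocate the packed-nvlist object in the MOS on first use (recording it under DMU_POOL_SPARES in the pool directory) and then rewrite it from the in-core spare list.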
268099653d4eSeschrock */ 268199653d4eSeschrock if (spa->spa_spares_object == 0) { 268299653d4eSeschrock spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 268399653d4eSeschrock DMU_OT_PACKED_NVLIST, 1 << 14, 268499653d4eSeschrock DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 268599653d4eSeschrock VERIFY(zap_update(spa->spa_meta_objset, 268699653d4eSeschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 268799653d4eSeschrock sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 268899653d4eSeschrock } 268999653d4eSeschrock 269099653d4eSeschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 269199653d4eSeschrock if (spa->spa_nspares == 0) { 269299653d4eSeschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 269399653d4eSeschrock NULL, 0) == 0); 269499653d4eSeschrock } else { 269599653d4eSeschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 269699653d4eSeschrock KM_SLEEP); 269799653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 269899653d4eSeschrock spares[i] = vdev_config_generate(spa, 269999653d4eSeschrock spa->spa_spares[i], B_FALSE, B_TRUE); 270099653d4eSeschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 270199653d4eSeschrock spares, spa->spa_nspares) == 0); 270299653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 270399653d4eSeschrock nvlist_free(spares[i]); 270499653d4eSeschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 270599653d4eSeschrock } 270699653d4eSeschrock 270799653d4eSeschrock spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 270899653d4eSeschrock 270999653d4eSeschrock spa->spa_sync_spares = B_FALSE; 271099653d4eSeschrock } 271199653d4eSeschrock 271299653d4eSeschrock static void 271399653d4eSeschrock spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 271499653d4eSeschrock { 271599653d4eSeschrock nvlist_t *config; 271699653d4eSeschrock 271799653d4eSeschrock if (list_is_empty(&spa->spa_dirty_list)) 271899653d4eSeschrock return; 271999653d4eSeschrock 272099653d4eSeschrock config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 272199653d4eSeschrock 272299653d4eSeschrock if (spa->spa_config_syncing) 272399653d4eSeschrock nvlist_free(spa->spa_config_syncing); 272499653d4eSeschrock spa->spa_config_syncing = config; 272599653d4eSeschrock 272699653d4eSeschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 272799653d4eSeschrock } 272899653d4eSeschrock 2729fa9e4066Sahrens /* 2730fa9e4066Sahrens * Sync the specified transaction group. New blocks may be dirtied as 2731fa9e4066Sahrens * part of the process, so we iterate until it converges. 2732fa9e4066Sahrens */ 2733fa9e4066Sahrens void 2734fa9e4066Sahrens spa_sync(spa_t *spa, uint64_t txg) 2735fa9e4066Sahrens { 2736fa9e4066Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 2737fa9e4066Sahrens objset_t *mos = spa->spa_meta_objset; 2738fa9e4066Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 27390373e76bSbonwick vdev_t *rvd = spa->spa_root_vdev; 2740fa9e4066Sahrens vdev_t *vd; 2741fa9e4066Sahrens dmu_tx_t *tx; 2742fa9e4066Sahrens int dirty_vdevs; 2743fa9e4066Sahrens 2744fa9e4066Sahrens /* 2745fa9e4066Sahrens * Lock out configuration changes. 
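 * (The config lock is taken as RW_READER here and held for the duration of the sync; it is dropped just before async tasks are dispatched at the end of spa_sync().)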
2746fa9e4066Sahrens */ 2747ea8dc4b6Seschrock spa_config_enter(spa, RW_READER, FTAG); 2748fa9e4066Sahrens 2749fa9e4066Sahrens spa->spa_syncing_txg = txg; 2750fa9e4066Sahrens spa->spa_sync_pass = 0; 2751fa9e4066Sahrens 2752ea8dc4b6Seschrock VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2753fa9e4066Sahrens 275499653d4eSeschrock tx = dmu_tx_create_assigned(dp, txg); 275599653d4eSeschrock 275699653d4eSeschrock /* 275799653d4eSeschrock * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 275899653d4eSeschrock * set spa_deflate if we have no raid-z vdevs. 275999653d4eSeschrock */ 276099653d4eSeschrock if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 276199653d4eSeschrock spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 276299653d4eSeschrock int i; 276399653d4eSeschrock 276499653d4eSeschrock for (i = 0; i < rvd->vdev_children; i++) { 276599653d4eSeschrock vd = rvd->vdev_child[i]; 276699653d4eSeschrock if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 276799653d4eSeschrock break; 276899653d4eSeschrock } 276999653d4eSeschrock if (i == rvd->vdev_children) { 277099653d4eSeschrock spa->spa_deflate = TRUE; 277199653d4eSeschrock VERIFY(0 == zap_add(spa->spa_meta_objset, 277299653d4eSeschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 277399653d4eSeschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 277499653d4eSeschrock } 277599653d4eSeschrock } 277699653d4eSeschrock 2777fa9e4066Sahrens /* 2778fa9e4066Sahrens * If anything has changed in this txg, push the deferred frees 2779fa9e4066Sahrens * from the previous txg. If not, leave them alone so that we 2780fa9e4066Sahrens * don't generate work on an otherwise idle system. 2781fa9e4066Sahrens */ 2782fa9e4066Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2783fa9e4066Sahrens !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2784fa9e4066Sahrens spa_sync_deferred_frees(spa, txg); 2785fa9e4066Sahrens 2786fa9e4066Sahrens /* 2787fa9e4066Sahrens * Iterate to convergence. 2788fa9e4066Sahrens */ 2789fa9e4066Sahrens do { 2790fa9e4066Sahrens spa->spa_sync_pass++; 2791fa9e4066Sahrens 2792fa9e4066Sahrens spa_sync_config_object(spa, tx); 279399653d4eSeschrock spa_sync_spares(spa, tx); 2794ea8dc4b6Seschrock spa_errlog_sync(spa, txg); 2795fa9e4066Sahrens dsl_pool_sync(dp, txg); 2796fa9e4066Sahrens 2797fa9e4066Sahrens dirty_vdevs = 0; 2798fa9e4066Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2799fa9e4066Sahrens vdev_sync(vd, txg); 2800fa9e4066Sahrens dirty_vdevs++; 2801fa9e4066Sahrens } 2802fa9e4066Sahrens 2803fa9e4066Sahrens bplist_sync(bpl, tx); 2804fa9e4066Sahrens } while (dirty_vdevs); 2805fa9e4066Sahrens 2806fa9e4066Sahrens bplist_close(bpl); 2807fa9e4066Sahrens 2808fa9e4066Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2809fa9e4066Sahrens 2810fa9e4066Sahrens /* 2811fa9e4066Sahrens * Rewrite the vdev configuration (which includes the uberblock) 2812fa9e4066Sahrens * to commit the transaction group. 28130373e76bSbonwick * 28140373e76bSbonwick * If there are any dirty vdevs, sync the uberblock to all vdevs. 28150373e76bSbonwick * Otherwise, pick a random top-level vdev that's known to be 28160373e76bSbonwick * visible in the config cache (see spa_vdev_add() for details). 28170373e76bSbonwick * If the write fails, try the next vdev until we've tried them all.
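	 * If every candidate vdev fails, fall back to syncing the config through the root vdev, which must succeed.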
28180373e76bSbonwick */ 28190373e76bSbonwick if (!list_is_empty(&spa->spa_dirty_list)) { 28200373e76bSbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 28210373e76bSbonwick } else { 28220373e76bSbonwick int children = rvd->vdev_children; 28230373e76bSbonwick int c0 = spa_get_random(children); 28240373e76bSbonwick int c; 28250373e76bSbonwick 28260373e76bSbonwick for (c = 0; c < children; c++) { 28270373e76bSbonwick vd = rvd->vdev_child[(c0 + c) % children]; 28280373e76bSbonwick if (vd->vdev_ms_array == 0) 28290373e76bSbonwick continue; 28300373e76bSbonwick if (vdev_config_sync(vd, txg) == 0) 28310373e76bSbonwick break; 28320373e76bSbonwick } 28330373e76bSbonwick if (c == children) 28340373e76bSbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 28350373e76bSbonwick } 28360373e76bSbonwick 283799653d4eSeschrock dmu_tx_commit(tx); 283899653d4eSeschrock 28390373e76bSbonwick /* 28400373e76bSbonwick * Clear the dirty config list. 2841fa9e4066Sahrens */ 28420373e76bSbonwick while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 28430373e76bSbonwick vdev_config_clean(vd); 28440373e76bSbonwick 28450373e76bSbonwick /* 28460373e76bSbonwick * Now that the new config has synced transactionally, 28470373e76bSbonwick * let it become visible to the config cache. 28480373e76bSbonwick */ 28490373e76bSbonwick if (spa->spa_config_syncing != NULL) { 28500373e76bSbonwick spa_config_set(spa, spa->spa_config_syncing); 28510373e76bSbonwick spa->spa_config_txg = txg; 28520373e76bSbonwick spa->spa_config_syncing = NULL; 28530373e76bSbonwick } 2854fa9e4066Sahrens 2855fa9e4066Sahrens /* 2856fa9e4066Sahrens * Make a stable copy of the fully synced uberblock. 2857fa9e4066Sahrens * We use this as the root for pool traversals. 2858fa9e4066Sahrens */ 2859fa9e4066Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2860fa9e4066Sahrens 2861fa9e4066Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2862fa9e4066Sahrens 2863fa9e4066Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2864fa9e4066Sahrens spa->spa_traverse_wanted = 0; 2865fa9e4066Sahrens spa->spa_ubsync = spa->spa_uberblock; 2866fa9e4066Sahrens rw_exit(&spa->spa_traverse_lock); 2867fa9e4066Sahrens 2868fa9e4066Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2869fa9e4066Sahrens 2870fa9e4066Sahrens /* 2871fa9e4066Sahrens * Clean up the ZIL records for the synced txg. 2872fa9e4066Sahrens */ 2873fa9e4066Sahrens dsl_pool_zil_clean(dp); 2874fa9e4066Sahrens 2875fa9e4066Sahrens /* 2876fa9e4066Sahrens * Update usable space statistics. 2877fa9e4066Sahrens */ 2878fa9e4066Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2879fa9e4066Sahrens vdev_sync_done(vd, txg); 2880fa9e4066Sahrens 2881fa9e4066Sahrens /* 2882fa9e4066Sahrens * It had better be the case that we didn't dirty anything 288399653d4eSeschrock * since vdev_config_sync(). 2884fa9e4066Sahrens */ 2885fa9e4066Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2886fa9e4066Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2887fa9e4066Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2888fa9e4066Sahrens ASSERT(bpl->bpl_queue == NULL); 2889fa9e4066Sahrens 2890ea8dc4b6Seschrock spa_config_exit(spa, FTAG); 2891ea8dc4b6Seschrock 2892ea8dc4b6Seschrock /* 2893ea8dc4b6Seschrock * If any async tasks have been requested, kick them off. 2894ea8dc4b6Seschrock */ 2895ea8dc4b6Seschrock spa_async_dispatch(spa); 2896fa9e4066Sahrens } 2897fa9e4066Sahrens 2898fa9e4066Sahrens /* 2899fa9e4066Sahrens * Sync all pools. 
We don't want to hold the namespace lock across these 2900fa9e4066Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 2901fa9e4066Sahrens * sync. 2902fa9e4066Sahrens */ 2903fa9e4066Sahrens void 2904fa9e4066Sahrens spa_sync_allpools(void) 2905fa9e4066Sahrens { 2906fa9e4066Sahrens spa_t *spa = NULL; 2907fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 2908fa9e4066Sahrens while ((spa = spa_next(spa)) != NULL) { 2909fa9e4066Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 2910fa9e4066Sahrens continue; 2911fa9e4066Sahrens spa_open_ref(spa, FTAG); 2912fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 2913fa9e4066Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 2914fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 2915fa9e4066Sahrens spa_close(spa, FTAG); 2916fa9e4066Sahrens } 2917fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 2918fa9e4066Sahrens } 2919fa9e4066Sahrens 2920fa9e4066Sahrens /* 2921fa9e4066Sahrens * ========================================================================== 2922fa9e4066Sahrens * Miscellaneous routines 2923fa9e4066Sahrens * ========================================================================== 2924fa9e4066Sahrens */ 2925fa9e4066Sahrens 2926fa9e4066Sahrens /* 2927fa9e4066Sahrens * Remove all pools in the system. 2928fa9e4066Sahrens */ 2929fa9e4066Sahrens void 2930fa9e4066Sahrens spa_evict_all(void) 2931fa9e4066Sahrens { 2932fa9e4066Sahrens spa_t *spa; 2933fa9e4066Sahrens 2934fa9e4066Sahrens /* 2935fa9e4066Sahrens * Remove all cached state. All pools should be closed now, 2936fa9e4066Sahrens * so every spa in the AVL tree should be unreferenced. 2937fa9e4066Sahrens */ 2938fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 2939fa9e4066Sahrens while ((spa = spa_next(NULL)) != NULL) { 2940fa9e4066Sahrens /* 2941ea8dc4b6Seschrock * Stop async tasks. The async thread may need to detach 2942ea8dc4b6Seschrock * a device that's been replaced, which requires grabbing 2943ea8dc4b6Seschrock * spa_namespace_lock, so we must drop it here. 2944fa9e4066Sahrens */ 2945fa9e4066Sahrens spa_open_ref(spa, FTAG); 2946fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 2947ea8dc4b6Seschrock spa_async_suspend(spa); 2948fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2949fa9e4066Sahrens mutex_enter(&spa_namespace_lock); 2950fa9e4066Sahrens spa_close(spa, FTAG); 2951fa9e4066Sahrens 2952fa9e4066Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2953fa9e4066Sahrens spa_unload(spa); 2954fa9e4066Sahrens spa_deactivate(spa); 2955fa9e4066Sahrens } 2956fa9e4066Sahrens spa_remove(spa); 2957fa9e4066Sahrens } 2958fa9e4066Sahrens mutex_exit(&spa_namespace_lock); 2959fa9e4066Sahrens } 2960ea8dc4b6Seschrock 2961ea8dc4b6Seschrock vdev_t * 2962ea8dc4b6Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid) 2963ea8dc4b6Seschrock { 2964ea8dc4b6Seschrock return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 2965ea8dc4b6Seschrock } 2966eaca9bbdSeschrock 2967eaca9bbdSeschrock void 2968eaca9bbdSeschrock spa_upgrade(spa_t *spa) 2969eaca9bbdSeschrock { 2970eaca9bbdSeschrock spa_config_enter(spa, RW_WRITER, FTAG); 2971eaca9bbdSeschrock 2972eaca9bbdSeschrock /* 2973eaca9bbdSeschrock * This should only be called for a non-faulted pool, and since a 2974eaca9bbdSeschrock * future version would result in an unopenable pool, this shouldn't be 2975eaca9bbdSeschrock * possible. 
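	 * (That is, ub_version can never exceed ZFS_VERSION here; the ASSERT below verifies this.)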
2976eaca9bbdSeschrock */ 2977eaca9bbdSeschrock ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 2978eaca9bbdSeschrock 2979eaca9bbdSeschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 2980eaca9bbdSeschrock vdev_config_dirty(spa->spa_root_vdev); 2981eaca9bbdSeschrock 2982eaca9bbdSeschrock spa_config_exit(spa, FTAG); 298399653d4eSeschrock 298499653d4eSeschrock txg_wait_synced(spa_get_dsl(spa), 0); 298599653d4eSeschrock } 298699653d4eSeschrock 298799653d4eSeschrock boolean_t 298899653d4eSeschrock spa_has_spare(spa_t *spa, uint64_t guid) 298999653d4eSeschrock { 299099653d4eSeschrock int i; 299199653d4eSeschrock 299299653d4eSeschrock for (i = 0; i < spa->spa_nspares; i++) 299399653d4eSeschrock if (spa->spa_spares[i]->vdev_guid == guid) 299499653d4eSeschrock return (B_TRUE); 299599653d4eSeschrock 299699653d4eSeschrock return (B_FALSE); 2997eaca9bbdSeschrock } 2998