/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
25fa9e4066Sahrens */ 26fa9e4066Sahrens 27fa9e4066Sahrens #include <sys/zfs_context.h> 28ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h> 29fa9e4066Sahrens #include <sys/spa.h> 30fa9e4066Sahrens #include <sys/spa_impl.h> 31fa9e4066Sahrens #include <sys/dmu.h> 32fa9e4066Sahrens #include <sys/dmu_tx.h> 33fa9e4066Sahrens #include <sys/vdev_impl.h> 34fa9e4066Sahrens #include <sys/uberblock_impl.h> 35fa9e4066Sahrens #include <sys/metaslab.h> 36fa9e4066Sahrens #include <sys/metaslab_impl.h> 37fa9e4066Sahrens #include <sys/space_map.h> 38fa9e4066Sahrens #include <sys/zio.h> 39fa9e4066Sahrens #include <sys/zap.h> 40fa9e4066Sahrens #include <sys/fs/zfs.h> 41c5904d13Seschrock #include <sys/arc.h> 42e6ca193dSGeorge Wilson #include <sys/zil.h> 43fa9e4066Sahrens 44fa9e4066Sahrens /* 45fa9e4066Sahrens * Virtual device management. 46fa9e4066Sahrens */ 47fa9e4066Sahrens 48fa9e4066Sahrens static vdev_ops_t *vdev_ops_table[] = { 49fa9e4066Sahrens &vdev_root_ops, 50fa9e4066Sahrens &vdev_raidz_ops, 51fa9e4066Sahrens &vdev_mirror_ops, 52fa9e4066Sahrens &vdev_replacing_ops, 5399653d4eSeschrock &vdev_spare_ops, 54fa9e4066Sahrens &vdev_disk_ops, 55fa9e4066Sahrens &vdev_file_ops, 56fa9e4066Sahrens &vdev_missing_ops, 5788ecc943SGeorge Wilson &vdev_hole_ops, 58fa9e4066Sahrens NULL 59fa9e4066Sahrens }; 60fa9e4066Sahrens 61088f3894Sahrens /* maximum scrub/resilver I/O queue per leaf vdev */ 62088f3894Sahrens int zfs_scrub_limit = 10; 6305b2b3b8Smishra 64fa9e4066Sahrens /* 65fa9e4066Sahrens * Given a vdev type, return the appropriate ops vector. 
66fa9e4066Sahrens */ 67fa9e4066Sahrens static vdev_ops_t * 68fa9e4066Sahrens vdev_getops(const char *type) 69fa9e4066Sahrens { 70fa9e4066Sahrens vdev_ops_t *ops, **opspp; 71fa9e4066Sahrens 72fa9e4066Sahrens for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 73fa9e4066Sahrens if (strcmp(ops->vdev_op_type, type) == 0) 74fa9e4066Sahrens break; 75fa9e4066Sahrens 76fa9e4066Sahrens return (ops); 77fa9e4066Sahrens } 78fa9e4066Sahrens 79fa9e4066Sahrens /* 80fa9e4066Sahrens * Default asize function: return the MAX of psize with the asize of 81fa9e4066Sahrens * all children. This is what's used by anything other than RAID-Z. 82fa9e4066Sahrens */ 83fa9e4066Sahrens uint64_t 84fa9e4066Sahrens vdev_default_asize(vdev_t *vd, uint64_t psize) 85fa9e4066Sahrens { 86ecc2d604Sbonwick uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 87fa9e4066Sahrens uint64_t csize; 88fa9e4066Sahrens 89573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) { 90fa9e4066Sahrens csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 91fa9e4066Sahrens asize = MAX(asize, csize); 92fa9e4066Sahrens } 93fa9e4066Sahrens 94fa9e4066Sahrens return (asize); 95fa9e4066Sahrens } 96fa9e4066Sahrens 972a79c5feSlling /* 98573ca77eSGeorge Wilson * Get the minimum allocatable size. We define the allocatable size as 99573ca77eSGeorge Wilson * the vdev's asize rounded to the nearest metaslab. This allows us to 100573ca77eSGeorge Wilson * replace or attach devices which don't have the same physical size but 101573ca77eSGeorge Wilson * can still satisfy the same number of allocations. 1022a79c5feSlling */ 1032a79c5feSlling uint64_t 104573ca77eSGeorge Wilson vdev_get_min_asize(vdev_t *vd) 1052a79c5feSlling { 106573ca77eSGeorge Wilson vdev_t *pvd = vd->vdev_parent; 1072a79c5feSlling 108573ca77eSGeorge Wilson /* 109573ca77eSGeorge Wilson * The our parent is NULL (inactive spare or cache) or is the root, 110573ca77eSGeorge Wilson * just return our own asize. 
111573ca77eSGeorge Wilson */ 112573ca77eSGeorge Wilson if (pvd == NULL) 113573ca77eSGeorge Wilson return (vd->vdev_asize); 1142a79c5feSlling 1152a79c5feSlling /* 116573ca77eSGeorge Wilson * The top-level vdev just returns the allocatable size rounded 117573ca77eSGeorge Wilson * to the nearest metaslab. 1182a79c5feSlling */ 119573ca77eSGeorge Wilson if (vd == vd->vdev_top) 120573ca77eSGeorge Wilson return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 1212a79c5feSlling 122573ca77eSGeorge Wilson /* 123573ca77eSGeorge Wilson * The allocatable space for a raidz vdev is N * sizeof(smallest child), 124573ca77eSGeorge Wilson * so each child must provide at least 1/Nth of its asize. 125573ca77eSGeorge Wilson */ 126573ca77eSGeorge Wilson if (pvd->vdev_ops == &vdev_raidz_ops) 127573ca77eSGeorge Wilson return (pvd->vdev_min_asize / pvd->vdev_children); 1282a79c5feSlling 129573ca77eSGeorge Wilson return (pvd->vdev_min_asize); 130573ca77eSGeorge Wilson } 1312a79c5feSlling 132573ca77eSGeorge Wilson void 133573ca77eSGeorge Wilson vdev_set_min_asize(vdev_t *vd) 134573ca77eSGeorge Wilson { 135573ca77eSGeorge Wilson vd->vdev_min_asize = vdev_get_min_asize(vd); 136573ca77eSGeorge Wilson 137573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 138573ca77eSGeorge Wilson vdev_set_min_asize(vd->vdev_child[c]); 1392a79c5feSlling } 1402a79c5feSlling 141fa9e4066Sahrens vdev_t * 142fa9e4066Sahrens vdev_lookup_top(spa_t *spa, uint64_t vdev) 143fa9e4066Sahrens { 144fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 145fa9e4066Sahrens 146e14bb325SJeff Bonwick ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 147e05725b1Sbonwick 148088f3894Sahrens if (vdev < rvd->vdev_children) { 149088f3894Sahrens ASSERT(rvd->vdev_child[vdev] != NULL); 150fa9e4066Sahrens return (rvd->vdev_child[vdev]); 151088f3894Sahrens } 152fa9e4066Sahrens 153fa9e4066Sahrens return (NULL); 154fa9e4066Sahrens } 155fa9e4066Sahrens 156fa9e4066Sahrens vdev_t * 157fa9e4066Sahrens vdev_lookup_by_guid(vdev_t 
*vd, uint64_t guid) 158fa9e4066Sahrens { 159fa9e4066Sahrens vdev_t *mvd; 160fa9e4066Sahrens 1610e34b6a7Sbonwick if (vd->vdev_guid == guid) 162fa9e4066Sahrens return (vd); 163fa9e4066Sahrens 164573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 165fa9e4066Sahrens if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 166fa9e4066Sahrens NULL) 167fa9e4066Sahrens return (mvd); 168fa9e4066Sahrens 169fa9e4066Sahrens return (NULL); 170fa9e4066Sahrens } 171fa9e4066Sahrens 172fa9e4066Sahrens void 173fa9e4066Sahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd) 174fa9e4066Sahrens { 175fa9e4066Sahrens size_t oldsize, newsize; 176fa9e4066Sahrens uint64_t id = cvd->vdev_id; 177fa9e4066Sahrens vdev_t **newchild; 178fa9e4066Sahrens 179e14bb325SJeff Bonwick ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 180fa9e4066Sahrens ASSERT(cvd->vdev_parent == NULL); 181fa9e4066Sahrens 182fa9e4066Sahrens cvd->vdev_parent = pvd; 183fa9e4066Sahrens 184fa9e4066Sahrens if (pvd == NULL) 185fa9e4066Sahrens return; 186fa9e4066Sahrens 187fa9e4066Sahrens ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 188fa9e4066Sahrens 189fa9e4066Sahrens oldsize = pvd->vdev_children * sizeof (vdev_t *); 190fa9e4066Sahrens pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 191fa9e4066Sahrens newsize = pvd->vdev_children * sizeof (vdev_t *); 192fa9e4066Sahrens 193fa9e4066Sahrens newchild = kmem_zalloc(newsize, KM_SLEEP); 194fa9e4066Sahrens if (pvd->vdev_child != NULL) { 195fa9e4066Sahrens bcopy(pvd->vdev_child, newchild, oldsize); 196fa9e4066Sahrens kmem_free(pvd->vdev_child, oldsize); 197fa9e4066Sahrens } 198fa9e4066Sahrens 199fa9e4066Sahrens pvd->vdev_child = newchild; 200fa9e4066Sahrens pvd->vdev_child[id] = cvd; 201fa9e4066Sahrens 202fa9e4066Sahrens cvd->vdev_top = (pvd->vdev_top ? 
pvd->vdev_top: cvd); 203fa9e4066Sahrens ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 204fa9e4066Sahrens 205fa9e4066Sahrens /* 206fa9e4066Sahrens * Walk up all ancestors to update guid sum. 207fa9e4066Sahrens */ 208fa9e4066Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 209fa9e4066Sahrens pvd->vdev_guid_sum += cvd->vdev_guid_sum; 21005b2b3b8Smishra 21105b2b3b8Smishra if (cvd->vdev_ops->vdev_op_leaf) 21205b2b3b8Smishra cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; 213fa9e4066Sahrens } 214fa9e4066Sahrens 215fa9e4066Sahrens void 216fa9e4066Sahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 217fa9e4066Sahrens { 218fa9e4066Sahrens int c; 219fa9e4066Sahrens uint_t id = cvd->vdev_id; 220fa9e4066Sahrens 221fa9e4066Sahrens ASSERT(cvd->vdev_parent == pvd); 222fa9e4066Sahrens 223fa9e4066Sahrens if (pvd == NULL) 224fa9e4066Sahrens return; 225fa9e4066Sahrens 226fa9e4066Sahrens ASSERT(id < pvd->vdev_children); 227fa9e4066Sahrens ASSERT(pvd->vdev_child[id] == cvd); 228fa9e4066Sahrens 229fa9e4066Sahrens pvd->vdev_child[id] = NULL; 230fa9e4066Sahrens cvd->vdev_parent = NULL; 231fa9e4066Sahrens 232fa9e4066Sahrens for (c = 0; c < pvd->vdev_children; c++) 233fa9e4066Sahrens if (pvd->vdev_child[c]) 234fa9e4066Sahrens break; 235fa9e4066Sahrens 236fa9e4066Sahrens if (c == pvd->vdev_children) { 237fa9e4066Sahrens kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 238fa9e4066Sahrens pvd->vdev_child = NULL; 239fa9e4066Sahrens pvd->vdev_children = 0; 240fa9e4066Sahrens } 241fa9e4066Sahrens 242fa9e4066Sahrens /* 243fa9e4066Sahrens * Walk up all ancestors to update guid sum. 
244fa9e4066Sahrens */ 245fa9e4066Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 246fa9e4066Sahrens pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 24705b2b3b8Smishra 24805b2b3b8Smishra if (cvd->vdev_ops->vdev_op_leaf) 24905b2b3b8Smishra cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; 250fa9e4066Sahrens } 251fa9e4066Sahrens 252fa9e4066Sahrens /* 253fa9e4066Sahrens * Remove any holes in the child array. 254fa9e4066Sahrens */ 255fa9e4066Sahrens void 256fa9e4066Sahrens vdev_compact_children(vdev_t *pvd) 257fa9e4066Sahrens { 258fa9e4066Sahrens vdev_t **newchild, *cvd; 259fa9e4066Sahrens int oldc = pvd->vdev_children; 260573ca77eSGeorge Wilson int newc; 261fa9e4066Sahrens 262e14bb325SJeff Bonwick ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 263fa9e4066Sahrens 264573ca77eSGeorge Wilson for (int c = newc = 0; c < oldc; c++) 265fa9e4066Sahrens if (pvd->vdev_child[c]) 266fa9e4066Sahrens newc++; 267fa9e4066Sahrens 268fa9e4066Sahrens newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 269fa9e4066Sahrens 270573ca77eSGeorge Wilson for (int c = newc = 0; c < oldc; c++) { 271fa9e4066Sahrens if ((cvd = pvd->vdev_child[c]) != NULL) { 272fa9e4066Sahrens newchild[newc] = cvd; 273fa9e4066Sahrens cvd->vdev_id = newc++; 274fa9e4066Sahrens } 275fa9e4066Sahrens } 276fa9e4066Sahrens 277fa9e4066Sahrens kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 278fa9e4066Sahrens pvd->vdev_child = newchild; 279fa9e4066Sahrens pvd->vdev_children = newc; 280fa9e4066Sahrens } 281fa9e4066Sahrens 282fa9e4066Sahrens /* 283fa9e4066Sahrens * Allocate and minimally initialize a vdev_t. 
284fa9e4066Sahrens */ 28588ecc943SGeorge Wilson vdev_t * 286fa9e4066Sahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 287fa9e4066Sahrens { 288fa9e4066Sahrens vdev_t *vd; 289fa9e4066Sahrens 290fa9e4066Sahrens vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 291fa9e4066Sahrens 2920e34b6a7Sbonwick if (spa->spa_root_vdev == NULL) { 2930e34b6a7Sbonwick ASSERT(ops == &vdev_root_ops); 2940e34b6a7Sbonwick spa->spa_root_vdev = vd; 2950e34b6a7Sbonwick } 2960e34b6a7Sbonwick 29788ecc943SGeorge Wilson if (guid == 0 && ops != &vdev_hole_ops) { 2980e34b6a7Sbonwick if (spa->spa_root_vdev == vd) { 2990e34b6a7Sbonwick /* 3000e34b6a7Sbonwick * The root vdev's guid will also be the pool guid, 3010e34b6a7Sbonwick * which must be unique among all pools. 3020e34b6a7Sbonwick */ 303*1195e687SMark J Musante guid = spa_generate_guid(NULL); 3040e34b6a7Sbonwick } else { 3050e34b6a7Sbonwick /* 3060e34b6a7Sbonwick * Any other vdev's guid must be unique within the pool. 3070e34b6a7Sbonwick */ 308*1195e687SMark J Musante guid = spa_generate_guid(spa); 3090e34b6a7Sbonwick } 3100e34b6a7Sbonwick ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 3110e34b6a7Sbonwick } 3120e34b6a7Sbonwick 313fa9e4066Sahrens vd->vdev_spa = spa; 314fa9e4066Sahrens vd->vdev_id = id; 315fa9e4066Sahrens vd->vdev_guid = guid; 316fa9e4066Sahrens vd->vdev_guid_sum = guid; 317fa9e4066Sahrens vd->vdev_ops = ops; 318fa9e4066Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 31988ecc943SGeorge Wilson vd->vdev_ishole = (ops == &vdev_hole_ops); 320fa9e4066Sahrens 321fa9e4066Sahrens mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 3225ad82045Snd mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 323e14bb325SJeff Bonwick mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 3248ad4d6ddSJeff Bonwick for (int t = 0; t < DTL_TYPES; t++) { 3258ad4d6ddSJeff Bonwick space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, 3268ad4d6ddSJeff Bonwick &vd->vdev_dtl_lock); 3278ad4d6ddSJeff Bonwick } 
328fa9e4066Sahrens txg_list_create(&vd->vdev_ms_list, 329fa9e4066Sahrens offsetof(struct metaslab, ms_txg_node)); 330fa9e4066Sahrens txg_list_create(&vd->vdev_dtl_list, 331fa9e4066Sahrens offsetof(struct vdev, vdev_dtl_node)); 332fa9e4066Sahrens vd->vdev_stat.vs_timestamp = gethrtime(); 3333d7072f8Seschrock vdev_queue_init(vd); 3343d7072f8Seschrock vdev_cache_init(vd); 335fa9e4066Sahrens 336fa9e4066Sahrens return (vd); 337fa9e4066Sahrens } 338fa9e4066Sahrens 339fa9e4066Sahrens /* 340fa9e4066Sahrens * Allocate a new vdev. The 'alloctype' is used to control whether we are 341fa9e4066Sahrens * creating a new vdev or loading an existing one - the behavior is slightly 342fa9e4066Sahrens * different for each case. 343fa9e4066Sahrens */ 34499653d4eSeschrock int 34599653d4eSeschrock vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 34699653d4eSeschrock int alloctype) 347fa9e4066Sahrens { 348fa9e4066Sahrens vdev_ops_t *ops; 349fa9e4066Sahrens char *type; 3508654d025Sperrin uint64_t guid = 0, islog, nparity; 351fa9e4066Sahrens vdev_t *vd; 352fa9e4066Sahrens 353e14bb325SJeff Bonwick ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 354fa9e4066Sahrens 355fa9e4066Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 35699653d4eSeschrock return (EINVAL); 357fa9e4066Sahrens 358fa9e4066Sahrens if ((ops = vdev_getops(type)) == NULL) 35999653d4eSeschrock return (EINVAL); 360fa9e4066Sahrens 361fa9e4066Sahrens /* 362fa9e4066Sahrens * If this is a load, get the vdev guid from the nvlist. 363fa9e4066Sahrens * Otherwise, vdev_alloc_common() will generate one for us. 
364fa9e4066Sahrens */ 365fa9e4066Sahrens if (alloctype == VDEV_ALLOC_LOAD) { 366fa9e4066Sahrens uint64_t label_id; 367fa9e4066Sahrens 368fa9e4066Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 369fa9e4066Sahrens label_id != id) 37099653d4eSeschrock return (EINVAL); 371fa9e4066Sahrens 372fa9e4066Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 37399653d4eSeschrock return (EINVAL); 37499653d4eSeschrock } else if (alloctype == VDEV_ALLOC_SPARE) { 37599653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 37699653d4eSeschrock return (EINVAL); 377fa94a07fSbrendan } else if (alloctype == VDEV_ALLOC_L2CACHE) { 378fa94a07fSbrendan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 379fa94a07fSbrendan return (EINVAL); 38021ecdf64SLin Ling } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 38121ecdf64SLin Ling if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 38221ecdf64SLin Ling return (EINVAL); 383fa9e4066Sahrens } 384fa9e4066Sahrens 38599653d4eSeschrock /* 38699653d4eSeschrock * The first allocated vdev must be of type 'root'. 38799653d4eSeschrock */ 38899653d4eSeschrock if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 38999653d4eSeschrock return (EINVAL); 39099653d4eSeschrock 3918654d025Sperrin /* 3928654d025Sperrin * Determine whether we're a log vdev. 3938654d025Sperrin */ 3948654d025Sperrin islog = 0; 3958654d025Sperrin (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 396990b4856Slling if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 3978654d025Sperrin return (ENOTSUP); 398fa9e4066Sahrens 39988ecc943SGeorge Wilson if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 40088ecc943SGeorge Wilson return (ENOTSUP); 40188ecc943SGeorge Wilson 40299653d4eSeschrock /* 4038654d025Sperrin * Set the nparity property for RAID-Z vdevs. 
40499653d4eSeschrock */ 4058654d025Sperrin nparity = -1ULL; 40699653d4eSeschrock if (ops == &vdev_raidz_ops) { 40799653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 4088654d025Sperrin &nparity) == 0) { 409b24ab676SJeff Bonwick if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 41099653d4eSeschrock return (EINVAL); 41199653d4eSeschrock /* 412f94275ceSAdam Leventhal * Previous versions could only support 1 or 2 parity 413f94275ceSAdam Leventhal * device. 41499653d4eSeschrock */ 415f94275ceSAdam Leventhal if (nparity > 1 && 416f94275ceSAdam Leventhal spa_version(spa) < SPA_VERSION_RAIDZ2) 417f94275ceSAdam Leventhal return (ENOTSUP); 418f94275ceSAdam Leventhal if (nparity > 2 && 419f94275ceSAdam Leventhal spa_version(spa) < SPA_VERSION_RAIDZ3) 42099653d4eSeschrock return (ENOTSUP); 42199653d4eSeschrock } else { 42299653d4eSeschrock /* 42399653d4eSeschrock * We require the parity to be specified for SPAs that 42499653d4eSeschrock * support multiple parity levels. 42599653d4eSeschrock */ 426f94275ceSAdam Leventhal if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 42799653d4eSeschrock return (EINVAL); 42899653d4eSeschrock /* 42999653d4eSeschrock * Otherwise, we default to 1 parity device for RAID-Z. 
43099653d4eSeschrock */ 4318654d025Sperrin nparity = 1; 43299653d4eSeschrock } 43399653d4eSeschrock } else { 4348654d025Sperrin nparity = 0; 43599653d4eSeschrock } 4368654d025Sperrin ASSERT(nparity != -1ULL); 4378654d025Sperrin 4388654d025Sperrin vd = vdev_alloc_common(spa, id, guid, ops); 4398654d025Sperrin 4408654d025Sperrin vd->vdev_islog = islog; 4418654d025Sperrin vd->vdev_nparity = nparity; 4428654d025Sperrin 4438654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 4448654d025Sperrin vd->vdev_path = spa_strdup(vd->vdev_path); 4458654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 4468654d025Sperrin vd->vdev_devid = spa_strdup(vd->vdev_devid); 4478654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 4488654d025Sperrin &vd->vdev_physpath) == 0) 4498654d025Sperrin vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 4506809eb4eSEric Schrock if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 4516809eb4eSEric Schrock vd->vdev_fru = spa_strdup(vd->vdev_fru); 45299653d4eSeschrock 453afefbcddSeschrock /* 454afefbcddSeschrock * Set the whole_disk property. If it's not specified, leave the value 455afefbcddSeschrock * as -1. 456afefbcddSeschrock */ 457afefbcddSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 458afefbcddSeschrock &vd->vdev_wholedisk) != 0) 459afefbcddSeschrock vd->vdev_wholedisk = -1ULL; 460afefbcddSeschrock 461ea8dc4b6Seschrock /* 462ea8dc4b6Seschrock * Look for the 'not present' flag. This will only be set if the device 463ea8dc4b6Seschrock * was not present at the time of import. 464ea8dc4b6Seschrock */ 4656809eb4eSEric Schrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 4666809eb4eSEric Schrock &vd->vdev_not_present); 467ea8dc4b6Seschrock 468ecc2d604Sbonwick /* 469ecc2d604Sbonwick * Get the alignment requirement. 
470ecc2d604Sbonwick */ 471ecc2d604Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 472ecc2d604Sbonwick 47388ecc943SGeorge Wilson /* 47488ecc943SGeorge Wilson * Retrieve the vdev creation time. 47588ecc943SGeorge Wilson */ 47688ecc943SGeorge Wilson (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 47788ecc943SGeorge Wilson &vd->vdev_crtxg); 47888ecc943SGeorge Wilson 479fa9e4066Sahrens /* 480fa9e4066Sahrens * If we're a top-level vdev, try to load the allocation parameters. 481fa9e4066Sahrens */ 482*1195e687SMark J Musante if (parent && !parent->vdev_parent && 483*1195e687SMark J Musante (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 484fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 485fa9e4066Sahrens &vd->vdev_ms_array); 486fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 487fa9e4066Sahrens &vd->vdev_ms_shift); 488fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 489fa9e4066Sahrens &vd->vdev_asize); 490fa9e4066Sahrens } 491fa9e4066Sahrens 492a1521560SJeff Bonwick if (parent && !parent->vdev_parent) { 493a1521560SJeff Bonwick ASSERT(alloctype == VDEV_ALLOC_LOAD || 4949f4ab4d8SGeorge Wilson alloctype == VDEV_ALLOC_ADD || 495*1195e687SMark J Musante alloctype == VDEV_ALLOC_SPLIT || 4969f4ab4d8SGeorge Wilson alloctype == VDEV_ALLOC_ROOTPOOL); 497a1521560SJeff Bonwick vd->vdev_mg = metaslab_group_create(islog ? 498a1521560SJeff Bonwick spa_log_class(spa) : spa_normal_class(spa), vd); 499a1521560SJeff Bonwick } 500a1521560SJeff Bonwick 501fa9e4066Sahrens /* 5023d7072f8Seschrock * If we're a leaf vdev, try to load the DTL object and other state. 
503fa9e4066Sahrens */ 504c5904d13Seschrock if (vd->vdev_ops->vdev_op_leaf && 50521ecdf64SLin Ling (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 50621ecdf64SLin Ling alloctype == VDEV_ALLOC_ROOTPOOL)) { 507c5904d13Seschrock if (alloctype == VDEV_ALLOC_LOAD) { 508c5904d13Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 5098ad4d6ddSJeff Bonwick &vd->vdev_dtl_smo.smo_object); 510c5904d13Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 511c5904d13Seschrock &vd->vdev_unspare); 512c5904d13Seschrock } 51321ecdf64SLin Ling 51421ecdf64SLin Ling if (alloctype == VDEV_ALLOC_ROOTPOOL) { 51521ecdf64SLin Ling uint64_t spare = 0; 51621ecdf64SLin Ling 51721ecdf64SLin Ling if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 51821ecdf64SLin Ling &spare) == 0 && spare) 51921ecdf64SLin Ling spa_spare_add(vd); 52021ecdf64SLin Ling } 52121ecdf64SLin Ling 522ecc2d604Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 523ecc2d604Sbonwick &vd->vdev_offline); 524c5904d13Seschrock 5253d7072f8Seschrock /* 5263d7072f8Seschrock * When importing a pool, we want to ignore the persistent fault 5273d7072f8Seschrock * state, as the diagnosis made on another system may not be 528069f55e2SEric Schrock * valid in the current context. Local vdevs will 529069f55e2SEric Schrock * remain in the faulted state. 
5303d7072f8Seschrock */ 531b16da2e2SGeorge Wilson if (spa_load_state(spa) == SPA_LOAD_OPEN) { 5323d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 5333d7072f8Seschrock &vd->vdev_faulted); 5343d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 5353d7072f8Seschrock &vd->vdev_degraded); 5363d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 5373d7072f8Seschrock &vd->vdev_removed); 538069f55e2SEric Schrock 539069f55e2SEric Schrock if (vd->vdev_faulted || vd->vdev_degraded) { 540069f55e2SEric Schrock char *aux; 541069f55e2SEric Schrock 542069f55e2SEric Schrock vd->vdev_label_aux = 543069f55e2SEric Schrock VDEV_AUX_ERR_EXCEEDED; 544069f55e2SEric Schrock if (nvlist_lookup_string(nv, 545069f55e2SEric Schrock ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 546069f55e2SEric Schrock strcmp(aux, "external") == 0) 547069f55e2SEric Schrock vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 548069f55e2SEric Schrock } 5493d7072f8Seschrock } 550fa9e4066Sahrens } 551fa9e4066Sahrens 552fa9e4066Sahrens /* 553fa9e4066Sahrens * Add ourselves to the parent's list of children. 554fa9e4066Sahrens */ 555fa9e4066Sahrens vdev_add_child(parent, vd); 556fa9e4066Sahrens 55799653d4eSeschrock *vdp = vd; 55899653d4eSeschrock 55999653d4eSeschrock return (0); 560fa9e4066Sahrens } 561fa9e4066Sahrens 562fa9e4066Sahrens void 563fa9e4066Sahrens vdev_free(vdev_t *vd) 564fa9e4066Sahrens { 5653d7072f8Seschrock spa_t *spa = vd->vdev_spa; 566fa9e4066Sahrens 567fa9e4066Sahrens /* 568fa9e4066Sahrens * vdev_free() implies closing the vdev first. This is simpler than 569fa9e4066Sahrens * trying to ensure complicated semantics for all callers. 570fa9e4066Sahrens */ 571fa9e4066Sahrens vdev_close(vd); 572fa9e4066Sahrens 573e14bb325SJeff Bonwick ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 574b24ab676SJeff Bonwick ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 575fa9e4066Sahrens 576fa9e4066Sahrens /* 577fa9e4066Sahrens * Free all children. 
578fa9e4066Sahrens */ 579573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 580fa9e4066Sahrens vdev_free(vd->vdev_child[c]); 581fa9e4066Sahrens 582fa9e4066Sahrens ASSERT(vd->vdev_child == NULL); 583fa9e4066Sahrens ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 584fa9e4066Sahrens 585fa9e4066Sahrens /* 586fa9e4066Sahrens * Discard allocation state. 587fa9e4066Sahrens */ 588a1521560SJeff Bonwick if (vd->vdev_mg != NULL) { 589fa9e4066Sahrens vdev_metaslab_fini(vd); 590a1521560SJeff Bonwick metaslab_group_destroy(vd->vdev_mg); 591a1521560SJeff Bonwick } 592fa9e4066Sahrens 593fa9e4066Sahrens ASSERT3U(vd->vdev_stat.vs_space, ==, 0); 59499653d4eSeschrock ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); 595fa9e4066Sahrens ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 596fa9e4066Sahrens 597fa9e4066Sahrens /* 598fa9e4066Sahrens * Remove this vdev from its parent's child list. 599fa9e4066Sahrens */ 600fa9e4066Sahrens vdev_remove_child(vd->vdev_parent, vd); 601fa9e4066Sahrens 602fa9e4066Sahrens ASSERT(vd->vdev_parent == NULL); 603fa9e4066Sahrens 6043d7072f8Seschrock /* 6053d7072f8Seschrock * Clean up vdev structure. 
6063d7072f8Seschrock */ 6073d7072f8Seschrock vdev_queue_fini(vd); 6083d7072f8Seschrock vdev_cache_fini(vd); 6093d7072f8Seschrock 6103d7072f8Seschrock if (vd->vdev_path) 6113d7072f8Seschrock spa_strfree(vd->vdev_path); 6123d7072f8Seschrock if (vd->vdev_devid) 6133d7072f8Seschrock spa_strfree(vd->vdev_devid); 6143d7072f8Seschrock if (vd->vdev_physpath) 6153d7072f8Seschrock spa_strfree(vd->vdev_physpath); 6166809eb4eSEric Schrock if (vd->vdev_fru) 6176809eb4eSEric Schrock spa_strfree(vd->vdev_fru); 6183d7072f8Seschrock 6193d7072f8Seschrock if (vd->vdev_isspare) 6203d7072f8Seschrock spa_spare_remove(vd); 621fa94a07fSbrendan if (vd->vdev_isl2cache) 622fa94a07fSbrendan spa_l2cache_remove(vd); 6233d7072f8Seschrock 6243d7072f8Seschrock txg_list_destroy(&vd->vdev_ms_list); 6253d7072f8Seschrock txg_list_destroy(&vd->vdev_dtl_list); 6268ad4d6ddSJeff Bonwick 6273d7072f8Seschrock mutex_enter(&vd->vdev_dtl_lock); 6288ad4d6ddSJeff Bonwick for (int t = 0; t < DTL_TYPES; t++) { 6298ad4d6ddSJeff Bonwick space_map_unload(&vd->vdev_dtl[t]); 6308ad4d6ddSJeff Bonwick space_map_destroy(&vd->vdev_dtl[t]); 6318ad4d6ddSJeff Bonwick } 6323d7072f8Seschrock mutex_exit(&vd->vdev_dtl_lock); 6338ad4d6ddSJeff Bonwick 6343d7072f8Seschrock mutex_destroy(&vd->vdev_dtl_lock); 6353d7072f8Seschrock mutex_destroy(&vd->vdev_stat_lock); 636e14bb325SJeff Bonwick mutex_destroy(&vd->vdev_probe_lock); 6373d7072f8Seschrock 6383d7072f8Seschrock if (vd == spa->spa_root_vdev) 6393d7072f8Seschrock spa->spa_root_vdev = NULL; 6403d7072f8Seschrock 6413d7072f8Seschrock kmem_free(vd, sizeof (vdev_t)); 642fa9e4066Sahrens } 643fa9e4066Sahrens 644fa9e4066Sahrens /* 645fa9e4066Sahrens * Transfer top-level vdev state from svd to tvd. 
646fa9e4066Sahrens */ 647fa9e4066Sahrens static void 648fa9e4066Sahrens vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 649fa9e4066Sahrens { 650fa9e4066Sahrens spa_t *spa = svd->vdev_spa; 651fa9e4066Sahrens metaslab_t *msp; 652fa9e4066Sahrens vdev_t *vd; 653fa9e4066Sahrens int t; 654fa9e4066Sahrens 655fa9e4066Sahrens ASSERT(tvd == tvd->vdev_top); 656fa9e4066Sahrens 657fa9e4066Sahrens tvd->vdev_ms_array = svd->vdev_ms_array; 658fa9e4066Sahrens tvd->vdev_ms_shift = svd->vdev_ms_shift; 659fa9e4066Sahrens tvd->vdev_ms_count = svd->vdev_ms_count; 660fa9e4066Sahrens 661fa9e4066Sahrens svd->vdev_ms_array = 0; 662fa9e4066Sahrens svd->vdev_ms_shift = 0; 663fa9e4066Sahrens svd->vdev_ms_count = 0; 664fa9e4066Sahrens 665fa9e4066Sahrens tvd->vdev_mg = svd->vdev_mg; 666fa9e4066Sahrens tvd->vdev_ms = svd->vdev_ms; 667fa9e4066Sahrens 668fa9e4066Sahrens svd->vdev_mg = NULL; 669fa9e4066Sahrens svd->vdev_ms = NULL; 670ecc2d604Sbonwick 671ecc2d604Sbonwick if (tvd->vdev_mg != NULL) 672ecc2d604Sbonwick tvd->vdev_mg->mg_vd = tvd; 673fa9e4066Sahrens 674fa9e4066Sahrens tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 675fa9e4066Sahrens tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 67699653d4eSeschrock tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 677fa9e4066Sahrens 678fa9e4066Sahrens svd->vdev_stat.vs_alloc = 0; 679fa9e4066Sahrens svd->vdev_stat.vs_space = 0; 68099653d4eSeschrock svd->vdev_stat.vs_dspace = 0; 681fa9e4066Sahrens 682fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) { 683fa9e4066Sahrens while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 684fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 685fa9e4066Sahrens while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 686fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 687fa9e4066Sahrens if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 688fa9e4066Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 689fa9e4066Sahrens } 690fa9e4066Sahrens 691e14bb325SJeff 
Bonwick if (list_link_active(&svd->vdev_config_dirty_node)) { 692fa9e4066Sahrens vdev_config_clean(svd); 693fa9e4066Sahrens vdev_config_dirty(tvd); 694fa9e4066Sahrens } 695fa9e4066Sahrens 696e14bb325SJeff Bonwick if (list_link_active(&svd->vdev_state_dirty_node)) { 697e14bb325SJeff Bonwick vdev_state_clean(svd); 698e14bb325SJeff Bonwick vdev_state_dirty(tvd); 699e14bb325SJeff Bonwick } 700e14bb325SJeff Bonwick 70199653d4eSeschrock tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 70299653d4eSeschrock svd->vdev_deflate_ratio = 0; 7038654d025Sperrin 7048654d025Sperrin tvd->vdev_islog = svd->vdev_islog; 7058654d025Sperrin svd->vdev_islog = 0; 706fa9e4066Sahrens } 707fa9e4066Sahrens 708fa9e4066Sahrens static void 709fa9e4066Sahrens vdev_top_update(vdev_t *tvd, vdev_t *vd) 710fa9e4066Sahrens { 711fa9e4066Sahrens if (vd == NULL) 712fa9e4066Sahrens return; 713fa9e4066Sahrens 714fa9e4066Sahrens vd->vdev_top = tvd; 715fa9e4066Sahrens 716573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 717fa9e4066Sahrens vdev_top_update(tvd, vd->vdev_child[c]); 718fa9e4066Sahrens } 719fa9e4066Sahrens 720fa9e4066Sahrens /* 721fa9e4066Sahrens * Add a mirror/replacing vdev above an existing vdev. 
722fa9e4066Sahrens */ 723fa9e4066Sahrens vdev_t * 724fa9e4066Sahrens vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 725fa9e4066Sahrens { 726fa9e4066Sahrens spa_t *spa = cvd->vdev_spa; 727fa9e4066Sahrens vdev_t *pvd = cvd->vdev_parent; 728fa9e4066Sahrens vdev_t *mvd; 729fa9e4066Sahrens 730e14bb325SJeff Bonwick ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 731fa9e4066Sahrens 732fa9e4066Sahrens mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 733ecc2d604Sbonwick 734ecc2d604Sbonwick mvd->vdev_asize = cvd->vdev_asize; 735573ca77eSGeorge Wilson mvd->vdev_min_asize = cvd->vdev_min_asize; 736ecc2d604Sbonwick mvd->vdev_ashift = cvd->vdev_ashift; 737ecc2d604Sbonwick mvd->vdev_state = cvd->vdev_state; 73888ecc943SGeorge Wilson mvd->vdev_crtxg = cvd->vdev_crtxg; 739ecc2d604Sbonwick 740fa9e4066Sahrens vdev_remove_child(pvd, cvd); 741fa9e4066Sahrens vdev_add_child(pvd, mvd); 742fa9e4066Sahrens cvd->vdev_id = mvd->vdev_children; 743fa9e4066Sahrens vdev_add_child(mvd, cvd); 744fa9e4066Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 745fa9e4066Sahrens 746fa9e4066Sahrens if (mvd == mvd->vdev_top) 747fa9e4066Sahrens vdev_top_transfer(cvd, mvd); 748fa9e4066Sahrens 749fa9e4066Sahrens return (mvd); 750fa9e4066Sahrens } 751fa9e4066Sahrens 752fa9e4066Sahrens /* 753fa9e4066Sahrens * Remove a 1-way mirror/replacing vdev from the tree. 
754fa9e4066Sahrens */ 755fa9e4066Sahrens void 756fa9e4066Sahrens vdev_remove_parent(vdev_t *cvd) 757fa9e4066Sahrens { 758fa9e4066Sahrens vdev_t *mvd = cvd->vdev_parent; 759fa9e4066Sahrens vdev_t *pvd = mvd->vdev_parent; 760fa9e4066Sahrens 761e14bb325SJeff Bonwick ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 762fa9e4066Sahrens 763fa9e4066Sahrens ASSERT(mvd->vdev_children == 1); 764fa9e4066Sahrens ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 76599653d4eSeschrock mvd->vdev_ops == &vdev_replacing_ops || 76699653d4eSeschrock mvd->vdev_ops == &vdev_spare_ops); 767ecc2d604Sbonwick cvd->vdev_ashift = mvd->vdev_ashift; 768fa9e4066Sahrens 769fa9e4066Sahrens vdev_remove_child(mvd, cvd); 770fa9e4066Sahrens vdev_remove_child(pvd, mvd); 7718ad4d6ddSJeff Bonwick 77299653d4eSeschrock /* 773e14bb325SJeff Bonwick * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 774e14bb325SJeff Bonwick * Otherwise, we could have detached an offline device, and when we 775e14bb325SJeff Bonwick * go to import the pool we'll think we have two top-level vdevs, 776e14bb325SJeff Bonwick * instead of a different version of the same top-level vdev. 
77799653d4eSeschrock */ 7788ad4d6ddSJeff Bonwick if (mvd->vdev_top == mvd) { 7798ad4d6ddSJeff Bonwick uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 780*1195e687SMark J Musante cvd->vdev_orig_guid = cvd->vdev_guid; 7818ad4d6ddSJeff Bonwick cvd->vdev_guid += guid_delta; 7828ad4d6ddSJeff Bonwick cvd->vdev_guid_sum += guid_delta; 7838ad4d6ddSJeff Bonwick } 784e14bb325SJeff Bonwick cvd->vdev_id = mvd->vdev_id; 785e14bb325SJeff Bonwick vdev_add_child(pvd, cvd); 786fa9e4066Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 787fa9e4066Sahrens 788fa9e4066Sahrens if (cvd == cvd->vdev_top) 789fa9e4066Sahrens vdev_top_transfer(mvd, cvd); 790fa9e4066Sahrens 791fa9e4066Sahrens ASSERT(mvd->vdev_children == 0); 792fa9e4066Sahrens vdev_free(mvd); 793fa9e4066Sahrens } 794fa9e4066Sahrens 795ea8dc4b6Seschrock int 796fa9e4066Sahrens vdev_metaslab_init(vdev_t *vd, uint64_t txg) 797fa9e4066Sahrens { 798fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 799ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 800ecc2d604Sbonwick uint64_t m; 801fa9e4066Sahrens uint64_t oldc = vd->vdev_ms_count; 802fa9e4066Sahrens uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 803ecc2d604Sbonwick metaslab_t **mspp; 804ecc2d604Sbonwick int error; 805fa9e4066Sahrens 806a1521560SJeff Bonwick ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 807a1521560SJeff Bonwick 80888ecc943SGeorge Wilson /* 80988ecc943SGeorge Wilson * This vdev is not being allocated from yet or is a hole. 81088ecc943SGeorge Wilson */ 81188ecc943SGeorge Wilson if (vd->vdev_ms_shift == 0) 8120e34b6a7Sbonwick return (0); 8130e34b6a7Sbonwick 81488ecc943SGeorge Wilson ASSERT(!vd->vdev_ishole); 81588ecc943SGeorge Wilson 816e6ca193dSGeorge Wilson /* 817e6ca193dSGeorge Wilson * Compute the raidz-deflation ratio. Note, we hard-code 818e6ca193dSGeorge Wilson * in 128k (1 << 17) because it is the current "typical" blocksize. 
819e6ca193dSGeorge Wilson * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, 820e6ca193dSGeorge Wilson * or we will inconsistently account for existing bp's. 821e6ca193dSGeorge Wilson */ 822e6ca193dSGeorge Wilson vd->vdev_deflate_ratio = (1 << 17) / 823e6ca193dSGeorge Wilson (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 824e6ca193dSGeorge Wilson 825fa9e4066Sahrens ASSERT(oldc <= newc); 826fa9e4066Sahrens 827ecc2d604Sbonwick mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 828fa9e4066Sahrens 829ecc2d604Sbonwick if (oldc != 0) { 830ecc2d604Sbonwick bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 831ecc2d604Sbonwick kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 832ecc2d604Sbonwick } 833fa9e4066Sahrens 834ecc2d604Sbonwick vd->vdev_ms = mspp; 835ecc2d604Sbonwick vd->vdev_ms_count = newc; 836fa9e4066Sahrens 837ecc2d604Sbonwick for (m = oldc; m < newc; m++) { 838ecc2d604Sbonwick space_map_obj_t smo = { 0, 0, 0 }; 839ecc2d604Sbonwick if (txg == 0) { 840ecc2d604Sbonwick uint64_t object = 0; 841ecc2d604Sbonwick error = dmu_read(mos, vd->vdev_ms_array, 8427bfdf011SNeil Perrin m * sizeof (uint64_t), sizeof (uint64_t), &object, 8437bfdf011SNeil Perrin DMU_READ_PREFETCH); 844ecc2d604Sbonwick if (error) 845ecc2d604Sbonwick return (error); 846ecc2d604Sbonwick if (object != 0) { 847ecc2d604Sbonwick dmu_buf_t *db; 848ecc2d604Sbonwick error = dmu_bonus_hold(mos, object, FTAG, &db); 849ecc2d604Sbonwick if (error) 850ecc2d604Sbonwick return (error); 8511934e92fSmaybee ASSERT3U(db->db_size, >=, sizeof (smo)); 8521934e92fSmaybee bcopy(db->db_data, &smo, sizeof (smo)); 853ecc2d604Sbonwick ASSERT3U(smo.smo_object, ==, object); 854ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 855fa9e4066Sahrens } 856fa9e4066Sahrens } 857ecc2d604Sbonwick vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, 858ecc2d604Sbonwick m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); 859fa9e4066Sahrens } 860fa9e4066Sahrens 861a1521560SJeff Bonwick if (txg == 0) 862a1521560SJeff 
Bonwick spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 863a1521560SJeff Bonwick 864a1521560SJeff Bonwick if (oldc == 0) 865a1521560SJeff Bonwick metaslab_group_activate(vd->vdev_mg); 866a1521560SJeff Bonwick 867a1521560SJeff Bonwick if (txg == 0) 868a1521560SJeff Bonwick spa_config_exit(spa, SCL_ALLOC, FTAG); 869a1521560SJeff Bonwick 870ea8dc4b6Seschrock return (0); 871fa9e4066Sahrens } 872fa9e4066Sahrens 873fa9e4066Sahrens void 874fa9e4066Sahrens vdev_metaslab_fini(vdev_t *vd) 875fa9e4066Sahrens { 876fa9e4066Sahrens uint64_t m; 877fa9e4066Sahrens uint64_t count = vd->vdev_ms_count; 878fa9e4066Sahrens 879fa9e4066Sahrens if (vd->vdev_ms != NULL) { 880a1521560SJeff Bonwick metaslab_group_passivate(vd->vdev_mg); 881fa9e4066Sahrens for (m = 0; m < count; m++) 882ecc2d604Sbonwick if (vd->vdev_ms[m] != NULL) 883ecc2d604Sbonwick metaslab_fini(vd->vdev_ms[m]); 884fa9e4066Sahrens kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 885fa9e4066Sahrens vd->vdev_ms = NULL; 886fa9e4066Sahrens } 887fa9e4066Sahrens } 888fa9e4066Sahrens 889e14bb325SJeff Bonwick typedef struct vdev_probe_stats { 890e14bb325SJeff Bonwick boolean_t vps_readable; 891e14bb325SJeff Bonwick boolean_t vps_writeable; 892e14bb325SJeff Bonwick int vps_flags; 893e14bb325SJeff Bonwick } vdev_probe_stats_t; 894e14bb325SJeff Bonwick 895e14bb325SJeff Bonwick static void 896e14bb325SJeff Bonwick vdev_probe_done(zio_t *zio) 8970a4e9518Sgw { 8988ad4d6ddSJeff Bonwick spa_t *spa = zio->io_spa; 899a3f829aeSBill Moore vdev_t *vd = zio->io_vd; 900e14bb325SJeff Bonwick vdev_probe_stats_t *vps = zio->io_private; 901a3f829aeSBill Moore 902a3f829aeSBill Moore ASSERT(vd->vdev_probe_zio != NULL); 903e14bb325SJeff Bonwick 904e14bb325SJeff Bonwick if (zio->io_type == ZIO_TYPE_READ) { 905e14bb325SJeff Bonwick if (zio->io_error == 0) 906e14bb325SJeff Bonwick vps->vps_readable = 1; 9078ad4d6ddSJeff Bonwick if (zio->io_error == 0 && spa_writeable(spa)) { 908a3f829aeSBill Moore zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 
909e14bb325SJeff Bonwick zio->io_offset, zio->io_size, zio->io_data, 910e14bb325SJeff Bonwick ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 911e14bb325SJeff Bonwick ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 912e14bb325SJeff Bonwick } else { 913e14bb325SJeff Bonwick zio_buf_free(zio->io_data, zio->io_size); 914e14bb325SJeff Bonwick } 915e14bb325SJeff Bonwick } else if (zio->io_type == ZIO_TYPE_WRITE) { 916e14bb325SJeff Bonwick if (zio->io_error == 0) 917e14bb325SJeff Bonwick vps->vps_writeable = 1; 918e14bb325SJeff Bonwick zio_buf_free(zio->io_data, zio->io_size); 919e14bb325SJeff Bonwick } else if (zio->io_type == ZIO_TYPE_NULL) { 920a3f829aeSBill Moore zio_t *pio; 921e14bb325SJeff Bonwick 922e14bb325SJeff Bonwick vd->vdev_cant_read |= !vps->vps_readable; 923e14bb325SJeff Bonwick vd->vdev_cant_write |= !vps->vps_writeable; 924e14bb325SJeff Bonwick 925e14bb325SJeff Bonwick if (vdev_readable(vd) && 9268ad4d6ddSJeff Bonwick (vdev_writeable(vd) || !spa_writeable(spa))) { 927e14bb325SJeff Bonwick zio->io_error = 0; 928e14bb325SJeff Bonwick } else { 929e14bb325SJeff Bonwick ASSERT(zio->io_error != 0); 930e14bb325SJeff Bonwick zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 9318ad4d6ddSJeff Bonwick spa, vd, NULL, 0, 0); 932e14bb325SJeff Bonwick zio->io_error = ENXIO; 933e14bb325SJeff Bonwick } 934a3f829aeSBill Moore 935a3f829aeSBill Moore mutex_enter(&vd->vdev_probe_lock); 936a3f829aeSBill Moore ASSERT(vd->vdev_probe_zio == zio); 937a3f829aeSBill Moore vd->vdev_probe_zio = NULL; 938a3f829aeSBill Moore mutex_exit(&vd->vdev_probe_lock); 939a3f829aeSBill Moore 940a3f829aeSBill Moore while ((pio = zio_walk_parents(zio)) != NULL) 941a3f829aeSBill Moore if (!vdev_accessible(vd, pio)) 942a3f829aeSBill Moore pio->io_error = ENXIO; 943a3f829aeSBill Moore 944e14bb325SJeff Bonwick kmem_free(vps, sizeof (*vps)); 945e14bb325SJeff Bonwick } 946e14bb325SJeff Bonwick } 9470a4e9518Sgw 948e14bb325SJeff Bonwick /* 949e14bb325SJeff Bonwick * Determine whether this device is accessible by 
reading and writing 950e14bb325SJeff Bonwick * to several known locations: the pad regions of each vdev label 951e14bb325SJeff Bonwick * but the first (which we leave alone in case it contains a VTOC). 952e14bb325SJeff Bonwick */ 953e14bb325SJeff Bonwick zio_t * 954a3f829aeSBill Moore vdev_probe(vdev_t *vd, zio_t *zio) 955e14bb325SJeff Bonwick { 956e14bb325SJeff Bonwick spa_t *spa = vd->vdev_spa; 957a3f829aeSBill Moore vdev_probe_stats_t *vps = NULL; 958a3f829aeSBill Moore zio_t *pio; 959a3f829aeSBill Moore 960a3f829aeSBill Moore ASSERT(vd->vdev_ops->vdev_op_leaf); 9610a4e9518Sgw 962a3f829aeSBill Moore /* 963a3f829aeSBill Moore * Don't probe the probe. 964a3f829aeSBill Moore */ 965a3f829aeSBill Moore if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 966a3f829aeSBill Moore return (NULL); 967e14bb325SJeff Bonwick 968a3f829aeSBill Moore /* 969a3f829aeSBill Moore * To prevent 'probe storms' when a device fails, we create 970a3f829aeSBill Moore * just one probe i/o at a time. All zios that want to probe 971a3f829aeSBill Moore * this vdev will become parents of the probe io. 972a3f829aeSBill Moore */ 973a3f829aeSBill Moore mutex_enter(&vd->vdev_probe_lock); 974e14bb325SJeff Bonwick 975a3f829aeSBill Moore if ((pio = vd->vdev_probe_zio) == NULL) { 976a3f829aeSBill Moore vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 977a3f829aeSBill Moore 978a3f829aeSBill Moore vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 979a3f829aeSBill Moore ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | 9808956713aSEric Schrock ZIO_FLAG_TRYHARD; 981a3f829aeSBill Moore 982a3f829aeSBill Moore if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 983a3f829aeSBill Moore /* 984a3f829aeSBill Moore * vdev_cant_read and vdev_cant_write can only 985a3f829aeSBill Moore * transition from TRUE to FALSE when we have the 986a3f829aeSBill Moore * SCL_ZIO lock as writer; otherwise they can only 987a3f829aeSBill Moore * transition from FALSE to TRUE. 
This ensures that 988a3f829aeSBill Moore * any zio looking at these values can assume that 989a3f829aeSBill Moore * failures persist for the life of the I/O. That's 990a3f829aeSBill Moore * important because when a device has intermittent 991a3f829aeSBill Moore * connectivity problems, we want to ensure that 992a3f829aeSBill Moore * they're ascribed to the device (ENXIO) and not 993a3f829aeSBill Moore * the zio (EIO). 994a3f829aeSBill Moore * 995a3f829aeSBill Moore * Since we hold SCL_ZIO as writer here, clear both 996a3f829aeSBill Moore * values so the probe can reevaluate from first 997a3f829aeSBill Moore * principles. 998a3f829aeSBill Moore */ 999a3f829aeSBill Moore vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 1000a3f829aeSBill Moore vd->vdev_cant_read = B_FALSE; 1001a3f829aeSBill Moore vd->vdev_cant_write = B_FALSE; 1002a3f829aeSBill Moore } 1003a3f829aeSBill Moore 1004a3f829aeSBill Moore vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1005a3f829aeSBill Moore vdev_probe_done, vps, 1006a3f829aeSBill Moore vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); 1007a3f829aeSBill Moore 1008a3f829aeSBill Moore if (zio != NULL) { 1009a3f829aeSBill Moore vd->vdev_probe_wanted = B_TRUE; 1010a3f829aeSBill Moore spa_async_request(spa, SPA_ASYNC_PROBE); 1011a3f829aeSBill Moore } 1012e14bb325SJeff Bonwick } 1013e14bb325SJeff Bonwick 1014a3f829aeSBill Moore if (zio != NULL) 1015a3f829aeSBill Moore zio_add_child(zio, pio); 1016e14bb325SJeff Bonwick 1017a3f829aeSBill Moore mutex_exit(&vd->vdev_probe_lock); 1018e14bb325SJeff Bonwick 1019a3f829aeSBill Moore if (vps == NULL) { 1020a3f829aeSBill Moore ASSERT(zio != NULL); 1021a3f829aeSBill Moore return (NULL); 1022a3f829aeSBill Moore } 1023e14bb325SJeff Bonwick 1024e14bb325SJeff Bonwick for (int l = 1; l < VDEV_LABELS; l++) { 1025a3f829aeSBill Moore zio_nowait(zio_read_phys(pio, vd, 1026e14bb325SJeff Bonwick vdev_label_offset(vd->vdev_psize, l, 1027f83ffe1aSLin Ling offsetof(vdev_label_t, vl_pad2)), 1028f83ffe1aSLin Ling VDEV_PAD_SIZE, 
zio_buf_alloc(VDEV_PAD_SIZE), 1029e14bb325SJeff Bonwick ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1030e14bb325SJeff Bonwick ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1031e14bb325SJeff Bonwick } 1032e14bb325SJeff Bonwick 1033a3f829aeSBill Moore if (zio == NULL) 1034a3f829aeSBill Moore return (pio); 1035a3f829aeSBill Moore 1036a3f829aeSBill Moore zio_nowait(pio); 1037a3f829aeSBill Moore return (NULL); 10380a4e9518Sgw } 10390a4e9518Sgw 1040f64c0e34SEric Taylor static void 1041f64c0e34SEric Taylor vdev_open_child(void *arg) 1042f64c0e34SEric Taylor { 1043f64c0e34SEric Taylor vdev_t *vd = arg; 1044f64c0e34SEric Taylor 1045f64c0e34SEric Taylor vd->vdev_open_thread = curthread; 1046f64c0e34SEric Taylor vd->vdev_open_error = vdev_open(vd); 1047f64c0e34SEric Taylor vd->vdev_open_thread = NULL; 1048f64c0e34SEric Taylor } 1049f64c0e34SEric Taylor 1050681d9761SEric Taylor boolean_t 1051681d9761SEric Taylor vdev_uses_zvols(vdev_t *vd) 1052681d9761SEric Taylor { 1053681d9761SEric Taylor if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, 1054681d9761SEric Taylor strlen(ZVOL_DIR)) == 0) 1055681d9761SEric Taylor return (B_TRUE); 1056681d9761SEric Taylor for (int c = 0; c < vd->vdev_children; c++) 1057681d9761SEric Taylor if (vdev_uses_zvols(vd->vdev_child[c])) 1058681d9761SEric Taylor return (B_TRUE); 1059681d9761SEric Taylor return (B_FALSE); 1060681d9761SEric Taylor } 1061681d9761SEric Taylor 1062f64c0e34SEric Taylor void 1063f64c0e34SEric Taylor vdev_open_children(vdev_t *vd) 1064f64c0e34SEric Taylor { 1065f64c0e34SEric Taylor taskq_t *tq; 1066f64c0e34SEric Taylor int children = vd->vdev_children; 1067f64c0e34SEric Taylor 1068681d9761SEric Taylor /* 1069681d9761SEric Taylor * in order to handle pools on top of zvols, do the opens 1070681d9761SEric Taylor * in a single thread so that the same thread holds the 1071681d9761SEric Taylor * spa_namespace_lock 1072681d9761SEric Taylor */ 1073681d9761SEric Taylor if (vdev_uses_zvols(vd)) { 1074681d9761SEric Taylor for (int c = 
0; c < children; c++) 1075681d9761SEric Taylor vd->vdev_child[c]->vdev_open_error = 1076681d9761SEric Taylor vdev_open(vd->vdev_child[c]); 1077681d9761SEric Taylor return; 1078681d9761SEric Taylor } 1079f64c0e34SEric Taylor tq = taskq_create("vdev_open", children, minclsyspri, 1080f64c0e34SEric Taylor children, children, TASKQ_PREPOPULATE); 1081f64c0e34SEric Taylor 1082f64c0e34SEric Taylor for (int c = 0; c < children; c++) 1083f64c0e34SEric Taylor VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1084f64c0e34SEric Taylor TQ_SLEEP) != NULL); 1085f64c0e34SEric Taylor 1086f64c0e34SEric Taylor taskq_destroy(tq); 1087f64c0e34SEric Taylor } 1088f64c0e34SEric Taylor 1089fa9e4066Sahrens /* 1090fa9e4066Sahrens * Prepare a virtual device for access. 1091fa9e4066Sahrens */ 1092fa9e4066Sahrens int 1093fa9e4066Sahrens vdev_open(vdev_t *vd) 1094fa9e4066Sahrens { 10958ad4d6ddSJeff Bonwick spa_t *spa = vd->vdev_spa; 1096fa9e4066Sahrens int error; 1097fa9e4066Sahrens uint64_t osize = 0; 1098fa9e4066Sahrens uint64_t asize, psize; 1099ecc2d604Sbonwick uint64_t ashift = 0; 1100fa9e4066Sahrens 1101f64c0e34SEric Taylor ASSERT(vd->vdev_open_thread == curthread || 1102f64c0e34SEric Taylor spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1103fa9e4066Sahrens ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 1104fa9e4066Sahrens vd->vdev_state == VDEV_STATE_CANT_OPEN || 1105fa9e4066Sahrens vd->vdev_state == VDEV_STATE_OFFLINE); 1106fa9e4066Sahrens 1107fa9e4066Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1108e6ca193dSGeorge Wilson vd->vdev_cant_read = B_FALSE; 1109e6ca193dSGeorge Wilson vd->vdev_cant_write = B_FALSE; 1110573ca77eSGeorge Wilson vd->vdev_min_asize = vdev_get_min_asize(vd); 1111fa9e4066Sahrens 1112069f55e2SEric Schrock /* 1113069f55e2SEric Schrock * If this vdev is not removed, check its fault status. If it's 1114069f55e2SEric Schrock * faulted, bail out of the open. 
1115069f55e2SEric Schrock */ 11163d7072f8Seschrock if (!vd->vdev_removed && vd->vdev_faulted) { 11173d7072f8Seschrock ASSERT(vd->vdev_children == 0); 1118069f55e2SEric Schrock ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1119069f55e2SEric Schrock vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 11203d7072f8Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1121069f55e2SEric Schrock vd->vdev_label_aux); 11223d7072f8Seschrock return (ENXIO); 11233d7072f8Seschrock } else if (vd->vdev_offline) { 1124fa9e4066Sahrens ASSERT(vd->vdev_children == 0); 1125ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1126fa9e4066Sahrens return (ENXIO); 1127fa9e4066Sahrens } 1128fa9e4066Sahrens 1129fa9e4066Sahrens error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); 1130fa9e4066Sahrens 1131095bcd66SGeorge Wilson /* 1132095bcd66SGeorge Wilson * Reset the vdev_reopening flag so that we actually close 1133095bcd66SGeorge Wilson * the vdev on error. 1134095bcd66SGeorge Wilson */ 1135095bcd66SGeorge Wilson vd->vdev_reopening = B_FALSE; 1136ea8dc4b6Seschrock if (zio_injection_enabled && error == 0) 11378956713aSEric Schrock error = zio_handle_device_injection(vd, NULL, ENXIO); 1138ea8dc4b6Seschrock 1139fa9e4066Sahrens if (error) { 11403d7072f8Seschrock if (vd->vdev_removed && 11413d7072f8Seschrock vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 11423d7072f8Seschrock vd->vdev_removed = B_FALSE; 11433d7072f8Seschrock 1144ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1145fa9e4066Sahrens vd->vdev_stat.vs_aux); 1146fa9e4066Sahrens return (error); 1147fa9e4066Sahrens } 1148fa9e4066Sahrens 11493d7072f8Seschrock vd->vdev_removed = B_FALSE; 11503d7072f8Seschrock 1151096d22d4SEric Schrock /* 1152096d22d4SEric Schrock * Recheck the faulted flag now that we have confirmed that 1153096d22d4SEric Schrock * the vdev is accessible. If we're faulted, bail. 
1154096d22d4SEric Schrock */ 1155096d22d4SEric Schrock if (vd->vdev_faulted) { 1156096d22d4SEric Schrock ASSERT(vd->vdev_children == 0); 1157096d22d4SEric Schrock ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1158096d22d4SEric Schrock vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1159096d22d4SEric Schrock vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1160096d22d4SEric Schrock vd->vdev_label_aux); 1161096d22d4SEric Schrock return (ENXIO); 1162096d22d4SEric Schrock } 1163096d22d4SEric Schrock 11643d7072f8Seschrock if (vd->vdev_degraded) { 11653d7072f8Seschrock ASSERT(vd->vdev_children == 0); 11663d7072f8Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 11673d7072f8Seschrock VDEV_AUX_ERR_EXCEEDED); 11683d7072f8Seschrock } else { 1169069f55e2SEric Schrock vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 11703d7072f8Seschrock } 1171fa9e4066Sahrens 117288ecc943SGeorge Wilson /* 117388ecc943SGeorge Wilson * For hole or missing vdevs we just return success. 117488ecc943SGeorge Wilson */ 117588ecc943SGeorge Wilson if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) 117688ecc943SGeorge Wilson return (0); 117788ecc943SGeorge Wilson 1178573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) { 1179ea8dc4b6Seschrock if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1180ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1181ea8dc4b6Seschrock VDEV_AUX_NONE); 1182ea8dc4b6Seschrock break; 1183ea8dc4b6Seschrock } 1184573ca77eSGeorge Wilson } 1185fa9e4066Sahrens 1186fa9e4066Sahrens osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 1187fa9e4066Sahrens 1188fa9e4066Sahrens if (vd->vdev_children == 0) { 1189fa9e4066Sahrens if (osize < SPA_MINDEVSIZE) { 1190ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1191ea8dc4b6Seschrock VDEV_AUX_TOO_SMALL); 1192fa9e4066Sahrens return (EOVERFLOW); 1193fa9e4066Sahrens } 1194fa9e4066Sahrens psize = osize; 1195fa9e4066Sahrens asize = osize - (VDEV_LABEL_START_SIZE + 
VDEV_LABEL_END_SIZE); 1196fa9e4066Sahrens } else { 1197ecc2d604Sbonwick if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1198fa9e4066Sahrens (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1199ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1200ea8dc4b6Seschrock VDEV_AUX_TOO_SMALL); 1201fa9e4066Sahrens return (EOVERFLOW); 1202fa9e4066Sahrens } 1203fa9e4066Sahrens psize = 0; 1204fa9e4066Sahrens asize = osize; 1205fa9e4066Sahrens } 1206fa9e4066Sahrens 1207fa9e4066Sahrens vd->vdev_psize = psize; 1208fa9e4066Sahrens 1209573ca77eSGeorge Wilson /* 1210573ca77eSGeorge Wilson * Make sure the allocatable size hasn't shrunk. 1211573ca77eSGeorge Wilson */ 1212573ca77eSGeorge Wilson if (asize < vd->vdev_min_asize) { 1213573ca77eSGeorge Wilson vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1214573ca77eSGeorge Wilson VDEV_AUX_BAD_LABEL); 1215573ca77eSGeorge Wilson return (EINVAL); 1216573ca77eSGeorge Wilson } 1217573ca77eSGeorge Wilson 1218fa9e4066Sahrens if (vd->vdev_asize == 0) { 1219fa9e4066Sahrens /* 1220fa9e4066Sahrens * This is the first-ever open, so use the computed values. 1221ecc2d604Sbonwick * For testing purposes, a higher ashift can be requested. 1222fa9e4066Sahrens */ 1223fa9e4066Sahrens vd->vdev_asize = asize; 1224ecc2d604Sbonwick vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 1225fa9e4066Sahrens } else { 1226fa9e4066Sahrens /* 1227fa9e4066Sahrens * Make sure the alignment requirement hasn't increased. 1228fa9e4066Sahrens */ 1229ecc2d604Sbonwick if (ashift > vd->vdev_top->vdev_ashift) { 1230ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1231ea8dc4b6Seschrock VDEV_AUX_BAD_LABEL); 1232fa9e4066Sahrens return (EINVAL); 1233fa9e4066Sahrens } 1234573ca77eSGeorge Wilson } 1235fa9e4066Sahrens 1236573ca77eSGeorge Wilson /* 1237573ca77eSGeorge Wilson * If all children are healthy and the asize has increased, 1238573ca77eSGeorge Wilson * then we've experienced dynamic LUN growth. 
If automatic 1239573ca77eSGeorge Wilson * expansion is enabled then use the additional space. 1240573ca77eSGeorge Wilson */ 1241573ca77eSGeorge Wilson if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && 1242573ca77eSGeorge Wilson (vd->vdev_expanding || spa->spa_autoexpand)) 1243573ca77eSGeorge Wilson vd->vdev_asize = asize; 1244fa9e4066Sahrens 1245573ca77eSGeorge Wilson vdev_set_min_asize(vd); 1246fa9e4066Sahrens 12470a4e9518Sgw /* 12480a4e9518Sgw * Ensure we can issue some IO before declaring the 12490a4e9518Sgw * vdev open for business. 12500a4e9518Sgw */ 1251e14bb325SJeff Bonwick if (vd->vdev_ops->vdev_op_leaf && 1252e14bb325SJeff Bonwick (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 12530a4e9518Sgw vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1254e14bb325SJeff Bonwick VDEV_AUX_IO_FAILURE); 12550a4e9518Sgw return (error); 12560a4e9518Sgw } 12570a4e9518Sgw 1258088f3894Sahrens /* 1259088f3894Sahrens * If a leaf vdev has a DTL, and seems healthy, then kick off a 12608ad4d6ddSJeff Bonwick * resilver. But don't do this if we are doing a reopen for a scrub, 12618ad4d6ddSJeff Bonwick * since this would just restart the scrub we are already doing. 1262088f3894Sahrens */ 12638ad4d6ddSJeff Bonwick if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 12648ad4d6ddSJeff Bonwick vdev_resilver_needed(vd, NULL, NULL)) 12658ad4d6ddSJeff Bonwick spa_async_request(spa, SPA_ASYNC_RESILVER); 1266088f3894Sahrens 1267fa9e4066Sahrens return (0); 1268fa9e4066Sahrens } 1269fa9e4066Sahrens 1270560e6e96Seschrock /* 1271560e6e96Seschrock * Called once the vdevs are all opened, this routine validates the label 1272560e6e96Seschrock * contents. This needs to be done before vdev_load() so that we don't 12733d7072f8Seschrock * inadvertently do repair I/Os to the wrong device. 1274560e6e96Seschrock * 1275560e6e96Seschrock * This function will only return failure if one of the vdevs indicates that it 1276560e6e96Seschrock * has since been destroyed or exported. 
This is only possible if 1277560e6e96Seschrock * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1278560e6e96Seschrock * will be updated but the function will return 0. 1279560e6e96Seschrock */ 1280560e6e96Seschrock int 1281560e6e96Seschrock vdev_validate(vdev_t *vd) 1282560e6e96Seschrock { 1283560e6e96Seschrock spa_t *spa = vd->vdev_spa; 1284560e6e96Seschrock nvlist_t *label; 1285*1195e687SMark J Musante uint64_t guid = 0, top_guid; 1286560e6e96Seschrock uint64_t state; 1287560e6e96Seschrock 1288573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 1289560e6e96Seschrock if (vdev_validate(vd->vdev_child[c]) != 0) 12900bf246f5Smc return (EBADF); 1291560e6e96Seschrock 1292b5989ec7Seschrock /* 1293b5989ec7Seschrock * If the device has already failed, or was marked offline, don't do 1294b5989ec7Seschrock * any further validation. Otherwise, label I/O will fail and we will 1295b5989ec7Seschrock * overwrite the previous state. 1296b5989ec7Seschrock */ 1297e14bb325SJeff Bonwick if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 1298*1195e687SMark J Musante uint64_t aux_guid = 0; 1299*1195e687SMark J Musante nvlist_t *nvl; 1300560e6e96Seschrock 1301560e6e96Seschrock if ((label = vdev_label_read_config(vd)) == NULL) { 1302560e6e96Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1303560e6e96Seschrock VDEV_AUX_BAD_LABEL); 1304560e6e96Seschrock return (0); 1305560e6e96Seschrock } 1306560e6e96Seschrock 1307*1195e687SMark J Musante /* 1308*1195e687SMark J Musante * Determine if this vdev has been split off into another 1309*1195e687SMark J Musante * pool. If so, then refuse to open it. 
1310*1195e687SMark J Musante */ 1311*1195e687SMark J Musante if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 1312*1195e687SMark J Musante &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 1313*1195e687SMark J Musante vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1314*1195e687SMark J Musante VDEV_AUX_SPLIT_POOL); 1315*1195e687SMark J Musante nvlist_free(label); 1316*1195e687SMark J Musante return (0); 1317*1195e687SMark J Musante } 1318*1195e687SMark J Musante 1319560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, 1320560e6e96Seschrock &guid) != 0 || guid != spa_guid(spa)) { 1321560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1322560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1323560e6e96Seschrock nvlist_free(label); 1324560e6e96Seschrock return (0); 1325560e6e96Seschrock } 1326560e6e96Seschrock 1327*1195e687SMark J Musante if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 1328*1195e687SMark J Musante != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 1329*1195e687SMark J Musante &aux_guid) != 0) 1330*1195e687SMark J Musante aux_guid = 0; 1331*1195e687SMark J Musante 1332e14bb325SJeff Bonwick /* 1333e14bb325SJeff Bonwick * If this vdev just became a top-level vdev because its 1334e14bb325SJeff Bonwick * sibling was detached, it will have adopted the parent's 1335e14bb325SJeff Bonwick * vdev guid -- but the label may or may not be on disk yet. 1336e14bb325SJeff Bonwick * Fortunately, either version of the label will have the 1337e14bb325SJeff Bonwick * same top guid, so if we're a top-level vdev, we can 1338e14bb325SJeff Bonwick * safely compare to that instead. 1339*1195e687SMark J Musante * 1340*1195e687SMark J Musante * If we split this vdev off instead, then we also check the 1341*1195e687SMark J Musante * original pool's guid. We don't want to consider the vdev 1342*1195e687SMark J Musante * corrupt if it is partway through a split operation. 
	 */
	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
	    &guid) != 0 ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
	    &top_guid) != 0 ||
	    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
	    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (0);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
	    &state) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (0);
	}

	nvlist_free(label);

	/*
	 * If spa->spa_load_verbatim is true, no need to check the
	 * state of the pool.
	 */
	if (!spa->spa_load_verbatim &&
	    spa_load_state(spa) == SPA_LOAD_OPEN &&
	    state != POOL_STATE_ACTIVE)
		return (EBADF);

	/*
	 * If we were able to open and validate a vdev that was
	 * previously marked permanently unavailable, clear that state
	 * now.
	 */
	if (vd->vdev_not_present)
		vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.  Note that the close is performed even if the
 * parent reopening flag was propagated to us; we always record the previous
 * state so a subsequent reopen can suppress spurious FMA ereports.
 */
void
vdev_close(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *pvd = vd->vdev_parent;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/*
	 * If our parent is reopening, then we are as well, unless we are
	 * going offline.
	 */
	if (pvd != NULL && pvd->vdev_reopening)
		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

/*
 * Reopen all interior vdevs and any unopened leaves.  We don't actually
 * reopen leaf vdevs which had previously been opened as they might deadlock
 * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache &&
		    !l2arc_vdev_present(vd))
			l2arc_add_vdev(spa, vd);
	} else {
		(void) vdev_validate(vd);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

/*
 * Open a newly-created (or replacement) vdev and initialize its labels.
 * Unlike a normal open, any partial open (state != HEALTHY) is treated as
 * failure, since a create must be able to write every component.
 */
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * Choose the metaslab shift for this vdev based on its usable size.
 */
void
vdev_metaslab_set_size(vdev_t *vd)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}

/*
 * Mark a top-level vdev's metaslab list and/or DTL list dirty for the given
 * txg, and queue the vdev itself on the spa's per-txg vdev list so it is
 * visited during sync.
 */
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(!vd->vdev_ishole);
	ASSERT(ISP2(flags));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	/* Add the range [txg, txg + size) to DTL 't' if not already present. */
	space_map_t *sm = &vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

/*
 * Returns B_TRUE if any part of [txg, txg + size) overlaps DTL 't'.
 */
boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(sm->sm_lock);
	if (sm->sm_space != 0)
		dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

/*
 * Returns B_TRUE if DTL 't' has no entries.
 */
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(sm->sm_lock);
	empty = (sm->sm_space == 0);
	mutex_exit(sm->sm_lock);

	return (empty);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
			/* XXX should check scrub_done? */
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg).  This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_map_ref_create(&reftree);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
			space_map_ref_add_map(&reftree,
			    &vd->vdev_dtl[DTL_SCRUB], 2);
			space_map_ref_generate_map(&reftree,
			    &vd->vdev_dtl[DTL_MISSING], 1);
			space_map_ref_destroy(&reftree);
		}
		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_map_ref_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
		space_map_ref_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

/*
 * Load a leaf vdev's DTL_MISSING map from its on-disk space map object,
 * if one exists.  Returns 0 if there is nothing to load.
 */
static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
	    NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

/*
 * Write a leaf vdev's DTL_MISSING map to disk for the given txg.  If the
 * vdev has been detached, free the on-disk space map object instead.
 */
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl_smo;
	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Copy the current DTL into a private map under a private lock so
	 * the on-disk sync can proceed without holding vdev_dtl_lock.
	 */
	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Determine whether the specified vdev can be offlined/detached/removed
 * without losing data.
 */
boolean_t
vdev_dtl_required(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *tvd = vd->vdev_top;
	uint8_t cant_read = vd->vdev_cant_read;
	boolean_t required;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == spa->spa_root_vdev || vd == tvd)
		return (B_TRUE);

	/*
	 * Temporarily mark the device as unreadable, and then determine
	 * whether this results in any DTL outages in the top-level vdev.
	 * If not, we can safely offline/detach/remove the device.
	 */
	vd->vdev_cant_read = B_TRUE;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
	vd->vdev_cant_read = cant_read;
	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);

	return (required);
}

/*
 * Determine if resilver is needed, and if so the txg range.
 */
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
		    vdev_writeable(vd)) {
			space_seg_t *ss;

			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
			thismin = ss->ss_start - 1;
			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
			thismax = ss->ss_end;
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

void
vdev_load(vdev_t *vd)
{
	/*
	 * Recursively load all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top && !vd->vdev_ishole &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (!vdev_readable(vd))
		return (0);

	if ((label = vdev_label_read_config(vd)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    version > SPA_VERSION ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

/*
 * Free the on-disk state of a vdev that is being removed: its DTL space
 * map object, each metaslab's space map object, and the metaslab array.
 */
void
vdev_remove(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	dmu_tx_t *tx;

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (vd->vdev_dtl_smo.smo_object) {
		ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
		(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
		vd->vdev_dtl_smo.smo_object = 0;
	}

	if (vd->vdev_ms != NULL) {
		for (int m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp == NULL || msp->ms_smo.smo_object == 0)
				continue;

			ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
			(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
			msp->ms_smo.smo_object = 0;
		}
	}

	if (vd->vdev_ms_array) {
		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
		vd->vdev_ms_array = 0;
		vd->vdev_ms_shift = 0;
	}
	dmu_tx_commit(tx);
}

/*
 * Called after the given txg has been synced: run metaslab_sync_done() on
 * each metaslab that was dirty in this txg, then reassess the metaslab
 * groups if anything was processed.
 */
void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;
	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));

	ASSERT(!vd->vdev_ishole);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);

	if (reassess)
		metaslab_sync_reassess(vd->vdev_mg);
}

/*
 * Sync this vdev for the given txg: allocate the metaslab array object on
 * first sync, free on-disk state if the vdev is being removed, then sync
 * all dirty metaslabs and leaf DTLs.
 */
void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	if (vd->vdev_removing)
		vdev_remove(vd, txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

2012fa9e4066Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2013fa9e4066Sahrens } 2014fa9e4066Sahrens 2015fa9e4066Sahrens uint64_t 2016fa9e4066Sahrens vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 2017fa9e4066Sahrens { 2018fa9e4066Sahrens return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2019fa9e4066Sahrens } 2020fa9e4066Sahrens 20213d7072f8Seschrock /* 20223d7072f8Seschrock * Mark the given vdev faulted. A faulted vdev behaves as if the device could 20233d7072f8Seschrock * not be opened, and no I/O is attempted. 20243d7072f8Seschrock */ 2025fa9e4066Sahrens int 2026069f55e2SEric Schrock vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2027fa9e4066Sahrens { 2028c5904d13Seschrock vdev_t *vd; 2029fa9e4066Sahrens 20308f18d1faSGeorge Wilson spa_vdev_state_enter(spa, SCL_NONE); 2031fa9e4066Sahrens 2032c5904d13Seschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2033e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2034e14bb325SJeff Bonwick 20353d7072f8Seschrock if (!vd->vdev_ops->vdev_op_leaf) 2036e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2037fa9e4066Sahrens 2038069f55e2SEric Schrock /* 2039069f55e2SEric Schrock * We don't directly use the aux state here, but if we do a 2040069f55e2SEric Schrock * vdev_reopen(), we need this value to be present to remember why we 2041069f55e2SEric Schrock * were faulted. 2042069f55e2SEric Schrock */ 2043069f55e2SEric Schrock vd->vdev_label_aux = aux; 2044069f55e2SEric Schrock 20453d7072f8Seschrock /* 20463d7072f8Seschrock * Faulted state takes precedence over degraded. 
20473d7072f8Seschrock */ 20483d7072f8Seschrock vd->vdev_faulted = 1ULL; 20493d7072f8Seschrock vd->vdev_degraded = 0ULL; 2050069f55e2SEric Schrock vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 20513d7072f8Seschrock 20523d7072f8Seschrock /* 20536988b9faSDavid Marker * If marking the vdev as faulted cause the top-level vdev to become 20543d7072f8Seschrock * unavailable, then back off and simply mark the vdev as degraded 20553d7072f8Seschrock * instead. 20563d7072f8Seschrock */ 20578f18d1faSGeorge Wilson if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog && 20588f18d1faSGeorge Wilson vd->vdev_aux == NULL) { 20593d7072f8Seschrock vd->vdev_degraded = 1ULL; 20603d7072f8Seschrock vd->vdev_faulted = 0ULL; 20613d7072f8Seschrock 20623d7072f8Seschrock /* 20633d7072f8Seschrock * If we reopen the device and it's not dead, only then do we 20643d7072f8Seschrock * mark it degraded. 20653d7072f8Seschrock */ 20663d7072f8Seschrock vdev_reopen(vd); 20673d7072f8Seschrock 2068069f55e2SEric Schrock if (vdev_readable(vd)) 2069069f55e2SEric Schrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 20703d7072f8Seschrock } 20713d7072f8Seschrock 2072e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, vd, 0)); 20733d7072f8Seschrock } 20743d7072f8Seschrock 20753d7072f8Seschrock /* 20763d7072f8Seschrock * Mark the given vdev degraded. A degraded vdev is purely an indication to the 20773d7072f8Seschrock * user that something is wrong. The vdev continues to operate as normal as far 20783d7072f8Seschrock * as I/O is concerned. 
20793d7072f8Seschrock */ 20803d7072f8Seschrock int 2081069f55e2SEric Schrock vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 20823d7072f8Seschrock { 2083c5904d13Seschrock vdev_t *vd; 20840a4e9518Sgw 20858f18d1faSGeorge Wilson spa_vdev_state_enter(spa, SCL_NONE); 20863d7072f8Seschrock 2087c5904d13Seschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2088e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2089e14bb325SJeff Bonwick 20900e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 2091e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 20920e34b6a7Sbonwick 20933d7072f8Seschrock /* 20943d7072f8Seschrock * If the vdev is already faulted, then don't do anything. 20953d7072f8Seschrock */ 2096e14bb325SJeff Bonwick if (vd->vdev_faulted || vd->vdev_degraded) 2097e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, 0)); 20983d7072f8Seschrock 20993d7072f8Seschrock vd->vdev_degraded = 1ULL; 21003d7072f8Seschrock if (!vdev_is_dead(vd)) 21013d7072f8Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2102069f55e2SEric Schrock aux); 21033d7072f8Seschrock 2104e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, vd, 0)); 21053d7072f8Seschrock } 21063d7072f8Seschrock 21073d7072f8Seschrock /* 21083d7072f8Seschrock * Online the given vdev. If 'unspare' is set, it implies two things. First, 21093d7072f8Seschrock * any attached spare device should be detached when the device finishes 21103d7072f8Seschrock * resilvering. Second, the online should be treated like a 'test' online case, 21113d7072f8Seschrock * so no FMA events are generated if the device fails to open. 
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

	/* XXX - L2ARC 1.0 does not support expansion */
	if (!vd->vdev_aux) {
		/*
		 * Flag the whole ancestry as expanding so the reopen below
		 * can pick up any new device size.
		 */
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
	}

	vdev_reopen(tvd);
	/* The checkremove/forcefault hints only apply to the reopen above. */
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = B_FALSE;
	}

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

		/* XXX - L2ARC 1.0 does not support expansion */
		if (vd->vdev_aux)
			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}
	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Offline the vdev identified by 'guid'.  Expects spa_vdev_top_lock to be
 * held by the vdev_offline() wrapper below; may drop and re-acquire the
 * vdev state lock (and retry via 'top') while evacuating a log device.
 */
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	generation = spa->spa_config_generation + 1;

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device has the only valid copy of some data,
		 * don't allow it to be offlined. Log devices are always
		 * expendable.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_dtl_required(vd))
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

		/*
		 * If the top-level is a slog and it has had allocations
		 * then proceed.  We check that the vdev's metaslab group
		 * is not NULL since it's possible that we may have just
		 * added this vdev but not yet initialized its metaslabs.
		 */
		if (tvd->vdev_islog && mg != NULL) {
			/*
			 * Prevent any future allocations.
			 */
			metaslab_group_passivate(mg);
			(void) spa_vdev_state_exit(spa, vd, 0);

			/* Evacuate the intent log with the locks dropped. */
			error = spa_offline_log(spa);

			spa_vdev_state_enter(spa, SCL_ALLOC);

			/*
			 * Check to see if the config has changed.
			 */
			if (error || generation != spa->spa_config_generation) {
				metaslab_group_activate(mg);
				if (error)
					return (spa_vdev_state_exit(spa,
					    vd, error));
				/* Config changed under us: retry from scratch. */
				(void) spa_vdev_state_exit(spa, vd, 0);
				goto top;
			}
			ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
		}

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If the top-level vdev is a log device then just offline
		 * it. Otherwise, if this action results in the top-level
		 * vdev becoming unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(tvd);

		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_is_dead(tvd)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(tvd);
			return (spa_vdev_state_exit(spa, NULL, EBUSY));
		}

		/*
		 * Add the device back into the metaslab rotor so that
		 * once we online the device it's open for business.
		 */
		if (tvd->vdev_islog && mg != NULL)
			metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Public wrapper: serialize offline requests with spa_vdev_top_lock.
 */
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
		vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	/* Recursively clear the entire subtree. */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.  We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {

		/*
		 * When reopening in response to a clear event, it may be due to
		 * a fmadm repair request.  In this case, if the device is
		 * still broken, we want to still post the ereport again.
		 */
		vd->vdev_forcefault = B_TRUE;

		vd->vdev_faulted = vd->vdev_degraded = 0;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

		vdev_reopen(vd);

		vd->vdev_forcefault = B_FALSE;

		if (vd != rvd)
			vdev_state_dirty(vd->vdev_top);

		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	 * When clearing a FMA-diagnosed fault, we always want to
	 * unspare the device, as we assume that the original spare was
	 * done in response to the FMA fault.
	 */
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;
}

boolean_t
vdev_is_dead(vdev_t *vd)
{
	/*
	 * Holes and missing devices are always considered "dead".
	 * This simplifies the code since we don't have to check for
	 * these types of devices in the various code paths.
	 * Instead we rely on the fact that we skip over dead devices
	 * before issuing I/O to them.
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
	    vd->vdev_ops == &vdev_missing_ops);
}

/*
 * A vdev is readable if it is alive and not flagged can't-read.
 */
boolean_t
vdev_readable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}

/*
 * A vdev is writeable if it is alive and not flagged can't-write.
 */
boolean_t
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
}

/*
 * Determine whether new allocations may be directed at this vdev.
 */
boolean_t
vdev_allocatable(vdev_t *vd)
{
	uint64_t state = vd->vdev_state;

	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
	 * fails to reopen then we'll catch it later when we're holding
	 * the proper locks.  Note that we have to get the vdev state
	 * in a local variable because although it changes atomically,
	 * we're asking two separate questions about it.
	 */
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing);
}

/*
 * Determine whether the given zio can be issued to this vdev,
 * based on the vdev's health and the zio's direction.
 */
boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
		return (B_FALSE);

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);

	return (B_TRUE);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_min_asize(vd);
	if (vd->vdev_ops->vdev_op_leaf)
		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			mutex_enter(&vd->vdev_stat_lock);
			for (int t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			vs->vs_scrub_examined += cvs->vs_scrub_examined;
			mutex_exit(&vd->vdev_stat_lock);
		}
	}
}

/*
 * Reset the in-core space accounting for this vdev.
 */
void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Update I/O and error statistics (and, for failed writes, the DTLs)
 * for the vdev associated with the given zio.
 */
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			if (flags & ZIO_FLAG_SCRUB_THREAD)
				vs->vs_scrub_repaired += psize;
			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		vs->vs_ops[type]++;
		vs->vs_bytes[type] += psize;

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	/* Error paths below: ignore speculative (prefetch) failures. */
	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent log writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
		if (zio->io_error == ECKSUM)
			vs->vs_checksum_errors++;
		else
			vs->vs_read_errors++;
	}
	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
		vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	if (type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCRUB_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born.  In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCRUB_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}

/*
 * Record the start or completion of a scrub/resilver pass in the
 * per-vdev stats, recursing over all children.
 */
void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

	mutex_enter(&vd->vdev_stat_lock);

	if (type == POOL_SCRUB_NONE) {
		/*
		 * Update completion and end time.  Leave everything else alone
		 * so we can report what happened during the previous scrub.
		 */
		vs->vs_scrub_complete = complete;
		vs->vs_scrub_end = gethrestime_sec();
	} else {
		vs->vs_scrub_type = type;
		vs->vs_scrub_complete = 0;
		vs->vs_scrub_examined = 0;
		vs->vs_scrub_repaired = 0;
		vs->vs_scrub_start = gethrestime_sec();
		vs->vs_scrub_end = 0;
	}

	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Update the in-core space usage stats for this vdev, its metaslab class,
 * and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta)
{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_class_t *mc = mg ? mg->mg_class : NULL;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * childrens', thus not accurate enough for us.
	 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	/* Only normal-class space is reflected in the root vdev's totals. */
	if (mc == spa_normal_class(spa)) {
		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}

	if (mc != NULL) {
		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		metaslab_class_space_update(mc,
		    alloc_delta, defer_delta, space_delta, dspace_delta);
	}
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	/*
	 * If this is an aux vdev (as with l2cache and spare devices), then we
	 * update the vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		if (c == sav->sav_count) {
			/*
			 * We're being removed.  There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);
			return;
		}

		sav->sav_sync = B_TRUE;

		/* The aux config is either the l2cache or the spares array. */
		if (nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
		}

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);

		return;
	}

	/*
	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_config_dirty_node) &&
		    !vd->vdev_ishole)
			list_insert_head(&spa->spa_config_dirty_list, vd);
	}
}

/*
 * Remove a top-level vdev from the config dirty list once its
 * configuration has been synced.  Locking rules mirror
 * vdev_config_dirty() above.
 */
void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
}

/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty().  We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
void
vdev_state_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock.  The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
		list_insert_head(&spa->spa_state_dirty_list, vd);
}

/*
 * Remove a top-level vdev from the state dirty list.  Locking rules
 * mirror vdev_state_dirty() above.
 */
void
vdev_state_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes into the decision.
			 */
			if (child->vdev_ishole)
				continue;

			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state.
If this is during an open, we don't update the parent 2844ea8dc4b6Seschrock * state, because we're in the process of opening children depth-first. 2845ea8dc4b6Seschrock * Otherwise, we propagate the change to the parent. 2846ea8dc4b6Seschrock * 2847ea8dc4b6Seschrock * If this routine places a device in a faulted state, an appropriate ereport is 2848ea8dc4b6Seschrock * generated. 2849fa9e4066Sahrens */ 2850fa9e4066Sahrens void 2851ea8dc4b6Seschrock vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 2852fa9e4066Sahrens { 2853560e6e96Seschrock uint64_t save_state; 2854c5904d13Seschrock spa_t *spa = vd->vdev_spa; 2855ea8dc4b6Seschrock 2856ea8dc4b6Seschrock if (state == vd->vdev_state) { 2857ea8dc4b6Seschrock vd->vdev_stat.vs_aux = aux; 2858fa9e4066Sahrens return; 2859ea8dc4b6Seschrock } 2860ea8dc4b6Seschrock 2861560e6e96Seschrock save_state = vd->vdev_state; 2862fa9e4066Sahrens 2863fa9e4066Sahrens vd->vdev_state = state; 2864fa9e4066Sahrens vd->vdev_stat.vs_aux = aux; 2865fa9e4066Sahrens 28663d7072f8Seschrock /* 28673d7072f8Seschrock * If we are setting the vdev state to anything but an open state, then 28683d7072f8Seschrock * always close the underlying device. Otherwise, we keep accessible 28693d7072f8Seschrock * but invalid devices open forever. We don't call vdev_close() itself, 28703d7072f8Seschrock * because that implies some extra checks (offline, etc) that we don't 28713d7072f8Seschrock * want here. This is limited to leaf devices, because otherwise 28723d7072f8Seschrock * closing the device will affect other children. 
28733d7072f8Seschrock */ 2874cbd2b15eSJeff Bonwick if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) 28753d7072f8Seschrock vd->vdev_ops->vdev_op_close(vd); 28763d7072f8Seschrock 2877069f55e2SEric Schrock /* 2878069f55e2SEric Schrock * If we have brought this vdev back into service, we need 2879069f55e2SEric Schrock * to notify fmd so that it can gracefully repair any outstanding 2880069f55e2SEric Schrock * cases due to a missing device. We do this in all cases, even those 2881069f55e2SEric Schrock * that probably don't correlate to a repaired fault. This is sure to 2882069f55e2SEric Schrock * catch all cases, and we let the zfs-retire agent sort it out. If 2883069f55e2SEric Schrock * this is a transient state it's OK, as the retire agent will 2884069f55e2SEric Schrock * double-check the state of the vdev before repairing it. 2885069f55e2SEric Schrock */ 2886069f55e2SEric Schrock if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && 2887069f55e2SEric Schrock vd->vdev_prevstate != state) 2888069f55e2SEric Schrock zfs_post_state_change(spa, vd); 2889069f55e2SEric Schrock 28903d7072f8Seschrock if (vd->vdev_removed && 28913d7072f8Seschrock state == VDEV_STATE_CANT_OPEN && 28923d7072f8Seschrock (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 28933d7072f8Seschrock /* 28943d7072f8Seschrock * If the previous state is set to VDEV_STATE_REMOVED, then this 28953d7072f8Seschrock * device was previously marked removed and someone attempted to 28963d7072f8Seschrock * reopen it. If this failed due to a nonexistent device, then 28973d7072f8Seschrock * keep the device in the REMOVED state. We also let this be if 28983d7072f8Seschrock * it is one of our special test online cases, which is only 28993d7072f8Seschrock * attempting to online the device and shouldn't generate an FMA 29003d7072f8Seschrock * fault. 
29013d7072f8Seschrock */ 29023d7072f8Seschrock vd->vdev_state = VDEV_STATE_REMOVED; 29033d7072f8Seschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 29043d7072f8Seschrock } else if (state == VDEV_STATE_REMOVED) { 29053d7072f8Seschrock vd->vdev_removed = B_TRUE; 29063d7072f8Seschrock } else if (state == VDEV_STATE_CANT_OPEN) { 2907ea8dc4b6Seschrock /* 2908ea8dc4b6Seschrock * If we fail to open a vdev during an import, we mark it as 2909ea8dc4b6Seschrock * "not available", which signifies that it was never there to 2910ea8dc4b6Seschrock * begin with. Failure to open such a device is not considered 2911ea8dc4b6Seschrock * an error. 2912ea8dc4b6Seschrock */ 2913b16da2e2SGeorge Wilson if (spa_load_state(spa) == SPA_LOAD_IMPORT && 2914560e6e96Seschrock vd->vdev_ops->vdev_op_leaf) 2915560e6e96Seschrock vd->vdev_not_present = 1; 2916560e6e96Seschrock 2917560e6e96Seschrock /* 2918560e6e96Seschrock * Post the appropriate ereport. If the 'prevstate' field is 2919560e6e96Seschrock * set to something other than VDEV_STATE_UNKNOWN, it indicates 2920560e6e96Seschrock * that this is part of a vdev_reopen(). In this case, we don't 2921560e6e96Seschrock * want to post the ereport if the device was already in the 2922560e6e96Seschrock * CANT_OPEN state beforehand. 29233d7072f8Seschrock * 29243d7072f8Seschrock * If the 'checkremove' flag is set, then this is an attempt to 29253d7072f8Seschrock * online the device in response to an insertion event. If we 29263d7072f8Seschrock * hit this case, then we have detected an insertion event for a 29273d7072f8Seschrock * faulted or offline device that wasn't in the removed state. 29283d7072f8Seschrock * In this scenario, we don't post an ereport because we are 29293d7072f8Seschrock * about to replace the device, or attempt an online with 29303d7072f8Seschrock * vdev_forcefault, which will generate the fault for us. 
2931560e6e96Seschrock */ 29323d7072f8Seschrock if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 29333d7072f8Seschrock !vd->vdev_not_present && !vd->vdev_checkremove && 2934c5904d13Seschrock vd != spa->spa_root_vdev) { 2935ea8dc4b6Seschrock const char *class; 2936ea8dc4b6Seschrock 2937ea8dc4b6Seschrock switch (aux) { 2938ea8dc4b6Seschrock case VDEV_AUX_OPEN_FAILED: 2939ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 2940ea8dc4b6Seschrock break; 2941ea8dc4b6Seschrock case VDEV_AUX_CORRUPT_DATA: 2942ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 2943ea8dc4b6Seschrock break; 2944ea8dc4b6Seschrock case VDEV_AUX_NO_REPLICAS: 2945ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 2946ea8dc4b6Seschrock break; 2947ea8dc4b6Seschrock case VDEV_AUX_BAD_GUID_SUM: 2948ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 2949ea8dc4b6Seschrock break; 2950ea8dc4b6Seschrock case VDEV_AUX_TOO_SMALL: 2951ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 2952ea8dc4b6Seschrock break; 2953ea8dc4b6Seschrock case VDEV_AUX_BAD_LABEL: 2954ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 2955ea8dc4b6Seschrock break; 2956e14bb325SJeff Bonwick case VDEV_AUX_IO_FAILURE: 2957e14bb325SJeff Bonwick class = FM_EREPORT_ZFS_IO_FAILURE; 2958e14bb325SJeff Bonwick break; 2959ea8dc4b6Seschrock default: 2960ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 2961ea8dc4b6Seschrock } 2962ea8dc4b6Seschrock 2963c5904d13Seschrock zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 2964ea8dc4b6Seschrock } 2965ea8dc4b6Seschrock 29663d7072f8Seschrock /* Erase any notion of persistent removed state */ 29673d7072f8Seschrock vd->vdev_removed = B_FALSE; 29683d7072f8Seschrock } else { 29693d7072f8Seschrock vd->vdev_removed = B_FALSE; 29703d7072f8Seschrock } 2971ea8dc4b6Seschrock 29728b33d774STim Haley if (!isopen && vd->vdev_parent) 29738b33d774STim Haley vdev_propagate_state(vd->vdev_parent); 2974fa9e4066Sahrens } 297515e6edf1Sgw 
297615e6edf1Sgw /* 297715e6edf1Sgw * Check the vdev configuration to ensure that it's capable of supporting 297815e6edf1Sgw * a root pool. Currently, we do not support RAID-Z or partial configuration. 297915e6edf1Sgw * In addition, only a single top-level vdev is allowed and none of the leaves 298015e6edf1Sgw * can be wholedisks. 298115e6edf1Sgw */ 298215e6edf1Sgw boolean_t 298315e6edf1Sgw vdev_is_bootable(vdev_t *vd) 298415e6edf1Sgw { 298515e6edf1Sgw if (!vd->vdev_ops->vdev_op_leaf) { 298615e6edf1Sgw char *vdev_type = vd->vdev_ops->vdev_op_type; 298715e6edf1Sgw 298815e6edf1Sgw if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 298915e6edf1Sgw vd->vdev_children > 1) { 299015e6edf1Sgw return (B_FALSE); 299115e6edf1Sgw } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 299215e6edf1Sgw strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 299315e6edf1Sgw return (B_FALSE); 299415e6edf1Sgw } 299515e6edf1Sgw } else if (vd->vdev_wholedisk == 1) { 299615e6edf1Sgw return (B_FALSE); 299715e6edf1Sgw } 299815e6edf1Sgw 2999573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) { 300015e6edf1Sgw if (!vdev_is_bootable(vd->vdev_child[c])) 300115e6edf1Sgw return (B_FALSE); 300215e6edf1Sgw } 300315e6edf1Sgw return (B_TRUE); 300415e6edf1Sgw } 3005e6ca193dSGeorge Wilson 300688ecc943SGeorge Wilson /* 300788ecc943SGeorge Wilson * Load the state from the original vdev tree (ovd) which 300888ecc943SGeorge Wilson * we've retrieved from the MOS config object. If the original 300988ecc943SGeorge Wilson * vdev was offline then we transfer that state to the device 301088ecc943SGeorge Wilson * in the current vdev tree (nvd). 
301188ecc943SGeorge Wilson */ 3012e6ca193dSGeorge Wilson void 301388ecc943SGeorge Wilson vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3014e6ca193dSGeorge Wilson { 301588ecc943SGeorge Wilson spa_t *spa = nvd->vdev_spa; 3016e6ca193dSGeorge Wilson 301788ecc943SGeorge Wilson ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 301888ecc943SGeorge Wilson ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3019e6ca193dSGeorge Wilson 302088ecc943SGeorge Wilson for (int c = 0; c < nvd->vdev_children; c++) 302188ecc943SGeorge Wilson vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3022e6ca193dSGeorge Wilson 302388ecc943SGeorge Wilson if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) { 3024e6ca193dSGeorge Wilson /* 3025e6ca193dSGeorge Wilson * It would be nice to call vdev_offline() 3026e6ca193dSGeorge Wilson * directly but the pool isn't fully loaded and 3027e6ca193dSGeorge Wilson * the txg threads have not been started yet. 3028e6ca193dSGeorge Wilson */ 302988ecc943SGeorge Wilson nvd->vdev_offline = ovd->vdev_offline; 303088ecc943SGeorge Wilson vdev_reopen(nvd->vdev_top); 3031e6ca193dSGeorge Wilson } 3032e6ca193dSGeorge Wilson } 3033573ca77eSGeorge Wilson 3034573ca77eSGeorge Wilson /* 3035573ca77eSGeorge Wilson * Expand a vdev if possible. 3036573ca77eSGeorge Wilson */ 3037573ca77eSGeorge Wilson void 3038573ca77eSGeorge Wilson vdev_expand(vdev_t *vd, uint64_t txg) 3039573ca77eSGeorge Wilson { 3040573ca77eSGeorge Wilson ASSERT(vd->vdev_top == vd); 3041573ca77eSGeorge Wilson ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3042573ca77eSGeorge Wilson 3043573ca77eSGeorge Wilson if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3044573ca77eSGeorge Wilson VERIFY(vdev_metaslab_init(vd, txg) == 0); 3045573ca77eSGeorge Wilson vdev_config_dirty(vd); 3046573ca77eSGeorge Wilson } 3047573ca77eSGeorge Wilson } 3048*1195e687SMark J Musante 3049*1195e687SMark J Musante /* 3050*1195e687SMark J Musante * Split a vdev. 
3051*1195e687SMark J Musante */ 3052*1195e687SMark J Musante void 3053*1195e687SMark J Musante vdev_split(vdev_t *vd) 3054*1195e687SMark J Musante { 3055*1195e687SMark J Musante vdev_t *cvd, *pvd = vd->vdev_parent; 3056*1195e687SMark J Musante 3057*1195e687SMark J Musante vdev_remove_child(pvd, vd); 3058*1195e687SMark J Musante vdev_compact_children(pvd); 3059*1195e687SMark J Musante 3060*1195e687SMark J Musante cvd = pvd->vdev_child[0]; 3061*1195e687SMark J Musante if (pvd->vdev_children == 1) { 3062*1195e687SMark J Musante vdev_remove_parent(cvd); 3063*1195e687SMark J Musante cvd->vdev_splitting = B_TRUE; 3064*1195e687SMark J Musante } 3065*1195e687SMark J Musante vdev_propagate_state(cvd); 3066*1195e687SMark J Musante } 3067