1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5441d80aaSlling * Common Development and Distribution License (the "License"). 6441d80aaSlling * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 2199653d4eSeschrock 22fa9e4066Sahrens /* 2339c23413Seschrock * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24fa9e4066Sahrens * Use is subject to license terms. 
25fa9e4066Sahrens */ 26fa9e4066Sahrens 27fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28fa9e4066Sahrens 29fa9e4066Sahrens #include <sys/zfs_context.h> 30ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h> 31fa9e4066Sahrens #include <sys/spa.h> 32fa9e4066Sahrens #include <sys/spa_impl.h> 33fa9e4066Sahrens #include <sys/dmu.h> 34fa9e4066Sahrens #include <sys/dmu_tx.h> 35fa9e4066Sahrens #include <sys/vdev_impl.h> 36fa9e4066Sahrens #include <sys/uberblock_impl.h> 37fa9e4066Sahrens #include <sys/metaslab.h> 38fa9e4066Sahrens #include <sys/metaslab_impl.h> 39fa9e4066Sahrens #include <sys/space_map.h> 40fa9e4066Sahrens #include <sys/zio.h> 41fa9e4066Sahrens #include <sys/zap.h> 42fa9e4066Sahrens #include <sys/fs/zfs.h> 43fa9e4066Sahrens 44fa9e4066Sahrens /* 45fa9e4066Sahrens * Virtual device management. 46fa9e4066Sahrens */ 47fa9e4066Sahrens 48fa9e4066Sahrens static vdev_ops_t *vdev_ops_table[] = { 49fa9e4066Sahrens &vdev_root_ops, 50fa9e4066Sahrens &vdev_raidz_ops, 51fa9e4066Sahrens &vdev_mirror_ops, 52fa9e4066Sahrens &vdev_replacing_ops, 5399653d4eSeschrock &vdev_spare_ops, 54fa9e4066Sahrens &vdev_disk_ops, 55fa9e4066Sahrens &vdev_file_ops, 56fa9e4066Sahrens &vdev_missing_ops, 57fa9e4066Sahrens NULL 58fa9e4066Sahrens }; 59fa9e4066Sahrens 6005b2b3b8Smishra /* maximum scrub/resilver I/O queue */ 6105b2b3b8Smishra int zfs_scrub_limit = 70; 6205b2b3b8Smishra 63fa9e4066Sahrens /* 64fa9e4066Sahrens * Given a vdev type, return the appropriate ops vector. 
65fa9e4066Sahrens */ 66fa9e4066Sahrens static vdev_ops_t * 67fa9e4066Sahrens vdev_getops(const char *type) 68fa9e4066Sahrens { 69fa9e4066Sahrens vdev_ops_t *ops, **opspp; 70fa9e4066Sahrens 71fa9e4066Sahrens for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 72fa9e4066Sahrens if (strcmp(ops->vdev_op_type, type) == 0) 73fa9e4066Sahrens break; 74fa9e4066Sahrens 75fa9e4066Sahrens return (ops); 76fa9e4066Sahrens } 77fa9e4066Sahrens 78fa9e4066Sahrens /* 79fa9e4066Sahrens * Default asize function: return the MAX of psize with the asize of 80fa9e4066Sahrens * all children. This is what's used by anything other than RAID-Z. 81fa9e4066Sahrens */ 82fa9e4066Sahrens uint64_t 83fa9e4066Sahrens vdev_default_asize(vdev_t *vd, uint64_t psize) 84fa9e4066Sahrens { 85ecc2d604Sbonwick uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 86fa9e4066Sahrens uint64_t csize; 87fa9e4066Sahrens uint64_t c; 88fa9e4066Sahrens 89fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) { 90fa9e4066Sahrens csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 91fa9e4066Sahrens asize = MAX(asize, csize); 92fa9e4066Sahrens } 93fa9e4066Sahrens 94fa9e4066Sahrens return (asize); 95fa9e4066Sahrens } 96fa9e4066Sahrens 972a79c5feSlling /* 982a79c5feSlling * Get the replaceable or attachable device size. 992a79c5feSlling * If the parent is a mirror or raidz, the replaceable size is the minimum 1002a79c5feSlling * psize of all its children. For the rest, just return our own psize. 1012a79c5feSlling * 1022a79c5feSlling * e.g. 
1032a79c5feSlling * psize rsize 1042a79c5feSlling * root - - 1052a79c5feSlling * mirror/raidz - - 1062a79c5feSlling * disk1 20g 20g 1072a79c5feSlling * disk2 40g 20g 1082a79c5feSlling * disk3 80g 80g 1092a79c5feSlling */ 1102a79c5feSlling uint64_t 1112a79c5feSlling vdev_get_rsize(vdev_t *vd) 1122a79c5feSlling { 1132a79c5feSlling vdev_t *pvd, *cvd; 1142a79c5feSlling uint64_t c, rsize; 1152a79c5feSlling 1162a79c5feSlling pvd = vd->vdev_parent; 1172a79c5feSlling 1182a79c5feSlling /* 1192a79c5feSlling * If our parent is NULL or the root, just return our own psize. 1202a79c5feSlling */ 1212a79c5feSlling if (pvd == NULL || pvd->vdev_parent == NULL) 1222a79c5feSlling return (vd->vdev_psize); 1232a79c5feSlling 1242a79c5feSlling rsize = 0; 1252a79c5feSlling 1262a79c5feSlling for (c = 0; c < pvd->vdev_children; c++) { 1272a79c5feSlling cvd = pvd->vdev_child[c]; 1282a79c5feSlling rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; 1292a79c5feSlling } 1302a79c5feSlling 1312a79c5feSlling return (rsize); 1322a79c5feSlling } 1332a79c5feSlling 134fa9e4066Sahrens vdev_t * 135fa9e4066Sahrens vdev_lookup_top(spa_t *spa, uint64_t vdev) 136fa9e4066Sahrens { 137fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 138fa9e4066Sahrens 139fa9e4066Sahrens if (vdev < rvd->vdev_children) 140fa9e4066Sahrens return (rvd->vdev_child[vdev]); 141fa9e4066Sahrens 142fa9e4066Sahrens return (NULL); 143fa9e4066Sahrens } 144fa9e4066Sahrens 145fa9e4066Sahrens vdev_t * 146fa9e4066Sahrens vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 147fa9e4066Sahrens { 148fa9e4066Sahrens int c; 149fa9e4066Sahrens vdev_t *mvd; 150fa9e4066Sahrens 1510e34b6a7Sbonwick if (vd->vdev_guid == guid) 152fa9e4066Sahrens return (vd); 153fa9e4066Sahrens 154fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 155fa9e4066Sahrens if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 156fa9e4066Sahrens NULL) 157fa9e4066Sahrens return (mvd); 158fa9e4066Sahrens 159fa9e4066Sahrens return (NULL); 160fa9e4066Sahrens } 161fa9e4066Sahrens 
162fa9e4066Sahrens void 163fa9e4066Sahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd) 164fa9e4066Sahrens { 165fa9e4066Sahrens size_t oldsize, newsize; 166fa9e4066Sahrens uint64_t id = cvd->vdev_id; 167fa9e4066Sahrens vdev_t **newchild; 168fa9e4066Sahrens 169fa9e4066Sahrens ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); 170fa9e4066Sahrens ASSERT(cvd->vdev_parent == NULL); 171fa9e4066Sahrens 172fa9e4066Sahrens cvd->vdev_parent = pvd; 173fa9e4066Sahrens 174fa9e4066Sahrens if (pvd == NULL) 175fa9e4066Sahrens return; 176fa9e4066Sahrens 177fa9e4066Sahrens ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 178fa9e4066Sahrens 179fa9e4066Sahrens oldsize = pvd->vdev_children * sizeof (vdev_t *); 180fa9e4066Sahrens pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 181fa9e4066Sahrens newsize = pvd->vdev_children * sizeof (vdev_t *); 182fa9e4066Sahrens 183fa9e4066Sahrens newchild = kmem_zalloc(newsize, KM_SLEEP); 184fa9e4066Sahrens if (pvd->vdev_child != NULL) { 185fa9e4066Sahrens bcopy(pvd->vdev_child, newchild, oldsize); 186fa9e4066Sahrens kmem_free(pvd->vdev_child, oldsize); 187fa9e4066Sahrens } 188fa9e4066Sahrens 189fa9e4066Sahrens pvd->vdev_child = newchild; 190fa9e4066Sahrens pvd->vdev_child[id] = cvd; 191fa9e4066Sahrens 192fa9e4066Sahrens cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 193fa9e4066Sahrens ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 194fa9e4066Sahrens 195fa9e4066Sahrens /* 196fa9e4066Sahrens * Walk up all ancestors to update guid sum. 
197fa9e4066Sahrens */ 198fa9e4066Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 199fa9e4066Sahrens pvd->vdev_guid_sum += cvd->vdev_guid_sum; 20005b2b3b8Smishra 20105b2b3b8Smishra if (cvd->vdev_ops->vdev_op_leaf) 20205b2b3b8Smishra cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; 203fa9e4066Sahrens } 204fa9e4066Sahrens 205fa9e4066Sahrens void 206fa9e4066Sahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 207fa9e4066Sahrens { 208fa9e4066Sahrens int c; 209fa9e4066Sahrens uint_t id = cvd->vdev_id; 210fa9e4066Sahrens 211fa9e4066Sahrens ASSERT(cvd->vdev_parent == pvd); 212fa9e4066Sahrens 213fa9e4066Sahrens if (pvd == NULL) 214fa9e4066Sahrens return; 215fa9e4066Sahrens 216fa9e4066Sahrens ASSERT(id < pvd->vdev_children); 217fa9e4066Sahrens ASSERT(pvd->vdev_child[id] == cvd); 218fa9e4066Sahrens 219fa9e4066Sahrens pvd->vdev_child[id] = NULL; 220fa9e4066Sahrens cvd->vdev_parent = NULL; 221fa9e4066Sahrens 222fa9e4066Sahrens for (c = 0; c < pvd->vdev_children; c++) 223fa9e4066Sahrens if (pvd->vdev_child[c]) 224fa9e4066Sahrens break; 225fa9e4066Sahrens 226fa9e4066Sahrens if (c == pvd->vdev_children) { 227fa9e4066Sahrens kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 228fa9e4066Sahrens pvd->vdev_child = NULL; 229fa9e4066Sahrens pvd->vdev_children = 0; 230fa9e4066Sahrens } 231fa9e4066Sahrens 232fa9e4066Sahrens /* 233fa9e4066Sahrens * Walk up all ancestors to update guid sum. 234fa9e4066Sahrens */ 235fa9e4066Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 236fa9e4066Sahrens pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 23705b2b3b8Smishra 23805b2b3b8Smishra if (cvd->vdev_ops->vdev_op_leaf) 23905b2b3b8Smishra cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; 240fa9e4066Sahrens } 241fa9e4066Sahrens 242fa9e4066Sahrens /* 243fa9e4066Sahrens * Remove any holes in the child array. 
244fa9e4066Sahrens */ 245fa9e4066Sahrens void 246fa9e4066Sahrens vdev_compact_children(vdev_t *pvd) 247fa9e4066Sahrens { 248fa9e4066Sahrens vdev_t **newchild, *cvd; 249fa9e4066Sahrens int oldc = pvd->vdev_children; 250fa9e4066Sahrens int newc, c; 251fa9e4066Sahrens 252fa9e4066Sahrens ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); 253fa9e4066Sahrens 254fa9e4066Sahrens for (c = newc = 0; c < oldc; c++) 255fa9e4066Sahrens if (pvd->vdev_child[c]) 256fa9e4066Sahrens newc++; 257fa9e4066Sahrens 258fa9e4066Sahrens newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 259fa9e4066Sahrens 260fa9e4066Sahrens for (c = newc = 0; c < oldc; c++) { 261fa9e4066Sahrens if ((cvd = pvd->vdev_child[c]) != NULL) { 262fa9e4066Sahrens newchild[newc] = cvd; 263fa9e4066Sahrens cvd->vdev_id = newc++; 264fa9e4066Sahrens } 265fa9e4066Sahrens } 266fa9e4066Sahrens 267fa9e4066Sahrens kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 268fa9e4066Sahrens pvd->vdev_child = newchild; 269fa9e4066Sahrens pvd->vdev_children = newc; 270fa9e4066Sahrens } 271fa9e4066Sahrens 272fa9e4066Sahrens /* 273fa9e4066Sahrens * Allocate and minimally initialize a vdev_t. 274fa9e4066Sahrens */ 275fa9e4066Sahrens static vdev_t * 276fa9e4066Sahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 277fa9e4066Sahrens { 278fa9e4066Sahrens vdev_t *vd; 279fa9e4066Sahrens 280fa9e4066Sahrens vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 281fa9e4066Sahrens 2820e34b6a7Sbonwick if (spa->spa_root_vdev == NULL) { 2830e34b6a7Sbonwick ASSERT(ops == &vdev_root_ops); 2840e34b6a7Sbonwick spa->spa_root_vdev = vd; 2850e34b6a7Sbonwick } 2860e34b6a7Sbonwick 2870e34b6a7Sbonwick if (guid == 0) { 2880e34b6a7Sbonwick if (spa->spa_root_vdev == vd) { 2890e34b6a7Sbonwick /* 2900e34b6a7Sbonwick * The root vdev's guid will also be the pool guid, 2910e34b6a7Sbonwick * which must be unique among all pools. 
2920e34b6a7Sbonwick */ 2930e34b6a7Sbonwick while (guid == 0 || spa_guid_exists(guid, 0)) 2940e34b6a7Sbonwick guid = spa_get_random(-1ULL); 2950e34b6a7Sbonwick } else { 2960e34b6a7Sbonwick /* 2970e34b6a7Sbonwick * Any other vdev's guid must be unique within the pool. 2980e34b6a7Sbonwick */ 2990e34b6a7Sbonwick while (guid == 0 || 3000e34b6a7Sbonwick spa_guid_exists(spa_guid(spa), guid)) 3010e34b6a7Sbonwick guid = spa_get_random(-1ULL); 3020e34b6a7Sbonwick } 3030e34b6a7Sbonwick ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 3040e34b6a7Sbonwick } 3050e34b6a7Sbonwick 306fa9e4066Sahrens vd->vdev_spa = spa; 307fa9e4066Sahrens vd->vdev_id = id; 308fa9e4066Sahrens vd->vdev_guid = guid; 309fa9e4066Sahrens vd->vdev_guid_sum = guid; 310fa9e4066Sahrens vd->vdev_ops = ops; 311fa9e4066Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 312fa9e4066Sahrens 313fa9e4066Sahrens mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 3145ad82045Snd mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 315fa9e4066Sahrens space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); 316fa9e4066Sahrens space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); 317fa9e4066Sahrens txg_list_create(&vd->vdev_ms_list, 318fa9e4066Sahrens offsetof(struct metaslab, ms_txg_node)); 319fa9e4066Sahrens txg_list_create(&vd->vdev_dtl_list, 320fa9e4066Sahrens offsetof(struct vdev, vdev_dtl_node)); 321fa9e4066Sahrens vd->vdev_stat.vs_timestamp = gethrtime(); 3223d7072f8Seschrock vdev_queue_init(vd); 3233d7072f8Seschrock vdev_cache_init(vd); 324fa9e4066Sahrens 325fa9e4066Sahrens return (vd); 326fa9e4066Sahrens } 327fa9e4066Sahrens 328fa9e4066Sahrens /* 329fa9e4066Sahrens * Allocate a new vdev. The 'alloctype' is used to control whether we are 330fa9e4066Sahrens * creating a new vdev or loading an existing one - the behavior is slightly 331fa9e4066Sahrens * different for each case. 
332fa9e4066Sahrens */ 33399653d4eSeschrock int 33499653d4eSeschrock vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 33599653d4eSeschrock int alloctype) 336fa9e4066Sahrens { 337fa9e4066Sahrens vdev_ops_t *ops; 338fa9e4066Sahrens char *type; 339*8654d025Sperrin uint64_t guid = 0, islog, nparity; 340fa9e4066Sahrens vdev_t *vd; 341fa9e4066Sahrens 342fa9e4066Sahrens ASSERT(spa_config_held(spa, RW_WRITER)); 343fa9e4066Sahrens 344fa9e4066Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 34599653d4eSeschrock return (EINVAL); 346fa9e4066Sahrens 347fa9e4066Sahrens if ((ops = vdev_getops(type)) == NULL) 34899653d4eSeschrock return (EINVAL); 349fa9e4066Sahrens 350fa9e4066Sahrens /* 351fa9e4066Sahrens * If this is a load, get the vdev guid from the nvlist. 352fa9e4066Sahrens * Otherwise, vdev_alloc_common() will generate one for us. 353fa9e4066Sahrens */ 354fa9e4066Sahrens if (alloctype == VDEV_ALLOC_LOAD) { 355fa9e4066Sahrens uint64_t label_id; 356fa9e4066Sahrens 357fa9e4066Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 358fa9e4066Sahrens label_id != id) 35999653d4eSeschrock return (EINVAL); 360fa9e4066Sahrens 361fa9e4066Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 36299653d4eSeschrock return (EINVAL); 36399653d4eSeschrock } else if (alloctype == VDEV_ALLOC_SPARE) { 36499653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 36599653d4eSeschrock return (EINVAL); 366fa9e4066Sahrens } 367fa9e4066Sahrens 36899653d4eSeschrock /* 36999653d4eSeschrock * The first allocated vdev must be of type 'root'. 37099653d4eSeschrock */ 37199653d4eSeschrock if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 37299653d4eSeschrock return (EINVAL); 37399653d4eSeschrock 374*8654d025Sperrin /* 375*8654d025Sperrin * Determine whether we're a log vdev. 
376*8654d025Sperrin */ 377*8654d025Sperrin islog = 0; 378*8654d025Sperrin (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 379*8654d025Sperrin if (islog && spa_version(spa) < ZFS_VERSION_SLOGS) 380*8654d025Sperrin return (ENOTSUP); 381fa9e4066Sahrens 38299653d4eSeschrock /* 383*8654d025Sperrin * Set the nparity property for RAID-Z vdevs. 38499653d4eSeschrock */ 385*8654d025Sperrin nparity = -1ULL; 38699653d4eSeschrock if (ops == &vdev_raidz_ops) { 38799653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 388*8654d025Sperrin &nparity) == 0) { 38999653d4eSeschrock /* 39099653d4eSeschrock * Currently, we can only support 2 parity devices. 39199653d4eSeschrock */ 392*8654d025Sperrin if (nparity == 0 || nparity > 2) 39399653d4eSeschrock return (EINVAL); 39499653d4eSeschrock /* 39599653d4eSeschrock * Older versions can only support 1 parity device. 39699653d4eSeschrock */ 397*8654d025Sperrin if (nparity == 2 && 39899653d4eSeschrock spa_version(spa) < ZFS_VERSION_RAID6) 39999653d4eSeschrock return (ENOTSUP); 40099653d4eSeschrock } else { 40199653d4eSeschrock /* 40299653d4eSeschrock * We require the parity to be specified for SPAs that 40399653d4eSeschrock * support multiple parity levels. 40499653d4eSeschrock */ 40599653d4eSeschrock if (spa_version(spa) >= ZFS_VERSION_RAID6) 40699653d4eSeschrock return (EINVAL); 40799653d4eSeschrock /* 40899653d4eSeschrock * Otherwise, we default to 1 parity device for RAID-Z. 
40999653d4eSeschrock */ 410*8654d025Sperrin nparity = 1; 41199653d4eSeschrock } 41299653d4eSeschrock } else { 413*8654d025Sperrin nparity = 0; 41499653d4eSeschrock } 415*8654d025Sperrin ASSERT(nparity != -1ULL); 416*8654d025Sperrin 417*8654d025Sperrin vd = vdev_alloc_common(spa, id, guid, ops); 418*8654d025Sperrin 419*8654d025Sperrin vd->vdev_islog = islog; 420*8654d025Sperrin vd->vdev_nparity = nparity; 421*8654d025Sperrin 422*8654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 423*8654d025Sperrin vd->vdev_path = spa_strdup(vd->vdev_path); 424*8654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 425*8654d025Sperrin vd->vdev_devid = spa_strdup(vd->vdev_devid); 426*8654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 427*8654d025Sperrin &vd->vdev_physpath) == 0) 428*8654d025Sperrin vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 42999653d4eSeschrock 430afefbcddSeschrock /* 431afefbcddSeschrock * Set the whole_disk property. If it's not specified, leave the value 432afefbcddSeschrock * as -1. 433afefbcddSeschrock */ 434afefbcddSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 435afefbcddSeschrock &vd->vdev_wholedisk) != 0) 436afefbcddSeschrock vd->vdev_wholedisk = -1ULL; 437afefbcddSeschrock 438ea8dc4b6Seschrock /* 439ea8dc4b6Seschrock * Look for the 'not present' flag. This will only be set if the device 440ea8dc4b6Seschrock * was not present at the time of import. 441ea8dc4b6Seschrock */ 442ea8dc4b6Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 443ea8dc4b6Seschrock &vd->vdev_not_present); 444ea8dc4b6Seschrock 445ecc2d604Sbonwick /* 446ecc2d604Sbonwick * Get the alignment requirement. 447ecc2d604Sbonwick */ 448ecc2d604Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 449ecc2d604Sbonwick 450fa9e4066Sahrens /* 451fa9e4066Sahrens * If we're a top-level vdev, try to load the allocation parameters. 
452fa9e4066Sahrens */ 453fa9e4066Sahrens if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { 454fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 455fa9e4066Sahrens &vd->vdev_ms_array); 456fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 457fa9e4066Sahrens &vd->vdev_ms_shift); 458fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 459fa9e4066Sahrens &vd->vdev_asize); 460fa9e4066Sahrens } 461fa9e4066Sahrens 462fa9e4066Sahrens /* 4633d7072f8Seschrock * If we're a leaf vdev, try to load the DTL object and other state. 464fa9e4066Sahrens */ 465fa9e4066Sahrens if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { 466fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 467fa9e4066Sahrens &vd->vdev_dtl.smo_object); 468ecc2d604Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 469ecc2d604Sbonwick &vd->vdev_offline); 4703d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 4713d7072f8Seschrock &vd->vdev_unspare); 4723d7072f8Seschrock /* 4733d7072f8Seschrock * When importing a pool, we want to ignore the persistent fault 4743d7072f8Seschrock * state, as the diagnosis made on another system may not be 4753d7072f8Seschrock * valid in the current context. 4763d7072f8Seschrock */ 4773d7072f8Seschrock if (spa->spa_load_state == SPA_LOAD_OPEN) { 4783d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 4793d7072f8Seschrock &vd->vdev_faulted); 4803d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 4813d7072f8Seschrock &vd->vdev_degraded); 4823d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 4833d7072f8Seschrock &vd->vdev_removed); 4843d7072f8Seschrock } 485fa9e4066Sahrens } 486fa9e4066Sahrens 487fa9e4066Sahrens /* 488fa9e4066Sahrens * Add ourselves to the parent's list of children. 
489fa9e4066Sahrens */ 490fa9e4066Sahrens vdev_add_child(parent, vd); 491fa9e4066Sahrens 49299653d4eSeschrock *vdp = vd; 49399653d4eSeschrock 49499653d4eSeschrock return (0); 495fa9e4066Sahrens } 496fa9e4066Sahrens 497fa9e4066Sahrens void 498fa9e4066Sahrens vdev_free(vdev_t *vd) 499fa9e4066Sahrens { 500fa9e4066Sahrens int c; 5013d7072f8Seschrock spa_t *spa = vd->vdev_spa; 502fa9e4066Sahrens 503fa9e4066Sahrens /* 504fa9e4066Sahrens * vdev_free() implies closing the vdev first. This is simpler than 505fa9e4066Sahrens * trying to ensure complicated semantics for all callers. 506fa9e4066Sahrens */ 507fa9e4066Sahrens vdev_close(vd); 508fa9e4066Sahrens 5093d7072f8Seschrock 510ecc2d604Sbonwick ASSERT(!list_link_active(&vd->vdev_dirty_node)); 511fa9e4066Sahrens 512fa9e4066Sahrens /* 513fa9e4066Sahrens * Free all children. 514fa9e4066Sahrens */ 515fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 516fa9e4066Sahrens vdev_free(vd->vdev_child[c]); 517fa9e4066Sahrens 518fa9e4066Sahrens ASSERT(vd->vdev_child == NULL); 519fa9e4066Sahrens ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 520fa9e4066Sahrens 521fa9e4066Sahrens /* 522fa9e4066Sahrens * Discard allocation state. 523fa9e4066Sahrens */ 524fa9e4066Sahrens if (vd == vd->vdev_top) 525fa9e4066Sahrens vdev_metaslab_fini(vd); 526fa9e4066Sahrens 527fa9e4066Sahrens ASSERT3U(vd->vdev_stat.vs_space, ==, 0); 52899653d4eSeschrock ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); 529fa9e4066Sahrens ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 530fa9e4066Sahrens 531fa9e4066Sahrens /* 532fa9e4066Sahrens * Remove this vdev from its parent's child list. 533fa9e4066Sahrens */ 534fa9e4066Sahrens vdev_remove_child(vd->vdev_parent, vd); 535fa9e4066Sahrens 536fa9e4066Sahrens ASSERT(vd->vdev_parent == NULL); 537fa9e4066Sahrens 5383d7072f8Seschrock /* 5393d7072f8Seschrock * Clean up vdev structure. 
5403d7072f8Seschrock */ 5413d7072f8Seschrock vdev_queue_fini(vd); 5423d7072f8Seschrock vdev_cache_fini(vd); 5433d7072f8Seschrock 5443d7072f8Seschrock if (vd->vdev_path) 5453d7072f8Seschrock spa_strfree(vd->vdev_path); 5463d7072f8Seschrock if (vd->vdev_devid) 5473d7072f8Seschrock spa_strfree(vd->vdev_devid); 5483d7072f8Seschrock if (vd->vdev_physpath) 5493d7072f8Seschrock spa_strfree(vd->vdev_physpath); 5503d7072f8Seschrock 5513d7072f8Seschrock if (vd->vdev_isspare) 5523d7072f8Seschrock spa_spare_remove(vd); 5533d7072f8Seschrock 5543d7072f8Seschrock txg_list_destroy(&vd->vdev_ms_list); 5553d7072f8Seschrock txg_list_destroy(&vd->vdev_dtl_list); 5563d7072f8Seschrock mutex_enter(&vd->vdev_dtl_lock); 5573d7072f8Seschrock space_map_unload(&vd->vdev_dtl_map); 5583d7072f8Seschrock space_map_destroy(&vd->vdev_dtl_map); 5593d7072f8Seschrock space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 5603d7072f8Seschrock space_map_destroy(&vd->vdev_dtl_scrub); 5613d7072f8Seschrock mutex_exit(&vd->vdev_dtl_lock); 5623d7072f8Seschrock mutex_destroy(&vd->vdev_dtl_lock); 5633d7072f8Seschrock mutex_destroy(&vd->vdev_stat_lock); 5643d7072f8Seschrock 5653d7072f8Seschrock if (vd == spa->spa_root_vdev) 5663d7072f8Seschrock spa->spa_root_vdev = NULL; 5673d7072f8Seschrock 5683d7072f8Seschrock kmem_free(vd, sizeof (vdev_t)); 569fa9e4066Sahrens } 570fa9e4066Sahrens 571fa9e4066Sahrens /* 572fa9e4066Sahrens * Transfer top-level vdev state from svd to tvd. 
573fa9e4066Sahrens */ 574fa9e4066Sahrens static void 575fa9e4066Sahrens vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 576fa9e4066Sahrens { 577fa9e4066Sahrens spa_t *spa = svd->vdev_spa; 578fa9e4066Sahrens metaslab_t *msp; 579fa9e4066Sahrens vdev_t *vd; 580fa9e4066Sahrens int t; 581fa9e4066Sahrens 582fa9e4066Sahrens ASSERT(tvd == tvd->vdev_top); 583fa9e4066Sahrens 584fa9e4066Sahrens tvd->vdev_ms_array = svd->vdev_ms_array; 585fa9e4066Sahrens tvd->vdev_ms_shift = svd->vdev_ms_shift; 586fa9e4066Sahrens tvd->vdev_ms_count = svd->vdev_ms_count; 587fa9e4066Sahrens 588fa9e4066Sahrens svd->vdev_ms_array = 0; 589fa9e4066Sahrens svd->vdev_ms_shift = 0; 590fa9e4066Sahrens svd->vdev_ms_count = 0; 591fa9e4066Sahrens 592fa9e4066Sahrens tvd->vdev_mg = svd->vdev_mg; 593fa9e4066Sahrens tvd->vdev_ms = svd->vdev_ms; 594fa9e4066Sahrens 595fa9e4066Sahrens svd->vdev_mg = NULL; 596fa9e4066Sahrens svd->vdev_ms = NULL; 597ecc2d604Sbonwick 598ecc2d604Sbonwick if (tvd->vdev_mg != NULL) 599ecc2d604Sbonwick tvd->vdev_mg->mg_vd = tvd; 600fa9e4066Sahrens 601fa9e4066Sahrens tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 602fa9e4066Sahrens tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 60399653d4eSeschrock tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 604fa9e4066Sahrens 605fa9e4066Sahrens svd->vdev_stat.vs_alloc = 0; 606fa9e4066Sahrens svd->vdev_stat.vs_space = 0; 60799653d4eSeschrock svd->vdev_stat.vs_dspace = 0; 608fa9e4066Sahrens 609fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) { 610fa9e4066Sahrens while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 611fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 612fa9e4066Sahrens while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 613fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 614fa9e4066Sahrens if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 615fa9e4066Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 616fa9e4066Sahrens } 617fa9e4066Sahrens 618ecc2d604Sbonwick 
if (list_link_active(&svd->vdev_dirty_node)) { 619fa9e4066Sahrens vdev_config_clean(svd); 620fa9e4066Sahrens vdev_config_dirty(tvd); 621fa9e4066Sahrens } 622fa9e4066Sahrens 62399653d4eSeschrock tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 62499653d4eSeschrock svd->vdev_deflate_ratio = 0; 625*8654d025Sperrin 626*8654d025Sperrin tvd->vdev_islog = svd->vdev_islog; 627*8654d025Sperrin svd->vdev_islog = 0; 628fa9e4066Sahrens } 629fa9e4066Sahrens 630fa9e4066Sahrens static void 631fa9e4066Sahrens vdev_top_update(vdev_t *tvd, vdev_t *vd) 632fa9e4066Sahrens { 633fa9e4066Sahrens int c; 634fa9e4066Sahrens 635fa9e4066Sahrens if (vd == NULL) 636fa9e4066Sahrens return; 637fa9e4066Sahrens 638fa9e4066Sahrens vd->vdev_top = tvd; 639fa9e4066Sahrens 640fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 641fa9e4066Sahrens vdev_top_update(tvd, vd->vdev_child[c]); 642fa9e4066Sahrens } 643fa9e4066Sahrens 644fa9e4066Sahrens /* 645fa9e4066Sahrens * Add a mirror/replacing vdev above an existing vdev. 646fa9e4066Sahrens */ 647fa9e4066Sahrens vdev_t * 648fa9e4066Sahrens vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 649fa9e4066Sahrens { 650fa9e4066Sahrens spa_t *spa = cvd->vdev_spa; 651fa9e4066Sahrens vdev_t *pvd = cvd->vdev_parent; 652fa9e4066Sahrens vdev_t *mvd; 653fa9e4066Sahrens 654fa9e4066Sahrens ASSERT(spa_config_held(spa, RW_WRITER)); 655fa9e4066Sahrens 656fa9e4066Sahrens mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 657ecc2d604Sbonwick 658ecc2d604Sbonwick mvd->vdev_asize = cvd->vdev_asize; 659ecc2d604Sbonwick mvd->vdev_ashift = cvd->vdev_ashift; 660ecc2d604Sbonwick mvd->vdev_state = cvd->vdev_state; 661ecc2d604Sbonwick 662fa9e4066Sahrens vdev_remove_child(pvd, cvd); 663fa9e4066Sahrens vdev_add_child(pvd, mvd); 664fa9e4066Sahrens cvd->vdev_id = mvd->vdev_children; 665fa9e4066Sahrens vdev_add_child(mvd, cvd); 666fa9e4066Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 667fa9e4066Sahrens 668fa9e4066Sahrens if (mvd == mvd->vdev_top) 669fa9e4066Sahrens 
vdev_top_transfer(cvd, mvd); 670fa9e4066Sahrens 671fa9e4066Sahrens return (mvd); 672fa9e4066Sahrens } 673fa9e4066Sahrens 674fa9e4066Sahrens /* 675fa9e4066Sahrens * Remove a 1-way mirror/replacing vdev from the tree. 676fa9e4066Sahrens */ 677fa9e4066Sahrens void 678fa9e4066Sahrens vdev_remove_parent(vdev_t *cvd) 679fa9e4066Sahrens { 680fa9e4066Sahrens vdev_t *mvd = cvd->vdev_parent; 681fa9e4066Sahrens vdev_t *pvd = mvd->vdev_parent; 682fa9e4066Sahrens 683fa9e4066Sahrens ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); 684fa9e4066Sahrens 685fa9e4066Sahrens ASSERT(mvd->vdev_children == 1); 686fa9e4066Sahrens ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 68799653d4eSeschrock mvd->vdev_ops == &vdev_replacing_ops || 68899653d4eSeschrock mvd->vdev_ops == &vdev_spare_ops); 689ecc2d604Sbonwick cvd->vdev_ashift = mvd->vdev_ashift; 690fa9e4066Sahrens 691fa9e4066Sahrens vdev_remove_child(mvd, cvd); 692fa9e4066Sahrens vdev_remove_child(pvd, mvd); 693fa9e4066Sahrens cvd->vdev_id = mvd->vdev_id; 694fa9e4066Sahrens vdev_add_child(pvd, cvd); 69599653d4eSeschrock /* 69699653d4eSeschrock * If we created a new toplevel vdev, then we need to change the child's 69799653d4eSeschrock * vdev GUID to match the old toplevel vdev. Otherwise, we could have 69899653d4eSeschrock * detached an offline device, and when we go to import the pool we'll 69999653d4eSeschrock * think we have two toplevel vdevs, instead of a different version of 70099653d4eSeschrock * the same toplevel vdev. 
70199653d4eSeschrock */ 70299653d4eSeschrock if (cvd->vdev_top == cvd) { 70399653d4eSeschrock pvd->vdev_guid_sum -= cvd->vdev_guid; 70499653d4eSeschrock cvd->vdev_guid_sum -= cvd->vdev_guid; 70599653d4eSeschrock cvd->vdev_guid = mvd->vdev_guid; 70699653d4eSeschrock cvd->vdev_guid_sum += mvd->vdev_guid; 70799653d4eSeschrock pvd->vdev_guid_sum += cvd->vdev_guid; 70899653d4eSeschrock } 709fa9e4066Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 710fa9e4066Sahrens 711fa9e4066Sahrens if (cvd == cvd->vdev_top) 712fa9e4066Sahrens vdev_top_transfer(mvd, cvd); 713fa9e4066Sahrens 714fa9e4066Sahrens ASSERT(mvd->vdev_children == 0); 715fa9e4066Sahrens vdev_free(mvd); 716fa9e4066Sahrens } 717fa9e4066Sahrens 718ea8dc4b6Seschrock int 719fa9e4066Sahrens vdev_metaslab_init(vdev_t *vd, uint64_t txg) 720fa9e4066Sahrens { 721fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 722ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 723*8654d025Sperrin metaslab_class_t *mc; 724ecc2d604Sbonwick uint64_t m; 725fa9e4066Sahrens uint64_t oldc = vd->vdev_ms_count; 726fa9e4066Sahrens uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 727ecc2d604Sbonwick metaslab_t **mspp; 728ecc2d604Sbonwick int error; 729fa9e4066Sahrens 7300e34b6a7Sbonwick if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ 7310e34b6a7Sbonwick return (0); 7320e34b6a7Sbonwick 733fa9e4066Sahrens dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); 734fa9e4066Sahrens 735fa9e4066Sahrens ASSERT(oldc <= newc); 736fa9e4066Sahrens 737*8654d025Sperrin if (vd->vdev_islog) 738*8654d025Sperrin mc = spa->spa_log_class; 739*8654d025Sperrin else 740*8654d025Sperrin mc = spa->spa_normal_class; 741*8654d025Sperrin 742ecc2d604Sbonwick if (vd->vdev_mg == NULL) 743ecc2d604Sbonwick vd->vdev_mg = metaslab_group_create(mc, vd); 744fa9e4066Sahrens 745ecc2d604Sbonwick mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 746fa9e4066Sahrens 747ecc2d604Sbonwick if (oldc != 0) { 748ecc2d604Sbonwick bcopy(vd->vdev_ms, mspp, 
oldc * sizeof (*mspp)); 749ecc2d604Sbonwick kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 750ecc2d604Sbonwick } 751fa9e4066Sahrens 752ecc2d604Sbonwick vd->vdev_ms = mspp; 753ecc2d604Sbonwick vd->vdev_ms_count = newc; 754fa9e4066Sahrens 755ecc2d604Sbonwick for (m = oldc; m < newc; m++) { 756ecc2d604Sbonwick space_map_obj_t smo = { 0, 0, 0 }; 757ecc2d604Sbonwick if (txg == 0) { 758ecc2d604Sbonwick uint64_t object = 0; 759ecc2d604Sbonwick error = dmu_read(mos, vd->vdev_ms_array, 760ecc2d604Sbonwick m * sizeof (uint64_t), sizeof (uint64_t), &object); 761ecc2d604Sbonwick if (error) 762ecc2d604Sbonwick return (error); 763ecc2d604Sbonwick if (object != 0) { 764ecc2d604Sbonwick dmu_buf_t *db; 765ecc2d604Sbonwick error = dmu_bonus_hold(mos, object, FTAG, &db); 766ecc2d604Sbonwick if (error) 767ecc2d604Sbonwick return (error); 768ecc2d604Sbonwick ASSERT3U(db->db_size, ==, sizeof (smo)); 769ecc2d604Sbonwick bcopy(db->db_data, &smo, db->db_size); 770ecc2d604Sbonwick ASSERT3U(smo.smo_object, ==, object); 771ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 772fa9e4066Sahrens } 773fa9e4066Sahrens } 774ecc2d604Sbonwick vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, 775ecc2d604Sbonwick m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); 776fa9e4066Sahrens } 777fa9e4066Sahrens 778ea8dc4b6Seschrock return (0); 779fa9e4066Sahrens } 780fa9e4066Sahrens 781fa9e4066Sahrens void 782fa9e4066Sahrens vdev_metaslab_fini(vdev_t *vd) 783fa9e4066Sahrens { 784fa9e4066Sahrens uint64_t m; 785fa9e4066Sahrens uint64_t count = vd->vdev_ms_count; 786fa9e4066Sahrens 787fa9e4066Sahrens if (vd->vdev_ms != NULL) { 788fa9e4066Sahrens for (m = 0; m < count; m++) 789ecc2d604Sbonwick if (vd->vdev_ms[m] != NULL) 790ecc2d604Sbonwick metaslab_fini(vd->vdev_ms[m]); 791fa9e4066Sahrens kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 792fa9e4066Sahrens vd->vdev_ms = NULL; 793fa9e4066Sahrens } 794fa9e4066Sahrens } 795fa9e4066Sahrens 796fa9e4066Sahrens /* 797fa9e4066Sahrens * Prepare a virtual device for 
access. 798fa9e4066Sahrens */ 799fa9e4066Sahrens int 800fa9e4066Sahrens vdev_open(vdev_t *vd) 801fa9e4066Sahrens { 802fa9e4066Sahrens int error; 803fa9e4066Sahrens int c; 804fa9e4066Sahrens uint64_t osize = 0; 805fa9e4066Sahrens uint64_t asize, psize; 806ecc2d604Sbonwick uint64_t ashift = 0; 807fa9e4066Sahrens 808fa9e4066Sahrens ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 809fa9e4066Sahrens vd->vdev_state == VDEV_STATE_CANT_OPEN || 810fa9e4066Sahrens vd->vdev_state == VDEV_STATE_OFFLINE); 811fa9e4066Sahrens 812fa9e4066Sahrens if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) 813fa9e4066Sahrens vd->vdev_fault_arg >>= 1; 814fa9e4066Sahrens else 815fa9e4066Sahrens vd->vdev_fault_mode = VDEV_FAULT_NONE; 816fa9e4066Sahrens 817fa9e4066Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 818fa9e4066Sahrens 8193d7072f8Seschrock if (!vd->vdev_removed && vd->vdev_faulted) { 8203d7072f8Seschrock ASSERT(vd->vdev_children == 0); 8213d7072f8Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 8223d7072f8Seschrock VDEV_AUX_ERR_EXCEEDED); 8233d7072f8Seschrock return (ENXIO); 8243d7072f8Seschrock } else if (vd->vdev_offline) { 825fa9e4066Sahrens ASSERT(vd->vdev_children == 0); 826ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 827fa9e4066Sahrens return (ENXIO); 828fa9e4066Sahrens } 829fa9e4066Sahrens 830fa9e4066Sahrens error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); 831fa9e4066Sahrens 832ea8dc4b6Seschrock if (zio_injection_enabled && error == 0) 833ea8dc4b6Seschrock error = zio_handle_device_injection(vd, ENXIO); 834ea8dc4b6Seschrock 835fa9e4066Sahrens if (error) { 8363d7072f8Seschrock if (vd->vdev_removed && 8373d7072f8Seschrock vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 8383d7072f8Seschrock vd->vdev_removed = B_FALSE; 8393d7072f8Seschrock 840ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 841fa9e4066Sahrens vd->vdev_stat.vs_aux); 842fa9e4066Sahrens return (error); 843fa9e4066Sahrens } 844fa9e4066Sahrens 
8453d7072f8Seschrock vd->vdev_removed = B_FALSE; 8463d7072f8Seschrock 8473d7072f8Seschrock if (vd->vdev_degraded) { 8483d7072f8Seschrock ASSERT(vd->vdev_children == 0); 8493d7072f8Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 8503d7072f8Seschrock VDEV_AUX_ERR_EXCEEDED); 8513d7072f8Seschrock } else { 8523d7072f8Seschrock vd->vdev_state = VDEV_STATE_HEALTHY; 8533d7072f8Seschrock } 854fa9e4066Sahrens 855fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 856ea8dc4b6Seschrock if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 857ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 858ea8dc4b6Seschrock VDEV_AUX_NONE); 859ea8dc4b6Seschrock break; 860ea8dc4b6Seschrock } 861fa9e4066Sahrens 862fa9e4066Sahrens osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 863fa9e4066Sahrens 864fa9e4066Sahrens if (vd->vdev_children == 0) { 865fa9e4066Sahrens if (osize < SPA_MINDEVSIZE) { 866ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 867ea8dc4b6Seschrock VDEV_AUX_TOO_SMALL); 868fa9e4066Sahrens return (EOVERFLOW); 869fa9e4066Sahrens } 870fa9e4066Sahrens psize = osize; 871fa9e4066Sahrens asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 872fa9e4066Sahrens } else { 873ecc2d604Sbonwick if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 874fa9e4066Sahrens (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 875ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 876ea8dc4b6Seschrock VDEV_AUX_TOO_SMALL); 877fa9e4066Sahrens return (EOVERFLOW); 878fa9e4066Sahrens } 879fa9e4066Sahrens psize = 0; 880fa9e4066Sahrens asize = osize; 881fa9e4066Sahrens } 882fa9e4066Sahrens 883fa9e4066Sahrens vd->vdev_psize = psize; 884fa9e4066Sahrens 885fa9e4066Sahrens if (vd->vdev_asize == 0) { 886fa9e4066Sahrens /* 887fa9e4066Sahrens * This is the first-ever open, so use the computed values. 888ecc2d604Sbonwick * For testing purposes, a higher ashift can be requested. 
889fa9e4066Sahrens */ 890fa9e4066Sahrens vd->vdev_asize = asize; 891ecc2d604Sbonwick vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 892fa9e4066Sahrens } else { 893fa9e4066Sahrens /* 894fa9e4066Sahrens * Make sure the alignment requirement hasn't increased. 895fa9e4066Sahrens */ 896ecc2d604Sbonwick if (ashift > vd->vdev_top->vdev_ashift) { 897ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 898ea8dc4b6Seschrock VDEV_AUX_BAD_LABEL); 899fa9e4066Sahrens return (EINVAL); 900fa9e4066Sahrens } 901fa9e4066Sahrens 902fa9e4066Sahrens /* 903fa9e4066Sahrens * Make sure the device hasn't shrunk. 904fa9e4066Sahrens */ 905fa9e4066Sahrens if (asize < vd->vdev_asize) { 906ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 907ea8dc4b6Seschrock VDEV_AUX_BAD_LABEL); 908fa9e4066Sahrens return (EINVAL); 909fa9e4066Sahrens } 910fa9e4066Sahrens 911fa9e4066Sahrens /* 912fa9e4066Sahrens * If all children are healthy and the asize has increased, 913fa9e4066Sahrens * then we've experienced dynamic LUN growth. 914fa9e4066Sahrens */ 915fa9e4066Sahrens if (vd->vdev_state == VDEV_STATE_HEALTHY && 916fa9e4066Sahrens asize > vd->vdev_asize) { 917fa9e4066Sahrens vd->vdev_asize = asize; 918fa9e4066Sahrens } 919fa9e4066Sahrens } 920fa9e4066Sahrens 92199653d4eSeschrock /* 92299653d4eSeschrock * If this is a top-level vdev, compute the raidz-deflation 92399653d4eSeschrock * ratio. Note, we hard-code in 128k (1<<17) because it is the 92499653d4eSeschrock * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE 92599653d4eSeschrock * changes, this algorithm must never change, or we will 92699653d4eSeschrock * inconsistently account for existing bp's. 
92799653d4eSeschrock */ 92899653d4eSeschrock if (vd->vdev_top == vd) { 92999653d4eSeschrock vd->vdev_deflate_ratio = (1<<17) / 93099653d4eSeschrock (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); 93199653d4eSeschrock } 93299653d4eSeschrock 933ea8dc4b6Seschrock /* 934ea8dc4b6Seschrock * This allows the ZFS DE to close cases appropriately. If a device 935ea8dc4b6Seschrock * goes away and later returns, we want to close the associated case. 936ea8dc4b6Seschrock * But it's not enough to simply post this only when a device goes from 937ea8dc4b6Seschrock * CANT_OPEN -> HEALTHY. If we reboot the system and the device is 938ea8dc4b6Seschrock * back, we also need to close the case (otherwise we will try to replay 939ea8dc4b6Seschrock * it). So we have to post this notifier every time. Since this only 940ea8dc4b6Seschrock * occurs during pool open or error recovery, this should not be an 941ea8dc4b6Seschrock * issue. 942ea8dc4b6Seschrock */ 943ea8dc4b6Seschrock zfs_post_ok(vd->vdev_spa, vd); 944ea8dc4b6Seschrock 945fa9e4066Sahrens return (0); 946fa9e4066Sahrens } 947fa9e4066Sahrens 948560e6e96Seschrock /* 949560e6e96Seschrock * Called once the vdevs are all opened, this routine validates the label 950560e6e96Seschrock * contents. This needs to be done before vdev_load() so that we don't 9513d7072f8Seschrock * inadvertently do repair I/Os to the wrong device. 952560e6e96Seschrock * 953560e6e96Seschrock * This function will only return failure if one of the vdevs indicates that it 954560e6e96Seschrock * has since been destroyed or exported. This is only possible if 955560e6e96Seschrock * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 956560e6e96Seschrock * will be updated but the function will return 0. 
957560e6e96Seschrock */ 958560e6e96Seschrock int 959560e6e96Seschrock vdev_validate(vdev_t *vd) 960560e6e96Seschrock { 961560e6e96Seschrock spa_t *spa = vd->vdev_spa; 962560e6e96Seschrock int c; 963560e6e96Seschrock nvlist_t *label; 964560e6e96Seschrock uint64_t guid; 965560e6e96Seschrock uint64_t state; 966560e6e96Seschrock 967560e6e96Seschrock for (c = 0; c < vd->vdev_children; c++) 968560e6e96Seschrock if (vdev_validate(vd->vdev_child[c]) != 0) 9690bf246f5Smc return (EBADF); 970560e6e96Seschrock 971b5989ec7Seschrock /* 972b5989ec7Seschrock * If the device has already failed, or was marked offline, don't do 973b5989ec7Seschrock * any further validation. Otherwise, label I/O will fail and we will 974b5989ec7Seschrock * overwrite the previous state. 975b5989ec7Seschrock */ 976b5989ec7Seschrock if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) { 977560e6e96Seschrock 978560e6e96Seschrock if ((label = vdev_label_read_config(vd)) == NULL) { 979560e6e96Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 980560e6e96Seschrock VDEV_AUX_BAD_LABEL); 981560e6e96Seschrock return (0); 982560e6e96Seschrock } 983560e6e96Seschrock 984560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, 985560e6e96Seschrock &guid) != 0 || guid != spa_guid(spa)) { 986560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 987560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 988560e6e96Seschrock nvlist_free(label); 989560e6e96Seschrock return (0); 990560e6e96Seschrock } 991560e6e96Seschrock 992560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 993560e6e96Seschrock &guid) != 0 || guid != vd->vdev_guid) { 994560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 995560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 996560e6e96Seschrock nvlist_free(label); 997560e6e96Seschrock return (0); 998560e6e96Seschrock } 999560e6e96Seschrock 1000560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1001560e6e96Seschrock &state) != 0) { 
1002560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1003560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1004560e6e96Seschrock nvlist_free(label); 1005560e6e96Seschrock return (0); 1006560e6e96Seschrock } 1007560e6e96Seschrock 1008560e6e96Seschrock nvlist_free(label); 1009560e6e96Seschrock 1010560e6e96Seschrock if (spa->spa_load_state == SPA_LOAD_OPEN && 1011560e6e96Seschrock state != POOL_STATE_ACTIVE) 10120bf246f5Smc return (EBADF); 1013560e6e96Seschrock } 1014560e6e96Seschrock 1015560e6e96Seschrock /* 1016560e6e96Seschrock * If we were able to open and validate a vdev that was previously 1017560e6e96Seschrock * marked permanently unavailable, clear that state now. 1018560e6e96Seschrock */ 1019560e6e96Seschrock if (vd->vdev_not_present) 1020560e6e96Seschrock vd->vdev_not_present = 0; 1021560e6e96Seschrock 1022560e6e96Seschrock return (0); 1023560e6e96Seschrock } 1024560e6e96Seschrock 1025fa9e4066Sahrens /* 1026fa9e4066Sahrens * Close a virtual device. 1027fa9e4066Sahrens */ 1028fa9e4066Sahrens void 1029fa9e4066Sahrens vdev_close(vdev_t *vd) 1030fa9e4066Sahrens { 1031fa9e4066Sahrens vd->vdev_ops->vdev_op_close(vd); 1032fa9e4066Sahrens 10333d7072f8Seschrock vdev_cache_purge(vd); 1034fa9e4066Sahrens 1035560e6e96Seschrock /* 1036560e6e96Seschrock * We record the previous state before we close it, so that if we are 1037560e6e96Seschrock * doing a reopen(), we don't generate FMA ereports if we notice that 1038560e6e96Seschrock * it's still faulted. 
1039560e6e96Seschrock */ 1040560e6e96Seschrock vd->vdev_prevstate = vd->vdev_state; 1041560e6e96Seschrock 1042fa9e4066Sahrens if (vd->vdev_offline) 1043fa9e4066Sahrens vd->vdev_state = VDEV_STATE_OFFLINE; 1044fa9e4066Sahrens else 1045fa9e4066Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 1046ea8dc4b6Seschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1047fa9e4066Sahrens } 1048fa9e4066Sahrens 1049fa9e4066Sahrens void 1050ea8dc4b6Seschrock vdev_reopen(vdev_t *vd) 1051fa9e4066Sahrens { 1052ea8dc4b6Seschrock spa_t *spa = vd->vdev_spa; 1053fa9e4066Sahrens 1054ea8dc4b6Seschrock ASSERT(spa_config_held(spa, RW_WRITER)); 1055ea8dc4b6Seschrock 1056fa9e4066Sahrens vdev_close(vd); 1057fa9e4066Sahrens (void) vdev_open(vd); 1058fa9e4066Sahrens 105939c23413Seschrock /* 106039c23413Seschrock * Call vdev_validate() here to make sure we have the same device. 106139c23413Seschrock * Otherwise, a device with an invalid label could be successfully 106239c23413Seschrock * opened in response to vdev_reopen(). 106339c23413Seschrock */ 106439c23413Seschrock (void) vdev_validate(vd); 106539c23413Seschrock 1066fa9e4066Sahrens /* 10673d7072f8Seschrock * Reassess parent vdev's health. 1068fa9e4066Sahrens */ 10693d7072f8Seschrock vdev_propagate_state(vd); 1070fa9e4066Sahrens } 1071fa9e4066Sahrens 1072fa9e4066Sahrens int 107399653d4eSeschrock vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1074fa9e4066Sahrens { 1075fa9e4066Sahrens int error; 1076fa9e4066Sahrens 1077fa9e4066Sahrens /* 1078fa9e4066Sahrens * Normally, partial opens (e.g. of a mirror) are allowed. 1079fa9e4066Sahrens * For a create, however, we want to fail the request if 1080fa9e4066Sahrens * there are any components we can't open. 1081fa9e4066Sahrens */ 1082fa9e4066Sahrens error = vdev_open(vd); 1083fa9e4066Sahrens 1084fa9e4066Sahrens if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1085fa9e4066Sahrens vdev_close(vd); 1086fa9e4066Sahrens return (error ? 
error : ENXIO); 1087fa9e4066Sahrens } 1088fa9e4066Sahrens 1089fa9e4066Sahrens /* 1090fa9e4066Sahrens * Recursively initialize all labels. 1091fa9e4066Sahrens */ 109239c23413Seschrock if ((error = vdev_label_init(vd, txg, isreplacing ? 109339c23413Seschrock VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1094fa9e4066Sahrens vdev_close(vd); 1095fa9e4066Sahrens return (error); 1096fa9e4066Sahrens } 1097fa9e4066Sahrens 1098fa9e4066Sahrens return (0); 1099fa9e4066Sahrens } 1100fa9e4066Sahrens 1101fa9e4066Sahrens /* 1102fa9e4066Sahrens * This is the latter half of vdev_create(). It is distinct because it 1103fa9e4066Sahrens * involves initiating transactions in order to do metaslab creation. 1104fa9e4066Sahrens * For creation, we want to try to create all vdevs at once and then undo it 1105fa9e4066Sahrens * if anything fails; this is much harder if we have pending transactions. 1106fa9e4066Sahrens */ 11070e34b6a7Sbonwick void 1108fa9e4066Sahrens vdev_init(vdev_t *vd, uint64_t txg) 1109fa9e4066Sahrens { 1110fa9e4066Sahrens /* 1111fa9e4066Sahrens * Aim for roughly 200 metaslabs per vdev. 1112fa9e4066Sahrens */ 1113fa9e4066Sahrens vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); 1114fa9e4066Sahrens vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1115fa9e4066Sahrens 1116fa9e4066Sahrens /* 11170e34b6a7Sbonwick * Initialize the vdev's metaslabs. This can't fail because 11180e34b6a7Sbonwick * there's nothing to read when creating all new metaslabs.
1119fa9e4066Sahrens */ 11200e34b6a7Sbonwick VERIFY(vdev_metaslab_init(vd, txg) == 0); 1121fa9e4066Sahrens } 1122fa9e4066Sahrens 1123fa9e4066Sahrens void 1124ecc2d604Sbonwick vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1125fa9e4066Sahrens { 1126ecc2d604Sbonwick ASSERT(vd == vd->vdev_top); 1127ecc2d604Sbonwick ASSERT(ISP2(flags)); 1128fa9e4066Sahrens 1129ecc2d604Sbonwick if (flags & VDD_METASLAB) 1130ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1131ecc2d604Sbonwick 1132ecc2d604Sbonwick if (flags & VDD_DTL) 1133ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1134ecc2d604Sbonwick 1135ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1136fa9e4066Sahrens } 1137fa9e4066Sahrens 1138fa9e4066Sahrens void 1139fa9e4066Sahrens vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) 1140fa9e4066Sahrens { 1141fa9e4066Sahrens mutex_enter(sm->sm_lock); 1142fa9e4066Sahrens if (!space_map_contains(sm, txg, size)) 1143fa9e4066Sahrens space_map_add(sm, txg, size); 1144fa9e4066Sahrens mutex_exit(sm->sm_lock); 1145fa9e4066Sahrens } 1146fa9e4066Sahrens 1147fa9e4066Sahrens int 1148fa9e4066Sahrens vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) 1149fa9e4066Sahrens { 1150fa9e4066Sahrens int dirty; 1151fa9e4066Sahrens 1152fa9e4066Sahrens /* 1153fa9e4066Sahrens * Quick test without the lock -- covers the common case that 1154fa9e4066Sahrens * there are no dirty time segments. 1155fa9e4066Sahrens */ 1156fa9e4066Sahrens if (sm->sm_space == 0) 1157fa9e4066Sahrens return (0); 1158fa9e4066Sahrens 1159fa9e4066Sahrens mutex_enter(sm->sm_lock); 1160fa9e4066Sahrens dirty = space_map_contains(sm, txg, size); 1161fa9e4066Sahrens mutex_exit(sm->sm_lock); 1162fa9e4066Sahrens 1163fa9e4066Sahrens return (dirty); 1164fa9e4066Sahrens } 1165fa9e4066Sahrens 1166fa9e4066Sahrens /* 1167fa9e4066Sahrens * Reassess DTLs after a config change or scrub completion. 
1168fa9e4066Sahrens */ 1169fa9e4066Sahrens void 1170fa9e4066Sahrens vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 1171fa9e4066Sahrens { 1172ea8dc4b6Seschrock spa_t *spa = vd->vdev_spa; 1173fa9e4066Sahrens int c; 1174fa9e4066Sahrens 1175ea8dc4b6Seschrock ASSERT(spa_config_held(spa, RW_WRITER)); 1176fa9e4066Sahrens 1177fa9e4066Sahrens if (vd->vdev_children == 0) { 1178fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1179fa9e4066Sahrens /* 1180fa9e4066Sahrens * We're successfully scrubbed everything up to scrub_txg. 1181fa9e4066Sahrens * Therefore, excise all old DTLs up to that point, then 1182fa9e4066Sahrens * fold in the DTLs for everything we couldn't scrub. 1183fa9e4066Sahrens */ 1184fa9e4066Sahrens if (scrub_txg != 0) { 1185fa9e4066Sahrens space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); 1186fa9e4066Sahrens space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); 1187fa9e4066Sahrens } 1188fa9e4066Sahrens if (scrub_done) 1189fa9e4066Sahrens space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 1190fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1191ecc2d604Sbonwick if (txg != 0) 1192ecc2d604Sbonwick vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1193fa9e4066Sahrens return; 1194fa9e4066Sahrens } 1195fa9e4066Sahrens 1196ea8dc4b6Seschrock /* 1197ea8dc4b6Seschrock * Make sure the DTLs are always correct under the scrub lock. 
1198ea8dc4b6Seschrock */ 1199ea8dc4b6Seschrock if (vd == spa->spa_root_vdev) 1200ea8dc4b6Seschrock mutex_enter(&spa->spa_scrub_lock); 1201ea8dc4b6Seschrock 1202fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1203fa9e4066Sahrens space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); 1204fa9e4066Sahrens space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 1205fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1206fa9e4066Sahrens 1207fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) { 1208fa9e4066Sahrens vdev_t *cvd = vd->vdev_child[c]; 1209fa9e4066Sahrens vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); 1210fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1211fa9e4066Sahrens space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); 1212fa9e4066Sahrens space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); 1213fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1214fa9e4066Sahrens } 1215ea8dc4b6Seschrock 1216ea8dc4b6Seschrock if (vd == spa->spa_root_vdev) 1217ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 1218fa9e4066Sahrens } 1219fa9e4066Sahrens 1220fa9e4066Sahrens static int 1221fa9e4066Sahrens vdev_dtl_load(vdev_t *vd) 1222fa9e4066Sahrens { 1223fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 1224fa9e4066Sahrens space_map_obj_t *smo = &vd->vdev_dtl; 1225ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 1226fa9e4066Sahrens dmu_buf_t *db; 1227fa9e4066Sahrens int error; 1228fa9e4066Sahrens 1229fa9e4066Sahrens ASSERT(vd->vdev_children == 0); 1230fa9e4066Sahrens 1231fa9e4066Sahrens if (smo->smo_object == 0) 1232fa9e4066Sahrens return (0); 1233fa9e4066Sahrens 1234ecc2d604Sbonwick if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) 1235ea8dc4b6Seschrock return (error); 1236ecc2d604Sbonwick 1237fa9e4066Sahrens ASSERT3U(db->db_size, ==, sizeof (*smo)); 1238fa9e4066Sahrens bcopy(db->db_data, smo, db->db_size); 1239ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 1240fa9e4066Sahrens 1241fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1242ecc2d604Sbonwick error = 
space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); 1243fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1244fa9e4066Sahrens 1245fa9e4066Sahrens return (error); 1246fa9e4066Sahrens } 1247fa9e4066Sahrens 1248fa9e4066Sahrens void 1249fa9e4066Sahrens vdev_dtl_sync(vdev_t *vd, uint64_t txg) 1250fa9e4066Sahrens { 1251fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 1252fa9e4066Sahrens space_map_obj_t *smo = &vd->vdev_dtl; 1253fa9e4066Sahrens space_map_t *sm = &vd->vdev_dtl_map; 1254ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 1255fa9e4066Sahrens space_map_t smsync; 1256fa9e4066Sahrens kmutex_t smlock; 1257fa9e4066Sahrens dmu_buf_t *db; 1258fa9e4066Sahrens dmu_tx_t *tx; 1259fa9e4066Sahrens 1260fa9e4066Sahrens dprintf("%s in txg %llu pass %d\n", 1261fa9e4066Sahrens vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); 1262fa9e4066Sahrens 1263fa9e4066Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1264fa9e4066Sahrens 1265fa9e4066Sahrens if (vd->vdev_detached) { 1266fa9e4066Sahrens if (smo->smo_object != 0) { 1267ecc2d604Sbonwick int err = dmu_object_free(mos, smo->smo_object, tx); 1268fa9e4066Sahrens ASSERT3U(err, ==, 0); 1269fa9e4066Sahrens smo->smo_object = 0; 1270fa9e4066Sahrens } 1271fa9e4066Sahrens dmu_tx_commit(tx); 1272ecc2d604Sbonwick dprintf("detach %s committed in txg %llu\n", 1273ecc2d604Sbonwick vdev_description(vd), txg); 1274fa9e4066Sahrens return; 1275fa9e4066Sahrens } 1276fa9e4066Sahrens 1277fa9e4066Sahrens if (smo->smo_object == 0) { 1278fa9e4066Sahrens ASSERT(smo->smo_objsize == 0); 1279fa9e4066Sahrens ASSERT(smo->smo_alloc == 0); 1280ecc2d604Sbonwick smo->smo_object = dmu_object_alloc(mos, 1281fa9e4066Sahrens DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 1282fa9e4066Sahrens DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 1283fa9e4066Sahrens ASSERT(smo->smo_object != 0); 1284fa9e4066Sahrens vdev_config_dirty(vd->vdev_top); 1285fa9e4066Sahrens } 1286fa9e4066Sahrens 1287fa9e4066Sahrens mutex_init(&smlock, NULL, MUTEX_DEFAULT, 
NULL); 1288fa9e4066Sahrens 1289fa9e4066Sahrens space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, 1290fa9e4066Sahrens &smlock); 1291fa9e4066Sahrens 1292fa9e4066Sahrens mutex_enter(&smlock); 1293fa9e4066Sahrens 1294fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1295ecc2d604Sbonwick space_map_walk(sm, space_map_add, &smsync); 1296fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1297fa9e4066Sahrens 1298ecc2d604Sbonwick space_map_truncate(smo, mos, tx); 1299ecc2d604Sbonwick space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); 1300fa9e4066Sahrens 1301fa9e4066Sahrens space_map_destroy(&smsync); 1302fa9e4066Sahrens 1303fa9e4066Sahrens mutex_exit(&smlock); 1304fa9e4066Sahrens mutex_destroy(&smlock); 1305fa9e4066Sahrens 1306ecc2d604Sbonwick VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1307fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 1308fa9e4066Sahrens ASSERT3U(db->db_size, ==, sizeof (*smo)); 1309fa9e4066Sahrens bcopy(smo, db->db_data, db->db_size); 1310ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 1311fa9e4066Sahrens 1312fa9e4066Sahrens dmu_tx_commit(tx); 1313fa9e4066Sahrens } 1314fa9e4066Sahrens 1315560e6e96Seschrock void 1316ea8dc4b6Seschrock vdev_load(vdev_t *vd) 1317fa9e4066Sahrens { 1318560e6e96Seschrock int c; 1319fa9e4066Sahrens 1320fa9e4066Sahrens /* 1321fa9e4066Sahrens * Recursively load all children. 1322fa9e4066Sahrens */ 1323fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 1324560e6e96Seschrock vdev_load(vd->vdev_child[c]); 1325fa9e4066Sahrens 1326fa9e4066Sahrens /* 13270e34b6a7Sbonwick * If this is a top-level vdev, initialize its metaslabs. 
1328fa9e4066Sahrens */ 1329560e6e96Seschrock if (vd == vd->vdev_top && 1330560e6e96Seschrock (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 1331560e6e96Seschrock vdev_metaslab_init(vd, 0) != 0)) 1332560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1333560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1334fa9e4066Sahrens 1335fa9e4066Sahrens /* 1336fa9e4066Sahrens * If this is a leaf vdev, load its DTL. 1337fa9e4066Sahrens */ 1338560e6e96Seschrock if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 1339560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1340560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1341fa9e4066Sahrens } 1342fa9e4066Sahrens 134399653d4eSeschrock /* 134499653d4eSeschrock * This special case of vdev_spare() is used for hot spares. Its sole purpose 134599653d4eSeschrock * is to set the vdev state for the associated vdev. To do this, we make sure 134699653d4eSeschrock * that we can open the underlying device, then try to read the label, and make 134799653d4eSeschrock * sure that the label is sane and that it hasn't been repurposed to another 134899653d4eSeschrock * pool.
134999653d4eSeschrock */ 135099653d4eSeschrock int 135199653d4eSeschrock vdev_validate_spare(vdev_t *vd) 135299653d4eSeschrock { 135399653d4eSeschrock nvlist_t *label; 135499653d4eSeschrock uint64_t guid, version; 135599653d4eSeschrock uint64_t state; 135699653d4eSeschrock 135799653d4eSeschrock if ((label = vdev_label_read_config(vd)) == NULL) { 135899653d4eSeschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 135999653d4eSeschrock VDEV_AUX_CORRUPT_DATA); 136099653d4eSeschrock return (-1); 136199653d4eSeschrock } 136299653d4eSeschrock 136399653d4eSeschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 136499653d4eSeschrock version > ZFS_VERSION || 136599653d4eSeschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 136699653d4eSeschrock guid != vd->vdev_guid || 136799653d4eSeschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 136899653d4eSeschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 136999653d4eSeschrock VDEV_AUX_CORRUPT_DATA); 137099653d4eSeschrock nvlist_free(label); 137199653d4eSeschrock return (-1); 137299653d4eSeschrock } 137399653d4eSeschrock 137439c23413Seschrock spa_spare_add(vd); 137539c23413Seschrock 137699653d4eSeschrock /* 137799653d4eSeschrock * We don't actually check the pool state here. If it's in fact in 137899653d4eSeschrock * use by another pool, we update this fact on the fly when requested. 
137999653d4eSeschrock */ 138099653d4eSeschrock nvlist_free(label); 138199653d4eSeschrock return (0); 138299653d4eSeschrock } 138399653d4eSeschrock 1384fa9e4066Sahrens void 1385fa9e4066Sahrens vdev_sync_done(vdev_t *vd, uint64_t txg) 1386fa9e4066Sahrens { 1387fa9e4066Sahrens metaslab_t *msp; 1388fa9e4066Sahrens 1389fa9e4066Sahrens dprintf("%s txg %llu\n", vdev_description(vd), txg); 1390fa9e4066Sahrens 1391fa9e4066Sahrens while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 1392fa9e4066Sahrens metaslab_sync_done(msp, txg); 1393fa9e4066Sahrens } 1394fa9e4066Sahrens 1395fa9e4066Sahrens void 1396fa9e4066Sahrens vdev_sync(vdev_t *vd, uint64_t txg) 1397fa9e4066Sahrens { 1398fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 1399fa9e4066Sahrens vdev_t *lvd; 1400fa9e4066Sahrens metaslab_t *msp; 1401ecc2d604Sbonwick dmu_tx_t *tx; 1402fa9e4066Sahrens 1403fa9e4066Sahrens dprintf("%s txg %llu pass %d\n", 1404fa9e4066Sahrens vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); 1405fa9e4066Sahrens 1406ecc2d604Sbonwick if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 1407ecc2d604Sbonwick ASSERT(vd == vd->vdev_top); 1408ecc2d604Sbonwick tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1409ecc2d604Sbonwick vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 1410ecc2d604Sbonwick DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 1411ecc2d604Sbonwick ASSERT(vd->vdev_ms_array != 0); 1412ecc2d604Sbonwick vdev_config_dirty(vd); 1413ecc2d604Sbonwick dmu_tx_commit(tx); 1414ecc2d604Sbonwick } 1415fa9e4066Sahrens 1416ecc2d604Sbonwick while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 1417fa9e4066Sahrens metaslab_sync(msp, txg); 1418ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 1419ecc2d604Sbonwick } 1420fa9e4066Sahrens 1421fa9e4066Sahrens while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 1422fa9e4066Sahrens vdev_dtl_sync(lvd, txg); 1423fa9e4066Sahrens 1424fa9e4066Sahrens (void) 
txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 1425fa9e4066Sahrens } 1426fa9e4066Sahrens 1427fa9e4066Sahrens uint64_t 1428fa9e4066Sahrens vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 1429fa9e4066Sahrens { 1430fa9e4066Sahrens return (vd->vdev_ops->vdev_op_asize(vd, psize)); 1431fa9e4066Sahrens } 1432fa9e4066Sahrens 1433fa9e4066Sahrens void 1434fa9e4066Sahrens vdev_io_start(zio_t *zio) 1435fa9e4066Sahrens { 1436fa9e4066Sahrens zio->io_vd->vdev_ops->vdev_op_io_start(zio); 1437fa9e4066Sahrens } 1438fa9e4066Sahrens 1439fa9e4066Sahrens void 1440fa9e4066Sahrens vdev_io_done(zio_t *zio) 1441fa9e4066Sahrens { 1442fa9e4066Sahrens zio->io_vd->vdev_ops->vdev_op_io_done(zio); 1443fa9e4066Sahrens } 1444fa9e4066Sahrens 1445fa9e4066Sahrens const char * 1446fa9e4066Sahrens vdev_description(vdev_t *vd) 1447fa9e4066Sahrens { 1448fa9e4066Sahrens if (vd == NULL || vd->vdev_ops == NULL) 1449fa9e4066Sahrens return ("<unknown>"); 1450fa9e4066Sahrens 1451fa9e4066Sahrens if (vd->vdev_path != NULL) 1452fa9e4066Sahrens return (vd->vdev_path); 1453fa9e4066Sahrens 1454fa9e4066Sahrens if (vd->vdev_parent == NULL) 1455fa9e4066Sahrens return (spa_name(vd->vdev_spa)); 1456fa9e4066Sahrens 1457fa9e4066Sahrens return (vd->vdev_ops->vdev_op_type); 1458fa9e4066Sahrens } 1459fa9e4066Sahrens 14603d7072f8Seschrock /* 14613d7072f8Seschrock * Mark the given vdev faulted. A faulted vdev behaves as if the device could 14623d7072f8Seschrock * not be opened, and no I/O is attempted. 
14633d7072f8Seschrock */ 1464fa9e4066Sahrens int 14653d7072f8Seschrock vdev_fault(spa_t *spa, uint64_t guid) 1466fa9e4066Sahrens { 1467441d80aaSlling vdev_t *rvd, *vd; 1468441d80aaSlling uint64_t txg; 1469fa9e4066Sahrens 1470441d80aaSlling txg = spa_vdev_enter(spa); 1471fa9e4066Sahrens 1472441d80aaSlling rvd = spa->spa_root_vdev; 14730e34b6a7Sbonwick 1474ea8dc4b6Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1475441d80aaSlling return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 14763d7072f8Seschrock if (!vd->vdev_ops->vdev_op_leaf) 14773d7072f8Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1478fa9e4066Sahrens 14793d7072f8Seschrock /* 14803d7072f8Seschrock * Faulted state takes precedence over degraded. 14813d7072f8Seschrock */ 14823d7072f8Seschrock vd->vdev_faulted = 1ULL; 14833d7072f8Seschrock vd->vdev_degraded = 0ULL; 14843d7072f8Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, 14853d7072f8Seschrock VDEV_AUX_ERR_EXCEEDED); 14863d7072f8Seschrock 14873d7072f8Seschrock /* 14883d7072f8Seschrock * If marking the vdev as faulted causes the toplevel vdev to become 14893d7072f8Seschrock * unavailable, then back off and simply mark the vdev as degraded 14903d7072f8Seschrock * instead. 14913d7072f8Seschrock */ 14923d7072f8Seschrock if (vdev_is_dead(vd->vdev_top)) { 14933d7072f8Seschrock vd->vdev_degraded = 1ULL; 14943d7072f8Seschrock vd->vdev_faulted = 0ULL; 14953d7072f8Seschrock 14963d7072f8Seschrock /* 14973d7072f8Seschrock * If we reopen the device and it's not dead, only then do we 14983d7072f8Seschrock * mark it degraded.
14993d7072f8Seschrock */ 15003d7072f8Seschrock vdev_reopen(vd); 15013d7072f8Seschrock 15023d7072f8Seschrock if (!vdev_is_dead(vd)) { 15033d7072f8Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 15043d7072f8Seschrock VDEV_AUX_ERR_EXCEEDED); 15053d7072f8Seschrock } 15063d7072f8Seschrock } 15073d7072f8Seschrock 15083d7072f8Seschrock vdev_config_dirty(vd->vdev_top); 15093d7072f8Seschrock 15103d7072f8Seschrock (void) spa_vdev_exit(spa, NULL, txg, 0); 15113d7072f8Seschrock 15123d7072f8Seschrock return (0); 15133d7072f8Seschrock } 15143d7072f8Seschrock 15153d7072f8Seschrock /* 15163d7072f8Seschrock * Mark the given vdev degraded. A degraded vdev is purely an indication to the 15173d7072f8Seschrock * user that something is wrong. The vdev continues to operate as normal as far 15183d7072f8Seschrock * as I/O is concerned. 15193d7072f8Seschrock */ 15203d7072f8Seschrock int 15213d7072f8Seschrock vdev_degrade(spa_t *spa, uint64_t guid) 15223d7072f8Seschrock { 15233d7072f8Seschrock vdev_t *rvd, *vd; 15243d7072f8Seschrock uint64_t txg; 15253d7072f8Seschrock 15263d7072f8Seschrock txg = spa_vdev_enter(spa); 15273d7072f8Seschrock 15283d7072f8Seschrock rvd = spa->spa_root_vdev; 15293d7072f8Seschrock 15303d7072f8Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 15313d7072f8Seschrock return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 15320e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 15330e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 15340e34b6a7Sbonwick 15353d7072f8Seschrock /* 15363d7072f8Seschrock * If the vdev is already faulted or degraded, then don't do anything.
15373d7072f8Seschrock */ 15383d7072f8Seschrock if (vd->vdev_faulted || vd->vdev_degraded) { 15393d7072f8Seschrock (void) spa_vdev_exit(spa, NULL, txg, 0); 15403d7072f8Seschrock return (0); 15413d7072f8Seschrock } 15423d7072f8Seschrock 15433d7072f8Seschrock vd->vdev_degraded = 1ULL; 15443d7072f8Seschrock if (!vdev_is_dead(vd)) 15453d7072f8Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 15463d7072f8Seschrock VDEV_AUX_ERR_EXCEEDED); 15473d7072f8Seschrock vdev_config_dirty(vd->vdev_top); 15483d7072f8Seschrock 15493d7072f8Seschrock (void) spa_vdev_exit(spa, NULL, txg, 0); 15503d7072f8Seschrock 15513d7072f8Seschrock return (0); 15523d7072f8Seschrock } 15533d7072f8Seschrock 15543d7072f8Seschrock /* 15553d7072f8Seschrock * Online the given vdev. If 'unspare' is set, it implies two things. First, 15563d7072f8Seschrock * any attached spare device should be detached when the device finishes 15573d7072f8Seschrock * resilvering. Second, the online should be treated like a 'test' online case, 15583d7072f8Seschrock * so no FMA events are generated if the device fails to open. 
15593d7072f8Seschrock */ 15603d7072f8Seschrock int 15613d7072f8Seschrock vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, 15623d7072f8Seschrock vdev_state_t *newstate) 15633d7072f8Seschrock { 15643d7072f8Seschrock vdev_t *rvd, *vd; 15653d7072f8Seschrock uint64_t txg; 15663d7072f8Seschrock 15673d7072f8Seschrock txg = spa_vdev_enter(spa); 15683d7072f8Seschrock 15693d7072f8Seschrock rvd = spa->spa_root_vdev; 15703d7072f8Seschrock 15713d7072f8Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 15723d7072f8Seschrock return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 15733d7072f8Seschrock 15743d7072f8Seschrock if (!vd->vdev_ops->vdev_op_leaf) 15753d7072f8Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1576fa9e4066Sahrens 1577fa9e4066Sahrens vd->vdev_offline = B_FALSE; 1578441d80aaSlling vd->vdev_tmpoffline = B_FALSE; 15793d7072f8Seschrock vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ? 15803d7072f8Seschrock B_TRUE : B_FALSE; 15813d7072f8Seschrock vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ? 15823d7072f8Seschrock B_TRUE : B_FALSE; 1583ea8dc4b6Seschrock vdev_reopen(vd->vdev_top); 15843d7072f8Seschrock vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 15853d7072f8Seschrock 15863d7072f8Seschrock if (newstate) 15873d7072f8Seschrock *newstate = vd->vdev_state; 15883d7072f8Seschrock if ((flags & ZFS_ONLINE_UNSPARE) && 15893d7072f8Seschrock !vdev_is_dead(vd) && vd->vdev_parent && 15903d7072f8Seschrock vd->vdev_parent->vdev_ops == &vdev_spare_ops && 15913d7072f8Seschrock vd->vdev_parent->vdev_child[0] == vd) 15923d7072f8Seschrock vd->vdev_unspare = B_TRUE; 1593fa9e4066Sahrens 1594441d80aaSlling vdev_config_dirty(vd->vdev_top); 1595441d80aaSlling 1596441d80aaSlling (void) spa_vdev_exit(spa, NULL, txg, 0); 1597fa9e4066Sahrens 15983d7072f8Seschrock /* 15993d7072f8Seschrock * Must hold spa_namespace_lock in order to post resilver sysevent 16003d7072f8Seschrock * w/pool name. 
16013d7072f8Seschrock */ 16023d7072f8Seschrock mutex_enter(&spa_namespace_lock); 1603fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 16043d7072f8Seschrock mutex_exit(&spa_namespace_lock); 1605fa9e4066Sahrens 1606fa9e4066Sahrens return (0); 1607fa9e4066Sahrens } 1608fa9e4066Sahrens 1609fa9e4066Sahrens int 16103d7072f8Seschrock vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 1611fa9e4066Sahrens { 1612441d80aaSlling vdev_t *rvd, *vd; 1613441d80aaSlling uint64_t txg; 1614fa9e4066Sahrens 1615441d80aaSlling txg = spa_vdev_enter(spa); 1616fa9e4066Sahrens 1617441d80aaSlling rvd = spa->spa_root_vdev; 16180e34b6a7Sbonwick 1619ea8dc4b6Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1620441d80aaSlling return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1621fa9e4066Sahrens 16220e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 16230e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 16240e34b6a7Sbonwick 1625fa9e4066Sahrens /* 1626ecc2d604Sbonwick * If the device isn't already offline, try to offline it. 1627fa9e4066Sahrens */ 1628ecc2d604Sbonwick if (!vd->vdev_offline) { 1629ecc2d604Sbonwick /* 1630ecc2d604Sbonwick * If this device's top-level vdev has a non-empty DTL, 1631ecc2d604Sbonwick * don't allow the device to be offlined. 1632ecc2d604Sbonwick * 1633ecc2d604Sbonwick * XXX -- make this more precise by allowing the offline 1634ecc2d604Sbonwick * as long as the remaining devices don't have any DTL holes. 1635ecc2d604Sbonwick */ 1636ecc2d604Sbonwick if (vd->vdev_top->vdev_dtl_map.sm_space != 0) 1637ecc2d604Sbonwick return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1638fa9e4066Sahrens 1639ecc2d604Sbonwick /* 1640ecc2d604Sbonwick * Offline this device and reopen its top-level vdev. 1641ecc2d604Sbonwick * If this action results in the top-level vdev becoming 1642ecc2d604Sbonwick * unusable, undo it and fail the request. 
1643ecc2d604Sbonwick */ 1644ecc2d604Sbonwick vd->vdev_offline = B_TRUE; 1645ea8dc4b6Seschrock vdev_reopen(vd->vdev_top); 1646ecc2d604Sbonwick if (vdev_is_dead(vd->vdev_top)) { 1647ecc2d604Sbonwick vd->vdev_offline = B_FALSE; 1648ecc2d604Sbonwick vdev_reopen(vd->vdev_top); 1649ecc2d604Sbonwick return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1650ecc2d604Sbonwick } 1651fa9e4066Sahrens } 1652fa9e4066Sahrens 16533d7072f8Seschrock vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ? 16543d7072f8Seschrock B_TRUE : B_FALSE; 1655ecc2d604Sbonwick 1656ecc2d604Sbonwick vdev_config_dirty(vd->vdev_top); 1657441d80aaSlling 1658441d80aaSlling return (spa_vdev_exit(spa, NULL, txg, 0)); 1659fa9e4066Sahrens } 1660fa9e4066Sahrens 1661ea8dc4b6Seschrock /* 1662ea8dc4b6Seschrock * Clear the error counts associated with this vdev. Unlike vdev_online() and 1663ea8dc4b6Seschrock * vdev_offline(), we assume the spa config is locked. We also clear all 1664ea8dc4b6Seschrock * children. If 'vd' is NULL, then the user wants to clear all vdevs. 1665ea8dc4b6Seschrock */ 1666ea8dc4b6Seschrock void 1667ea8dc4b6Seschrock vdev_clear(spa_t *spa, vdev_t *vd) 1668fa9e4066Sahrens { 1669ea8dc4b6Seschrock int c; 1670fa9e4066Sahrens 1671ea8dc4b6Seschrock if (vd == NULL) 1672ea8dc4b6Seschrock vd = spa->spa_root_vdev; 1673fa9e4066Sahrens 1674ea8dc4b6Seschrock vd->vdev_stat.vs_read_errors = 0; 1675ea8dc4b6Seschrock vd->vdev_stat.vs_write_errors = 0; 1676ea8dc4b6Seschrock vd->vdev_stat.vs_checksum_errors = 0; 1677fa9e4066Sahrens 1678ea8dc4b6Seschrock for (c = 0; c < vd->vdev_children; c++) 1679ea8dc4b6Seschrock vdev_clear(spa, vd->vdev_child[c]); 16803d7072f8Seschrock 16813d7072f8Seschrock /* 16823d7072f8Seschrock * If we're in the FAULTED state, then clear the persistent state and 16833d7072f8Seschrock * attempt to reopen the device. We also mark the vdev config dirty, so 16843d7072f8Seschrock * that the new faulted state is written out to disk. 
16853d7072f8Seschrock */ 16863d7072f8Seschrock if (vd->vdev_faulted || vd->vdev_degraded) { 16873d7072f8Seschrock vd->vdev_faulted = vd->vdev_degraded = 0; 16883d7072f8Seschrock vdev_reopen(vd); 16893d7072f8Seschrock vdev_config_dirty(vd->vdev_top); 16903d7072f8Seschrock 16913d7072f8Seschrock if (vd->vdev_faulted) 16923d7072f8Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, 16933d7072f8Seschrock B_TRUE) == 0); 16943d7072f8Seschrock 16953d7072f8Seschrock spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 16963d7072f8Seschrock } 1697fa9e4066Sahrens } 1698fa9e4066Sahrens 1699fa9e4066Sahrens int 1700fa9e4066Sahrens vdev_is_dead(vdev_t *vd) 1701fa9e4066Sahrens { 17023d7072f8Seschrock return (vd->vdev_state < VDEV_STATE_DEGRADED); 1703fa9e4066Sahrens } 1704fa9e4066Sahrens 1705fa9e4066Sahrens int 1706fa9e4066Sahrens vdev_error_inject(vdev_t *vd, zio_t *zio) 1707fa9e4066Sahrens { 1708fa9e4066Sahrens int error = 0; 1709fa9e4066Sahrens 1710fa9e4066Sahrens if (vd->vdev_fault_mode == VDEV_FAULT_NONE) 1711fa9e4066Sahrens return (0); 1712fa9e4066Sahrens 1713fa9e4066Sahrens if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) 1714fa9e4066Sahrens return (0); 1715fa9e4066Sahrens 1716fa9e4066Sahrens switch (vd->vdev_fault_mode) { 1717fa9e4066Sahrens case VDEV_FAULT_RANDOM: 1718fa9e4066Sahrens if (spa_get_random(vd->vdev_fault_arg) == 0) 1719fa9e4066Sahrens error = EIO; 1720fa9e4066Sahrens break; 1721fa9e4066Sahrens 1722fa9e4066Sahrens case VDEV_FAULT_COUNT: 1723fa9e4066Sahrens if ((int64_t)--vd->vdev_fault_arg <= 0) 1724fa9e4066Sahrens vd->vdev_fault_mode = VDEV_FAULT_NONE; 1725fa9e4066Sahrens error = EIO; 1726fa9e4066Sahrens break; 1727fa9e4066Sahrens } 1728fa9e4066Sahrens 1729fa9e4066Sahrens return (error); 1730fa9e4066Sahrens } 1731fa9e4066Sahrens 1732fa9e4066Sahrens /* 1733fa9e4066Sahrens * Get statistics for the given vdev. 
1734fa9e4066Sahrens */ 1735fa9e4066Sahrens void 1736fa9e4066Sahrens vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 1737fa9e4066Sahrens { 1738fa9e4066Sahrens vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 1739fa9e4066Sahrens int c, t; 1740fa9e4066Sahrens 1741fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1742fa9e4066Sahrens bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 1743fa9e4066Sahrens vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 1744fa9e4066Sahrens vs->vs_state = vd->vdev_state; 17452a79c5feSlling vs->vs_rsize = vdev_get_rsize(vd); 1746fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1747fa9e4066Sahrens 1748fa9e4066Sahrens /* 1749fa9e4066Sahrens * If we're getting stats on the root vdev, aggregate the I/O counts 1750fa9e4066Sahrens * over all top-level vdevs (i.e. the direct children of the root). 1751fa9e4066Sahrens */ 1752fa9e4066Sahrens if (vd == rvd) { 1753fa9e4066Sahrens for (c = 0; c < rvd->vdev_children; c++) { 1754fa9e4066Sahrens vdev_t *cvd = rvd->vdev_child[c]; 1755fa9e4066Sahrens vdev_stat_t *cvs = &cvd->vdev_stat; 1756fa9e4066Sahrens 1757fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1758fa9e4066Sahrens for (t = 0; t < ZIO_TYPES; t++) { 1759fa9e4066Sahrens vs->vs_ops[t] += cvs->vs_ops[t]; 1760fa9e4066Sahrens vs->vs_bytes[t] += cvs->vs_bytes[t]; 1761fa9e4066Sahrens } 1762fa9e4066Sahrens vs->vs_read_errors += cvs->vs_read_errors; 1763fa9e4066Sahrens vs->vs_write_errors += cvs->vs_write_errors; 1764fa9e4066Sahrens vs->vs_checksum_errors += cvs->vs_checksum_errors; 1765fa9e4066Sahrens vs->vs_scrub_examined += cvs->vs_scrub_examined; 1766fa9e4066Sahrens vs->vs_scrub_errors += cvs->vs_scrub_errors; 1767fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1768fa9e4066Sahrens } 1769fa9e4066Sahrens } 1770fa9e4066Sahrens } 1771fa9e4066Sahrens 1772fa9e4066Sahrens void 1773fa9e4066Sahrens vdev_stat_update(zio_t *zio) 1774fa9e4066Sahrens { 1775fa9e4066Sahrens vdev_t *vd = zio->io_vd; 1776fa9e4066Sahrens vdev_t *pvd; 1777fa9e4066Sahrens uint64_t txg = zio->io_txg; 
1778fa9e4066Sahrens vdev_stat_t *vs = &vd->vdev_stat; 1779fa9e4066Sahrens zio_type_t type = zio->io_type; 1780fa9e4066Sahrens int flags = zio->io_flags; 1781fa9e4066Sahrens 1782fa9e4066Sahrens if (zio->io_error == 0) { 1783fa9e4066Sahrens if (!(flags & ZIO_FLAG_IO_BYPASS)) { 1784fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1785fa9e4066Sahrens vs->vs_ops[type]++; 1786fa9e4066Sahrens vs->vs_bytes[type] += zio->io_size; 1787fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1788fa9e4066Sahrens } 1789fa9e4066Sahrens if ((flags & ZIO_FLAG_IO_REPAIR) && 1790fa9e4066Sahrens zio->io_delegate_list == NULL) { 1791fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1792d80c45e0Sbonwick if (flags & ZIO_FLAG_SCRUB_THREAD) 1793fa9e4066Sahrens vs->vs_scrub_repaired += zio->io_size; 1794fa9e4066Sahrens else 1795fa9e4066Sahrens vs->vs_self_healed += zio->io_size; 1796fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1797fa9e4066Sahrens } 1798fa9e4066Sahrens return; 1799fa9e4066Sahrens } 1800fa9e4066Sahrens 1801fa9e4066Sahrens if (flags & ZIO_FLAG_SPECULATIVE) 1802fa9e4066Sahrens return; 1803fa9e4066Sahrens 1804fa9e4066Sahrens if (!vdev_is_dead(vd)) { 1805fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1806fa9e4066Sahrens if (type == ZIO_TYPE_READ) { 1807fa9e4066Sahrens if (zio->io_error == ECKSUM) 1808fa9e4066Sahrens vs->vs_checksum_errors++; 1809fa9e4066Sahrens else 1810fa9e4066Sahrens vs->vs_read_errors++; 1811fa9e4066Sahrens } 1812fa9e4066Sahrens if (type == ZIO_TYPE_WRITE) 1813fa9e4066Sahrens vs->vs_write_errors++; 1814fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1815fa9e4066Sahrens } 1816fa9e4066Sahrens 1817fa9e4066Sahrens if (type == ZIO_TYPE_WRITE) { 1818fa9e4066Sahrens if (txg == 0 || vd->vdev_children != 0) 1819fa9e4066Sahrens return; 1820d80c45e0Sbonwick if (flags & ZIO_FLAG_SCRUB_THREAD) { 1821fa9e4066Sahrens ASSERT(flags & ZIO_FLAG_IO_REPAIR); 1822fa9e4066Sahrens for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) 1823fa9e4066Sahrens 
vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); 1824fa9e4066Sahrens } 1825fa9e4066Sahrens if (!(flags & ZIO_FLAG_IO_REPAIR)) { 1826fa9e4066Sahrens if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) 1827fa9e4066Sahrens return; 1828ecc2d604Sbonwick vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1829fa9e4066Sahrens for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) 1830fa9e4066Sahrens vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); 1831fa9e4066Sahrens } 1832fa9e4066Sahrens } 1833fa9e4066Sahrens } 1834fa9e4066Sahrens 1835fa9e4066Sahrens void 1836fa9e4066Sahrens vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) 1837fa9e4066Sahrens { 1838fa9e4066Sahrens int c; 1839fa9e4066Sahrens vdev_stat_t *vs = &vd->vdev_stat; 1840fa9e4066Sahrens 1841fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 1842fa9e4066Sahrens vdev_scrub_stat_update(vd->vdev_child[c], type, complete); 1843fa9e4066Sahrens 1844fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1845fa9e4066Sahrens 1846fa9e4066Sahrens if (type == POOL_SCRUB_NONE) { 1847fa9e4066Sahrens /* 1848fa9e4066Sahrens * Update completion and end time. Leave everything else alone 1849fa9e4066Sahrens * so we can report what happened during the previous scrub. 1850fa9e4066Sahrens */ 1851fa9e4066Sahrens vs->vs_scrub_complete = complete; 1852fa9e4066Sahrens vs->vs_scrub_end = gethrestime_sec(); 1853fa9e4066Sahrens } else { 1854fa9e4066Sahrens vs->vs_scrub_type = type; 1855fa9e4066Sahrens vs->vs_scrub_complete = 0; 1856fa9e4066Sahrens vs->vs_scrub_examined = 0; 1857fa9e4066Sahrens vs->vs_scrub_repaired = 0; 1858fa9e4066Sahrens vs->vs_scrub_errors = 0; 1859fa9e4066Sahrens vs->vs_scrub_start = gethrestime_sec(); 1860fa9e4066Sahrens vs->vs_scrub_end = 0; 1861fa9e4066Sahrens } 1862fa9e4066Sahrens 1863fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1864fa9e4066Sahrens } 1865fa9e4066Sahrens 1866fa9e4066Sahrens /* 1867fa9e4066Sahrens * Update the in-core space usage stats for this vdev and the root vdev. 
1868fa9e4066Sahrens */ 1869fa9e4066Sahrens void 187099653d4eSeschrock vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) 1871fa9e4066Sahrens { 187299653d4eSeschrock int64_t dspace_delta = space_delta; 1873*8654d025Sperrin spa_t *spa = vd->vdev_spa; 1874*8654d025Sperrin vdev_t *rvd = spa->spa_root_vdev; 1875fa9e4066Sahrens 1876*8654d025Sperrin ASSERT(vd == vd->vdev_top); 1877*8654d025Sperrin ASSERT(rvd == vd->vdev_parent); 1878*8654d025Sperrin ASSERT(vd->vdev_ms_count != 0); 187999653d4eSeschrock 1880*8654d025Sperrin /* 1881*8654d025Sperrin * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 1882*8654d025Sperrin * factor. We must calculate this here and not at the root vdev 1883*8654d025Sperrin * because the root vdev's psize-to-asize is simply the max of its 1884*8654d025Sperrin * childrens', thus not accurate enough for us. 1885*8654d025Sperrin */ 1886*8654d025Sperrin ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 1887*8654d025Sperrin dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 1888*8654d025Sperrin vd->vdev_deflate_ratio; 1889*8654d025Sperrin 1890*8654d025Sperrin mutex_enter(&vd->vdev_stat_lock); 1891*8654d025Sperrin vd->vdev_stat.vs_space += space_delta; 1892*8654d025Sperrin vd->vdev_stat.vs_alloc += alloc_delta; 1893*8654d025Sperrin vd->vdev_stat.vs_dspace += dspace_delta; 1894*8654d025Sperrin mutex_exit(&vd->vdev_stat_lock); 1895*8654d025Sperrin 1896*8654d025Sperrin /* 1897*8654d025Sperrin * Don't count non-normal (e.g. intent log) space as part of 1898*8654d025Sperrin * the pool's capacity. 
1899*8654d025Sperrin */ 1900*8654d025Sperrin if (vd->vdev_mg->mg_class != spa->spa_normal_class) 1901*8654d025Sperrin return; 1902*8654d025Sperrin 1903*8654d025Sperrin mutex_enter(&rvd->vdev_stat_lock); 1904*8654d025Sperrin rvd->vdev_stat.vs_space += space_delta; 1905*8654d025Sperrin rvd->vdev_stat.vs_alloc += alloc_delta; 1906*8654d025Sperrin rvd->vdev_stat.vs_dspace += dspace_delta; 1907*8654d025Sperrin mutex_exit(&rvd->vdev_stat_lock); 1908fa9e4066Sahrens } 1909fa9e4066Sahrens 1910fa9e4066Sahrens /* 1911fa9e4066Sahrens * Mark a top-level vdev's config as dirty, placing it on the dirty list 1912fa9e4066Sahrens * so that it will be written out next time the vdev configuration is synced. 1913fa9e4066Sahrens * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 1914fa9e4066Sahrens */ 1915fa9e4066Sahrens void 1916fa9e4066Sahrens vdev_config_dirty(vdev_t *vd) 1917fa9e4066Sahrens { 1918fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 1919fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1920fa9e4066Sahrens int c; 1921fa9e4066Sahrens 19225dabedeeSbonwick /* 19235dabedeeSbonwick * The dirty list is protected by the config lock. The caller must 19245dabedeeSbonwick * either hold the config lock as writer, or must be the sync thread 19255dabedeeSbonwick * (which holds the lock as reader). There's only one sync thread, 19265dabedeeSbonwick * so this is sufficient to ensure mutual exclusion. 
19275dabedeeSbonwick */ 19285dabedeeSbonwick ASSERT(spa_config_held(spa, RW_WRITER) || 19295dabedeeSbonwick dsl_pool_sync_context(spa_get_dsl(spa))); 19305dabedeeSbonwick 1931fa9e4066Sahrens if (vd == rvd) { 1932fa9e4066Sahrens for (c = 0; c < rvd->vdev_children; c++) 1933fa9e4066Sahrens vdev_config_dirty(rvd->vdev_child[c]); 1934fa9e4066Sahrens } else { 1935fa9e4066Sahrens ASSERT(vd == vd->vdev_top); 1936fa9e4066Sahrens 1937ecc2d604Sbonwick if (!list_link_active(&vd->vdev_dirty_node)) 1938fa9e4066Sahrens list_insert_head(&spa->spa_dirty_list, vd); 1939fa9e4066Sahrens } 1940fa9e4066Sahrens } 1941fa9e4066Sahrens 1942fa9e4066Sahrens void 1943fa9e4066Sahrens vdev_config_clean(vdev_t *vd) 1944fa9e4066Sahrens { 19455dabedeeSbonwick spa_t *spa = vd->vdev_spa; 19465dabedeeSbonwick 19475dabedeeSbonwick ASSERT(spa_config_held(spa, RW_WRITER) || 19485dabedeeSbonwick dsl_pool_sync_context(spa_get_dsl(spa))); 19495dabedeeSbonwick 1950ecc2d604Sbonwick ASSERT(list_link_active(&vd->vdev_dirty_node)); 19515dabedeeSbonwick list_remove(&spa->spa_dirty_list, vd); 1952fa9e4066Sahrens } 1953fa9e4066Sahrens 195444cd46caSbillm void 195544cd46caSbillm vdev_propagate_state(vdev_t *vd) 195644cd46caSbillm { 195744cd46caSbillm vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 195844cd46caSbillm int degraded = 0, faulted = 0; 195944cd46caSbillm int corrupted = 0; 196044cd46caSbillm int c; 196144cd46caSbillm vdev_t *child; 196244cd46caSbillm 19633d7072f8Seschrock if (vd->vdev_children > 0) { 19643d7072f8Seschrock for (c = 0; c < vd->vdev_children; c++) { 19653d7072f8Seschrock child = vd->vdev_child[c]; 19663d7072f8Seschrock if (vdev_is_dead(child)) 19673d7072f8Seschrock faulted++; 19683d7072f8Seschrock else if (child->vdev_state == VDEV_STATE_DEGRADED) 19693d7072f8Seschrock degraded++; 197044cd46caSbillm 19713d7072f8Seschrock if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 19723d7072f8Seschrock corrupted++; 19733d7072f8Seschrock } 197444cd46caSbillm 19753d7072f8Seschrock 
vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 19763d7072f8Seschrock 19773d7072f8Seschrock /* 19783d7072f8Seschrock * Root special: if there is a toplevel vdev that cannot be 19793d7072f8Seschrock * opened due to corrupted metadata, then propagate the root 19803d7072f8Seschrock * vdev's aux state as 'corrupt' rather than 'insufficient 19813d7072f8Seschrock * replicas'. 19823d7072f8Seschrock */ 19833d7072f8Seschrock if (corrupted && vd == rvd && 19843d7072f8Seschrock rvd->vdev_state == VDEV_STATE_CANT_OPEN) 19853d7072f8Seschrock vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 19863d7072f8Seschrock VDEV_AUX_CORRUPT_DATA); 19873d7072f8Seschrock } 19883d7072f8Seschrock 1989*8654d025Sperrin if (vd->vdev_parent && !vd->vdev_islog) 19903d7072f8Seschrock vdev_propagate_state(vd->vdev_parent); 199144cd46caSbillm } 199244cd46caSbillm 1993fa9e4066Sahrens /* 1994ea8dc4b6Seschrock * Set a vdev's state. If this is during an open, we don't update the parent 1995ea8dc4b6Seschrock * state, because we're in the process of opening children depth-first. 1996ea8dc4b6Seschrock * Otherwise, we propagate the change to the parent. 1997ea8dc4b6Seschrock * 1998ea8dc4b6Seschrock * If this routine places a device in a faulted state, an appropriate ereport is 1999ea8dc4b6Seschrock * generated. 
2000fa9e4066Sahrens */ 2001fa9e4066Sahrens void 2002ea8dc4b6Seschrock vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 2003fa9e4066Sahrens { 2004560e6e96Seschrock uint64_t save_state; 2005ea8dc4b6Seschrock 2006ea8dc4b6Seschrock if (state == vd->vdev_state) { 2007ea8dc4b6Seschrock vd->vdev_stat.vs_aux = aux; 2008fa9e4066Sahrens return; 2009ea8dc4b6Seschrock } 2010ea8dc4b6Seschrock 2011560e6e96Seschrock save_state = vd->vdev_state; 2012fa9e4066Sahrens 2013fa9e4066Sahrens vd->vdev_state = state; 2014fa9e4066Sahrens vd->vdev_stat.vs_aux = aux; 2015fa9e4066Sahrens 20163d7072f8Seschrock /* 20173d7072f8Seschrock * If we are setting the vdev state to anything but an open state, then 20183d7072f8Seschrock * always close the underlying device. Otherwise, we keep accessible 20193d7072f8Seschrock * but invalid devices open forever. We don't call vdev_close() itself, 20203d7072f8Seschrock * because that implies some extra checks (offline, etc) that we don't 20213d7072f8Seschrock * want here. This is limited to leaf devices, because otherwise 20223d7072f8Seschrock * closing the device will affect other children. 20233d7072f8Seschrock */ 20243d7072f8Seschrock if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) 20253d7072f8Seschrock vd->vdev_ops->vdev_op_close(vd); 20263d7072f8Seschrock 20273d7072f8Seschrock if (vd->vdev_removed && 20283d7072f8Seschrock state == VDEV_STATE_CANT_OPEN && 20293d7072f8Seschrock (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 20303d7072f8Seschrock /* 20313d7072f8Seschrock * If the previous state is set to VDEV_STATE_REMOVED, then this 20323d7072f8Seschrock * device was previously marked removed and someone attempted to 20333d7072f8Seschrock * reopen it. If this failed due to a nonexistent device, then 20343d7072f8Seschrock * keep the device in the REMOVED state. 
We also let this be if 20353d7072f8Seschrock * it is one of our special test online cases, which is only 20363d7072f8Seschrock * attempting to online the device and shouldn't generate an FMA 20373d7072f8Seschrock * fault. 20383d7072f8Seschrock */ 20393d7072f8Seschrock vd->vdev_state = VDEV_STATE_REMOVED; 20403d7072f8Seschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 20413d7072f8Seschrock } else if (state == VDEV_STATE_REMOVED) { 20423d7072f8Seschrock /* 20433d7072f8Seschrock * Indicate to the ZFS DE that this device has been removed, and 20443d7072f8Seschrock * any recent errors should be ignored. 20453d7072f8Seschrock */ 20463d7072f8Seschrock zfs_post_remove(vd->vdev_spa, vd); 20473d7072f8Seschrock vd->vdev_removed = B_TRUE; 20483d7072f8Seschrock } else if (state == VDEV_STATE_CANT_OPEN) { 2049ea8dc4b6Seschrock /* 2050ea8dc4b6Seschrock * If we fail to open a vdev during an import, we mark it as 2051ea8dc4b6Seschrock * "not available", which signifies that it was never there to 2052ea8dc4b6Seschrock * begin with. Failure to open such a device is not considered 2053ea8dc4b6Seschrock * an error. 2054ea8dc4b6Seschrock */ 2055560e6e96Seschrock if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT && 2056560e6e96Seschrock vd->vdev_ops->vdev_op_leaf) 2057560e6e96Seschrock vd->vdev_not_present = 1; 2058560e6e96Seschrock 2059560e6e96Seschrock /* 2060560e6e96Seschrock * Post the appropriate ereport. If the 'prevstate' field is 2061560e6e96Seschrock * set to something other than VDEV_STATE_UNKNOWN, it indicates 2062560e6e96Seschrock * that this is part of a vdev_reopen(). In this case, we don't 2063560e6e96Seschrock * want to post the ereport if the device was already in the 2064560e6e96Seschrock * CANT_OPEN state beforehand. 20653d7072f8Seschrock * 20663d7072f8Seschrock * If the 'checkremove' flag is set, then this is an attempt to 20673d7072f8Seschrock * online the device in response to an insertion event. 
If we 20683d7072f8Seschrock * hit this case, then we have detected an insertion event for a 20693d7072f8Seschrock * faulted or offline device that wasn't in the removed state. 20703d7072f8Seschrock * In this scenario, we don't post an ereport because we are 20713d7072f8Seschrock * about to replace the device, or attempt an online with 20723d7072f8Seschrock * vdev_forcefault, which will generate the fault for us. 2073560e6e96Seschrock */ 20743d7072f8Seschrock if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 20753d7072f8Seschrock !vd->vdev_not_present && !vd->vdev_checkremove && 2076ea8dc4b6Seschrock vd != vd->vdev_spa->spa_root_vdev) { 2077ea8dc4b6Seschrock const char *class; 2078ea8dc4b6Seschrock 2079ea8dc4b6Seschrock switch (aux) { 2080ea8dc4b6Seschrock case VDEV_AUX_OPEN_FAILED: 2081ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 2082ea8dc4b6Seschrock break; 2083ea8dc4b6Seschrock case VDEV_AUX_CORRUPT_DATA: 2084ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 2085ea8dc4b6Seschrock break; 2086ea8dc4b6Seschrock case VDEV_AUX_NO_REPLICAS: 2087ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 2088ea8dc4b6Seschrock break; 2089ea8dc4b6Seschrock case VDEV_AUX_BAD_GUID_SUM: 2090ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 2091ea8dc4b6Seschrock break; 2092ea8dc4b6Seschrock case VDEV_AUX_TOO_SMALL: 2093ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 2094ea8dc4b6Seschrock break; 2095ea8dc4b6Seschrock case VDEV_AUX_BAD_LABEL: 2096ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 2097ea8dc4b6Seschrock break; 2098ea8dc4b6Seschrock default: 2099ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 2100ea8dc4b6Seschrock } 2101ea8dc4b6Seschrock 2102ea8dc4b6Seschrock zfs_ereport_post(class, vd->vdev_spa, 2103560e6e96Seschrock vd, NULL, save_state, 0); 2104ea8dc4b6Seschrock } 2105ea8dc4b6Seschrock 21063d7072f8Seschrock /* Erase any notion of persistent removed state */ 21073d7072f8Seschrock 
vd->vdev_removed = B_FALSE; 21083d7072f8Seschrock } else { 21093d7072f8Seschrock vd->vdev_removed = B_FALSE; 21103d7072f8Seschrock } 2111ea8dc4b6Seschrock 21123d7072f8Seschrock if (!isopen) 21133d7072f8Seschrock vdev_propagate_state(vd); 2114fa9e4066Sahrens } 2115