1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5441d80aaSlling * Common Development and Distribution License (the "License"). 6441d80aaSlling * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 2199653d4eSeschrock 22fa9e4066Sahrens /* 2339c23413Seschrock * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24fa9e4066Sahrens * Use is subject to license terms. 25fa9e4066Sahrens */ 26fa9e4066Sahrens 27fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28fa9e4066Sahrens 29fa9e4066Sahrens #include <sys/zfs_context.h> 30ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h> 31fa9e4066Sahrens #include <sys/spa.h> 32fa9e4066Sahrens #include <sys/spa_impl.h> 33fa9e4066Sahrens #include <sys/dmu.h> 34fa9e4066Sahrens #include <sys/dmu_tx.h> 35fa9e4066Sahrens #include <sys/vdev_impl.h> 36fa9e4066Sahrens #include <sys/uberblock_impl.h> 37fa9e4066Sahrens #include <sys/metaslab.h> 38fa9e4066Sahrens #include <sys/metaslab_impl.h> 39fa9e4066Sahrens #include <sys/space_map.h> 40fa9e4066Sahrens #include <sys/zio.h> 41fa9e4066Sahrens #include <sys/zap.h> 42fa9e4066Sahrens #include <sys/fs/zfs.h> 43fa9e4066Sahrens 44fa9e4066Sahrens /* 45fa9e4066Sahrens * Virtual device management. 46fa9e4066Sahrens */ 47fa9e4066Sahrens 48fa9e4066Sahrens static vdev_ops_t *vdev_ops_table[] = { 49fa9e4066Sahrens &vdev_root_ops, 50fa9e4066Sahrens &vdev_raidz_ops, 51fa9e4066Sahrens &vdev_mirror_ops, 52fa9e4066Sahrens &vdev_replacing_ops, 5399653d4eSeschrock &vdev_spare_ops, 54fa9e4066Sahrens &vdev_disk_ops, 55fa9e4066Sahrens &vdev_file_ops, 56fa9e4066Sahrens &vdev_missing_ops, 57fa9e4066Sahrens NULL 58fa9e4066Sahrens }; 59fa9e4066Sahrens 60*05b2b3b8Smishra /* maximum scrub/resilver I/O queue */ 61*05b2b3b8Smishra int zfs_scrub_limit = 70; 62*05b2b3b8Smishra 63fa9e4066Sahrens /* 64fa9e4066Sahrens * Given a vdev type, return the appropriate ops vector. 65fa9e4066Sahrens */ 66fa9e4066Sahrens static vdev_ops_t * 67fa9e4066Sahrens vdev_getops(const char *type) 68fa9e4066Sahrens { 69fa9e4066Sahrens vdev_ops_t *ops, **opspp; 70fa9e4066Sahrens 71fa9e4066Sahrens for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 72fa9e4066Sahrens if (strcmp(ops->vdev_op_type, type) == 0) 73fa9e4066Sahrens break; 74fa9e4066Sahrens 75fa9e4066Sahrens return (ops); 76fa9e4066Sahrens } 77fa9e4066Sahrens 78fa9e4066Sahrens /* 79fa9e4066Sahrens * Default asize function: return the MAX of psize with the asize of 80fa9e4066Sahrens * all children. This is what's used by anything other than RAID-Z. 81fa9e4066Sahrens */ 82fa9e4066Sahrens uint64_t 83fa9e4066Sahrens vdev_default_asize(vdev_t *vd, uint64_t psize) 84fa9e4066Sahrens { 85ecc2d604Sbonwick uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 86fa9e4066Sahrens uint64_t csize; 87fa9e4066Sahrens uint64_t c; 88fa9e4066Sahrens 89fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) { 90fa9e4066Sahrens csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 91fa9e4066Sahrens asize = MAX(asize, csize); 92fa9e4066Sahrens } 93fa9e4066Sahrens 94fa9e4066Sahrens return (asize); 95fa9e4066Sahrens } 96fa9e4066Sahrens 972a79c5feSlling /* 982a79c5feSlling * Get the replaceable or attachable device size. 992a79c5feSlling * If the parent is a mirror or raidz, the replaceable size is the minimum 1002a79c5feSlling * psize of all its children. For the rest, just return our own psize. 1012a79c5feSlling * 1022a79c5feSlling * e.g. 1032a79c5feSlling * psize rsize 1042a79c5feSlling * root - - 1052a79c5feSlling * mirror/raidz - - 1062a79c5feSlling * disk1 20g 20g 1072a79c5feSlling * disk2 40g 20g 1082a79c5feSlling * disk3 80g 80g 1092a79c5feSlling */ 1102a79c5feSlling uint64_t 1112a79c5feSlling vdev_get_rsize(vdev_t *vd) 1122a79c5feSlling { 1132a79c5feSlling vdev_t *pvd, *cvd; 1142a79c5feSlling uint64_t c, rsize; 1152a79c5feSlling 1162a79c5feSlling pvd = vd->vdev_parent; 1172a79c5feSlling 1182a79c5feSlling /* 1192a79c5feSlling * If our parent is NULL or the root, just return our own psize. 1202a79c5feSlling */ 1212a79c5feSlling if (pvd == NULL || pvd->vdev_parent == NULL) 1222a79c5feSlling return (vd->vdev_psize); 1232a79c5feSlling 1242a79c5feSlling rsize = 0; 1252a79c5feSlling 1262a79c5feSlling for (c = 0; c < pvd->vdev_children; c++) { 1272a79c5feSlling cvd = pvd->vdev_child[c]; 1282a79c5feSlling rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; 1292a79c5feSlling } 1302a79c5feSlling 1312a79c5feSlling return (rsize); 1322a79c5feSlling } 1332a79c5feSlling 134fa9e4066Sahrens vdev_t * 135fa9e4066Sahrens vdev_lookup_top(spa_t *spa, uint64_t vdev) 136fa9e4066Sahrens { 137fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 138fa9e4066Sahrens 139fa9e4066Sahrens if (vdev < rvd->vdev_children) 140fa9e4066Sahrens return (rvd->vdev_child[vdev]); 141fa9e4066Sahrens 142fa9e4066Sahrens return (NULL); 143fa9e4066Sahrens } 144fa9e4066Sahrens 145fa9e4066Sahrens vdev_t * 146fa9e4066Sahrens vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 147fa9e4066Sahrens { 148fa9e4066Sahrens int c; 149fa9e4066Sahrens vdev_t *mvd; 150fa9e4066Sahrens 1510e34b6a7Sbonwick if (vd->vdev_guid == guid) 152fa9e4066Sahrens return (vd); 153fa9e4066Sahrens 154fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 155fa9e4066Sahrens if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 156fa9e4066Sahrens NULL) 157fa9e4066Sahrens return (mvd); 158fa9e4066Sahrens 159fa9e4066Sahrens return (NULL); 160fa9e4066Sahrens } 161fa9e4066Sahrens 162fa9e4066Sahrens void 163fa9e4066Sahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd) 164fa9e4066Sahrens { 165fa9e4066Sahrens size_t oldsize, newsize; 166fa9e4066Sahrens uint64_t id = cvd->vdev_id; 167fa9e4066Sahrens vdev_t **newchild; 168fa9e4066Sahrens 169fa9e4066Sahrens ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); 170fa9e4066Sahrens ASSERT(cvd->vdev_parent == NULL); 171fa9e4066Sahrens 172fa9e4066Sahrens cvd->vdev_parent = pvd; 173fa9e4066Sahrens 174fa9e4066Sahrens if (pvd == NULL) 175fa9e4066Sahrens return; 176fa9e4066Sahrens 177fa9e4066Sahrens ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 178fa9e4066Sahrens 179fa9e4066Sahrens oldsize = pvd->vdev_children * sizeof (vdev_t *); 180fa9e4066Sahrens pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 181fa9e4066Sahrens newsize = pvd->vdev_children * sizeof (vdev_t *); 182fa9e4066Sahrens 183fa9e4066Sahrens newchild = kmem_zalloc(newsize, KM_SLEEP); 184fa9e4066Sahrens if (pvd->vdev_child != NULL) { 185fa9e4066Sahrens bcopy(pvd->vdev_child, newchild, oldsize); 186fa9e4066Sahrens kmem_free(pvd->vdev_child, oldsize); 187fa9e4066Sahrens } 188fa9e4066Sahrens 189fa9e4066Sahrens pvd->vdev_child = newchild; 190fa9e4066Sahrens pvd->vdev_child[id] = cvd; 191fa9e4066Sahrens 192fa9e4066Sahrens cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 193fa9e4066Sahrens ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 194fa9e4066Sahrens 195fa9e4066Sahrens /* 196fa9e4066Sahrens * Walk up all ancestors to update guid sum. 197fa9e4066Sahrens */ 198fa9e4066Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 199fa9e4066Sahrens pvd->vdev_guid_sum += cvd->vdev_guid_sum; 200*05b2b3b8Smishra 201*05b2b3b8Smishra if (cvd->vdev_ops->vdev_op_leaf) 202*05b2b3b8Smishra cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; 203fa9e4066Sahrens } 204fa9e4066Sahrens 205fa9e4066Sahrens void 206fa9e4066Sahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 207fa9e4066Sahrens { 208fa9e4066Sahrens int c; 209fa9e4066Sahrens uint_t id = cvd->vdev_id; 210fa9e4066Sahrens 211fa9e4066Sahrens ASSERT(cvd->vdev_parent == pvd); 212fa9e4066Sahrens 213fa9e4066Sahrens if (pvd == NULL) 214fa9e4066Sahrens return; 215fa9e4066Sahrens 216fa9e4066Sahrens ASSERT(id < pvd->vdev_children); 217fa9e4066Sahrens ASSERT(pvd->vdev_child[id] == cvd); 218fa9e4066Sahrens 219fa9e4066Sahrens pvd->vdev_child[id] = NULL; 220fa9e4066Sahrens cvd->vdev_parent = NULL; 221fa9e4066Sahrens 222fa9e4066Sahrens for (c = 0; c < pvd->vdev_children; c++) 223fa9e4066Sahrens if (pvd->vdev_child[c]) 224fa9e4066Sahrens break; 225fa9e4066Sahrens 226fa9e4066Sahrens if (c == pvd->vdev_children) { 227fa9e4066Sahrens kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 228fa9e4066Sahrens pvd->vdev_child = NULL; 229fa9e4066Sahrens pvd->vdev_children = 0; 230fa9e4066Sahrens } 231fa9e4066Sahrens 232fa9e4066Sahrens /* 233fa9e4066Sahrens * Walk up all ancestors to update guid sum. 234fa9e4066Sahrens */ 235fa9e4066Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 236fa9e4066Sahrens pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 237*05b2b3b8Smishra 238*05b2b3b8Smishra if (cvd->vdev_ops->vdev_op_leaf) 239*05b2b3b8Smishra cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; 240fa9e4066Sahrens } 241fa9e4066Sahrens 242fa9e4066Sahrens /* 243fa9e4066Sahrens * Remove any holes in the child array. 244fa9e4066Sahrens */ 245fa9e4066Sahrens void 246fa9e4066Sahrens vdev_compact_children(vdev_t *pvd) 247fa9e4066Sahrens { 248fa9e4066Sahrens vdev_t **newchild, *cvd; 249fa9e4066Sahrens int oldc = pvd->vdev_children; 250fa9e4066Sahrens int newc, c; 251fa9e4066Sahrens 252fa9e4066Sahrens ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); 253fa9e4066Sahrens 254fa9e4066Sahrens for (c = newc = 0; c < oldc; c++) 255fa9e4066Sahrens if (pvd->vdev_child[c]) 256fa9e4066Sahrens newc++; 257fa9e4066Sahrens 258fa9e4066Sahrens newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 259fa9e4066Sahrens 260fa9e4066Sahrens for (c = newc = 0; c < oldc; c++) { 261fa9e4066Sahrens if ((cvd = pvd->vdev_child[c]) != NULL) { 262fa9e4066Sahrens newchild[newc] = cvd; 263fa9e4066Sahrens cvd->vdev_id = newc++; 264fa9e4066Sahrens } 265fa9e4066Sahrens } 266fa9e4066Sahrens 267fa9e4066Sahrens kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 268fa9e4066Sahrens pvd->vdev_child = newchild; 269fa9e4066Sahrens pvd->vdev_children = newc; 270fa9e4066Sahrens } 271fa9e4066Sahrens 272fa9e4066Sahrens /* 273fa9e4066Sahrens * Allocate and minimally initialize a vdev_t. 274fa9e4066Sahrens */ 275fa9e4066Sahrens static vdev_t * 276fa9e4066Sahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 277fa9e4066Sahrens { 278fa9e4066Sahrens vdev_t *vd; 279fa9e4066Sahrens 280fa9e4066Sahrens vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 281fa9e4066Sahrens 2820e34b6a7Sbonwick if (spa->spa_root_vdev == NULL) { 2830e34b6a7Sbonwick ASSERT(ops == &vdev_root_ops); 2840e34b6a7Sbonwick spa->spa_root_vdev = vd; 2850e34b6a7Sbonwick } 2860e34b6a7Sbonwick 2870e34b6a7Sbonwick if (guid == 0) { 2880e34b6a7Sbonwick if (spa->spa_root_vdev == vd) { 2890e34b6a7Sbonwick /* 2900e34b6a7Sbonwick * The root vdev's guid will also be the pool guid, 2910e34b6a7Sbonwick * which must be unique among all pools. 2920e34b6a7Sbonwick */ 2930e34b6a7Sbonwick while (guid == 0 || spa_guid_exists(guid, 0)) 2940e34b6a7Sbonwick guid = spa_get_random(-1ULL); 2950e34b6a7Sbonwick } else { 2960e34b6a7Sbonwick /* 2970e34b6a7Sbonwick * Any other vdev's guid must be unique within the pool. 2980e34b6a7Sbonwick */ 2990e34b6a7Sbonwick while (guid == 0 || 3000e34b6a7Sbonwick spa_guid_exists(spa_guid(spa), guid)) 3010e34b6a7Sbonwick guid = spa_get_random(-1ULL); 3020e34b6a7Sbonwick } 3030e34b6a7Sbonwick ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 3040e34b6a7Sbonwick } 3050e34b6a7Sbonwick 306fa9e4066Sahrens vd->vdev_spa = spa; 307fa9e4066Sahrens vd->vdev_id = id; 308fa9e4066Sahrens vd->vdev_guid = guid; 309fa9e4066Sahrens vd->vdev_guid_sum = guid; 310fa9e4066Sahrens vd->vdev_ops = ops; 311fa9e4066Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 312fa9e4066Sahrens 313fa9e4066Sahrens mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 3145ad82045Snd mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 315fa9e4066Sahrens space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); 316fa9e4066Sahrens space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); 317fa9e4066Sahrens txg_list_create(&vd->vdev_ms_list, 318fa9e4066Sahrens offsetof(struct metaslab, ms_txg_node)); 319fa9e4066Sahrens txg_list_create(&vd->vdev_dtl_list, 320fa9e4066Sahrens offsetof(struct vdev, vdev_dtl_node)); 321fa9e4066Sahrens vd->vdev_stat.vs_timestamp = gethrtime(); 322fa9e4066Sahrens 323fa9e4066Sahrens return (vd); 324fa9e4066Sahrens } 325fa9e4066Sahrens 326fa9e4066Sahrens /* 327fa9e4066Sahrens * Free a vdev_t that has been removed from service. 328fa9e4066Sahrens */ 329fa9e4066Sahrens static void 330fa9e4066Sahrens vdev_free_common(vdev_t *vd) 331fa9e4066Sahrens { 3320e34b6a7Sbonwick spa_t *spa = vd->vdev_spa; 3330e34b6a7Sbonwick 334fa9e4066Sahrens if (vd->vdev_path) 335fa9e4066Sahrens spa_strfree(vd->vdev_path); 336fa9e4066Sahrens if (vd->vdev_devid) 337fa9e4066Sahrens spa_strfree(vd->vdev_devid); 338fa9e4066Sahrens 33999653d4eSeschrock if (vd->vdev_isspare) 34039c23413Seschrock spa_spare_remove(vd); 34199653d4eSeschrock 342fa9e4066Sahrens txg_list_destroy(&vd->vdev_ms_list); 343fa9e4066Sahrens txg_list_destroy(&vd->vdev_dtl_list); 344fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 345ecc2d604Sbonwick space_map_unload(&vd->vdev_dtl_map); 346fa9e4066Sahrens space_map_destroy(&vd->vdev_dtl_map); 347fa9e4066Sahrens space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 348fa9e4066Sahrens space_map_destroy(&vd->vdev_dtl_scrub); 349fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 350fa9e4066Sahrens mutex_destroy(&vd->vdev_dtl_lock); 3515ad82045Snd mutex_destroy(&vd->vdev_stat_lock); 352fa9e4066Sahrens 3530e34b6a7Sbonwick if (vd == spa->spa_root_vdev) 3540e34b6a7Sbonwick spa->spa_root_vdev = NULL; 3550e34b6a7Sbonwick 356fa9e4066Sahrens kmem_free(vd, sizeof (vdev_t)); 357fa9e4066Sahrens } 358fa9e4066Sahrens 359fa9e4066Sahrens /* 360fa9e4066Sahrens * Allocate a new vdev. The 'alloctype' is used to control whether we are 361fa9e4066Sahrens * creating a new vdev or loading an existing one - the behavior is slightly 362fa9e4066Sahrens * different for each case. 363fa9e4066Sahrens */ 36499653d4eSeschrock int 36599653d4eSeschrock vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 36699653d4eSeschrock int alloctype) 367fa9e4066Sahrens { 368fa9e4066Sahrens vdev_ops_t *ops; 369fa9e4066Sahrens char *type; 370ecc2d604Sbonwick uint64_t guid = 0; 371fa9e4066Sahrens vdev_t *vd; 372fa9e4066Sahrens 373fa9e4066Sahrens ASSERT(spa_config_held(spa, RW_WRITER)); 374fa9e4066Sahrens 375fa9e4066Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 37699653d4eSeschrock return (EINVAL); 377fa9e4066Sahrens 378fa9e4066Sahrens if ((ops = vdev_getops(type)) == NULL) 37999653d4eSeschrock return (EINVAL); 380fa9e4066Sahrens 381fa9e4066Sahrens /* 382fa9e4066Sahrens * If this is a load, get the vdev guid from the nvlist. 383fa9e4066Sahrens * Otherwise, vdev_alloc_common() will generate one for us. 384fa9e4066Sahrens */ 385fa9e4066Sahrens if (alloctype == VDEV_ALLOC_LOAD) { 386fa9e4066Sahrens uint64_t label_id; 387fa9e4066Sahrens 388fa9e4066Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 389fa9e4066Sahrens label_id != id) 39099653d4eSeschrock return (EINVAL); 391fa9e4066Sahrens 392fa9e4066Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 39399653d4eSeschrock return (EINVAL); 39499653d4eSeschrock } else if (alloctype == VDEV_ALLOC_SPARE) { 39599653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 39699653d4eSeschrock return (EINVAL); 397fa9e4066Sahrens } 398fa9e4066Sahrens 39999653d4eSeschrock /* 40099653d4eSeschrock * The first allocated vdev must be of type 'root'. 40199653d4eSeschrock */ 40299653d4eSeschrock if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 40399653d4eSeschrock return (EINVAL); 40499653d4eSeschrock 405fa9e4066Sahrens vd = vdev_alloc_common(spa, id, guid, ops); 406fa9e4066Sahrens 407fa9e4066Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 408fa9e4066Sahrens vd->vdev_path = spa_strdup(vd->vdev_path); 409fa9e4066Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 410fa9e4066Sahrens vd->vdev_devid = spa_strdup(vd->vdev_devid); 411fa9e4066Sahrens 41299653d4eSeschrock /* 41399653d4eSeschrock * Set the nparity propery for RAID-Z vdevs. 41499653d4eSeschrock */ 41599653d4eSeschrock if (ops == &vdev_raidz_ops) { 41699653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 41799653d4eSeschrock &vd->vdev_nparity) == 0) { 41899653d4eSeschrock /* 41999653d4eSeschrock * Currently, we can only support 2 parity devices. 42099653d4eSeschrock */ 42199653d4eSeschrock if (vd->vdev_nparity > 2) 42299653d4eSeschrock return (EINVAL); 42399653d4eSeschrock /* 42499653d4eSeschrock * Older versions can only support 1 parity device. 42599653d4eSeschrock */ 42699653d4eSeschrock if (vd->vdev_nparity == 2 && 42799653d4eSeschrock spa_version(spa) < ZFS_VERSION_RAID6) 42899653d4eSeschrock return (ENOTSUP); 42999653d4eSeschrock 43099653d4eSeschrock } else { 43199653d4eSeschrock /* 43299653d4eSeschrock * We require the parity to be specified for SPAs that 43399653d4eSeschrock * support multiple parity levels. 43499653d4eSeschrock */ 43599653d4eSeschrock if (spa_version(spa) >= ZFS_VERSION_RAID6) 43699653d4eSeschrock return (EINVAL); 43799653d4eSeschrock 43899653d4eSeschrock /* 43999653d4eSeschrock * Otherwise, we default to 1 parity device for RAID-Z. 44099653d4eSeschrock */ 44199653d4eSeschrock vd->vdev_nparity = 1; 44299653d4eSeschrock } 44399653d4eSeschrock } else { 44499653d4eSeschrock vd->vdev_nparity = 0; 44599653d4eSeschrock } 44699653d4eSeschrock 447afefbcddSeschrock /* 448afefbcddSeschrock * Set the whole_disk property. If it's not specified, leave the value 449afefbcddSeschrock * as -1. 450afefbcddSeschrock */ 451afefbcddSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 452afefbcddSeschrock &vd->vdev_wholedisk) != 0) 453afefbcddSeschrock vd->vdev_wholedisk = -1ULL; 454afefbcddSeschrock 455ea8dc4b6Seschrock /* 456ea8dc4b6Seschrock * Look for the 'not present' flag. This will only be set if the device 457ea8dc4b6Seschrock * was not present at the time of import. 458ea8dc4b6Seschrock */ 459ea8dc4b6Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 460ea8dc4b6Seschrock &vd->vdev_not_present); 461ea8dc4b6Seschrock 462ecc2d604Sbonwick /* 463ecc2d604Sbonwick * Get the alignment requirement. 464ecc2d604Sbonwick */ 465ecc2d604Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 466ecc2d604Sbonwick 467fa9e4066Sahrens /* 468fa9e4066Sahrens * If we're a top-level vdev, try to load the allocation parameters. 469fa9e4066Sahrens */ 470fa9e4066Sahrens if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { 471fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 472fa9e4066Sahrens &vd->vdev_ms_array); 473fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 474fa9e4066Sahrens &vd->vdev_ms_shift); 475fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 476fa9e4066Sahrens &vd->vdev_asize); 477fa9e4066Sahrens } 478fa9e4066Sahrens 479fa9e4066Sahrens /* 480ecc2d604Sbonwick * If we're a leaf vdev, try to load the DTL object and offline state. 481fa9e4066Sahrens */ 482fa9e4066Sahrens if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) { 483fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 484fa9e4066Sahrens &vd->vdev_dtl.smo_object); 485ecc2d604Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 486ecc2d604Sbonwick &vd->vdev_offline); 487fa9e4066Sahrens } 488fa9e4066Sahrens 489fa9e4066Sahrens /* 490fa9e4066Sahrens * Add ourselves to the parent's list of children. 491fa9e4066Sahrens */ 492fa9e4066Sahrens vdev_add_child(parent, vd); 493fa9e4066Sahrens 49499653d4eSeschrock *vdp = vd; 49599653d4eSeschrock 49699653d4eSeschrock return (0); 497fa9e4066Sahrens } 498fa9e4066Sahrens 499fa9e4066Sahrens void 500fa9e4066Sahrens vdev_free(vdev_t *vd) 501fa9e4066Sahrens { 502fa9e4066Sahrens int c; 503fa9e4066Sahrens 504fa9e4066Sahrens /* 505fa9e4066Sahrens * vdev_free() implies closing the vdev first. This is simpler than 506fa9e4066Sahrens * trying to ensure complicated semantics for all callers. 507fa9e4066Sahrens */ 508fa9e4066Sahrens vdev_close(vd); 509fa9e4066Sahrens 510ecc2d604Sbonwick ASSERT(!list_link_active(&vd->vdev_dirty_node)); 511fa9e4066Sahrens 512fa9e4066Sahrens /* 513fa9e4066Sahrens * Free all children. 514fa9e4066Sahrens */ 515fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 516fa9e4066Sahrens vdev_free(vd->vdev_child[c]); 517fa9e4066Sahrens 518fa9e4066Sahrens ASSERT(vd->vdev_child == NULL); 519fa9e4066Sahrens ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 520fa9e4066Sahrens 521fa9e4066Sahrens /* 522fa9e4066Sahrens * Discard allocation state. 523fa9e4066Sahrens */ 524fa9e4066Sahrens if (vd == vd->vdev_top) 525fa9e4066Sahrens vdev_metaslab_fini(vd); 526fa9e4066Sahrens 527fa9e4066Sahrens ASSERT3U(vd->vdev_stat.vs_space, ==, 0); 52899653d4eSeschrock ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); 529fa9e4066Sahrens ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 530fa9e4066Sahrens 531fa9e4066Sahrens /* 532fa9e4066Sahrens * Remove this vdev from its parent's child list. 533fa9e4066Sahrens */ 534fa9e4066Sahrens vdev_remove_child(vd->vdev_parent, vd); 535fa9e4066Sahrens 536fa9e4066Sahrens ASSERT(vd->vdev_parent == NULL); 537fa9e4066Sahrens 538fa9e4066Sahrens vdev_free_common(vd); 539fa9e4066Sahrens } 540fa9e4066Sahrens 541fa9e4066Sahrens /* 542fa9e4066Sahrens * Transfer top-level vdev state from svd to tvd. 543fa9e4066Sahrens */ 544fa9e4066Sahrens static void 545fa9e4066Sahrens vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 546fa9e4066Sahrens { 547fa9e4066Sahrens spa_t *spa = svd->vdev_spa; 548fa9e4066Sahrens metaslab_t *msp; 549fa9e4066Sahrens vdev_t *vd; 550fa9e4066Sahrens int t; 551fa9e4066Sahrens 552fa9e4066Sahrens ASSERT(tvd == tvd->vdev_top); 553fa9e4066Sahrens 554fa9e4066Sahrens tvd->vdev_ms_array = svd->vdev_ms_array; 555fa9e4066Sahrens tvd->vdev_ms_shift = svd->vdev_ms_shift; 556fa9e4066Sahrens tvd->vdev_ms_count = svd->vdev_ms_count; 557fa9e4066Sahrens 558fa9e4066Sahrens svd->vdev_ms_array = 0; 559fa9e4066Sahrens svd->vdev_ms_shift = 0; 560fa9e4066Sahrens svd->vdev_ms_count = 0; 561fa9e4066Sahrens 562fa9e4066Sahrens tvd->vdev_mg = svd->vdev_mg; 563fa9e4066Sahrens tvd->vdev_ms = svd->vdev_ms; 564fa9e4066Sahrens 565fa9e4066Sahrens svd->vdev_mg = NULL; 566fa9e4066Sahrens svd->vdev_ms = NULL; 567ecc2d604Sbonwick 568ecc2d604Sbonwick if (tvd->vdev_mg != NULL) 569ecc2d604Sbonwick tvd->vdev_mg->mg_vd = tvd; 570fa9e4066Sahrens 571fa9e4066Sahrens tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 572fa9e4066Sahrens tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 57399653d4eSeschrock tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 574fa9e4066Sahrens 575fa9e4066Sahrens svd->vdev_stat.vs_alloc = 0; 576fa9e4066Sahrens svd->vdev_stat.vs_space = 0; 57799653d4eSeschrock svd->vdev_stat.vs_dspace = 0; 578fa9e4066Sahrens 579fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) { 580fa9e4066Sahrens while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 581fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 582fa9e4066Sahrens while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 583fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 584fa9e4066Sahrens if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 585fa9e4066Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 586fa9e4066Sahrens } 587fa9e4066Sahrens 588ecc2d604Sbonwick if (list_link_active(&svd->vdev_dirty_node)) { 589fa9e4066Sahrens vdev_config_clean(svd); 590fa9e4066Sahrens vdev_config_dirty(tvd); 591fa9e4066Sahrens } 592fa9e4066Sahrens 593ea8dc4b6Seschrock tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; 594ea8dc4b6Seschrock svd->vdev_reopen_wanted = 0; 59599653d4eSeschrock 59699653d4eSeschrock tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 59799653d4eSeschrock svd->vdev_deflate_ratio = 0; 598fa9e4066Sahrens } 599fa9e4066Sahrens 600fa9e4066Sahrens static void 601fa9e4066Sahrens vdev_top_update(vdev_t *tvd, vdev_t *vd) 602fa9e4066Sahrens { 603fa9e4066Sahrens int c; 604fa9e4066Sahrens 605fa9e4066Sahrens if (vd == NULL) 606fa9e4066Sahrens return; 607fa9e4066Sahrens 608fa9e4066Sahrens vd->vdev_top = tvd; 609fa9e4066Sahrens 610fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 611fa9e4066Sahrens vdev_top_update(tvd, vd->vdev_child[c]); 612fa9e4066Sahrens } 613fa9e4066Sahrens 614fa9e4066Sahrens /* 615fa9e4066Sahrens * Add a mirror/replacing vdev above an existing vdev. 616fa9e4066Sahrens */ 617fa9e4066Sahrens vdev_t * 618fa9e4066Sahrens vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 619fa9e4066Sahrens { 620fa9e4066Sahrens spa_t *spa = cvd->vdev_spa; 621fa9e4066Sahrens vdev_t *pvd = cvd->vdev_parent; 622fa9e4066Sahrens vdev_t *mvd; 623fa9e4066Sahrens 624fa9e4066Sahrens ASSERT(spa_config_held(spa, RW_WRITER)); 625fa9e4066Sahrens 626fa9e4066Sahrens mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 627ecc2d604Sbonwick 628ecc2d604Sbonwick mvd->vdev_asize = cvd->vdev_asize; 629ecc2d604Sbonwick mvd->vdev_ashift = cvd->vdev_ashift; 630ecc2d604Sbonwick mvd->vdev_state = cvd->vdev_state; 631ecc2d604Sbonwick 632fa9e4066Sahrens vdev_remove_child(pvd, cvd); 633fa9e4066Sahrens vdev_add_child(pvd, mvd); 634fa9e4066Sahrens cvd->vdev_id = mvd->vdev_children; 635fa9e4066Sahrens vdev_add_child(mvd, cvd); 636fa9e4066Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 637fa9e4066Sahrens 638fa9e4066Sahrens if (mvd == mvd->vdev_top) 639fa9e4066Sahrens vdev_top_transfer(cvd, mvd); 640fa9e4066Sahrens 641fa9e4066Sahrens return (mvd); 642fa9e4066Sahrens } 643fa9e4066Sahrens 644fa9e4066Sahrens /* 645fa9e4066Sahrens * Remove a 1-way mirror/replacing vdev from the tree. 646fa9e4066Sahrens */ 647fa9e4066Sahrens void 648fa9e4066Sahrens vdev_remove_parent(vdev_t *cvd) 649fa9e4066Sahrens { 650fa9e4066Sahrens vdev_t *mvd = cvd->vdev_parent; 651fa9e4066Sahrens vdev_t *pvd = mvd->vdev_parent; 652fa9e4066Sahrens 653fa9e4066Sahrens ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); 654fa9e4066Sahrens 655fa9e4066Sahrens ASSERT(mvd->vdev_children == 1); 656fa9e4066Sahrens ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 65799653d4eSeschrock mvd->vdev_ops == &vdev_replacing_ops || 65899653d4eSeschrock mvd->vdev_ops == &vdev_spare_ops); 659ecc2d604Sbonwick cvd->vdev_ashift = mvd->vdev_ashift; 660fa9e4066Sahrens 661fa9e4066Sahrens vdev_remove_child(mvd, cvd); 662fa9e4066Sahrens vdev_remove_child(pvd, mvd); 663fa9e4066Sahrens cvd->vdev_id = mvd->vdev_id; 664fa9e4066Sahrens vdev_add_child(pvd, cvd); 66599653d4eSeschrock /* 66699653d4eSeschrock * If we created a new toplevel vdev, then we need to change the child's 66799653d4eSeschrock * vdev GUID to match the old toplevel vdev. Otherwise, we could have 66899653d4eSeschrock * detached an offline device, and when we go to import the pool we'll 66999653d4eSeschrock * think we have two toplevel vdevs, instead of a different version of 67099653d4eSeschrock * the same toplevel vdev. 67199653d4eSeschrock */ 67299653d4eSeschrock if (cvd->vdev_top == cvd) { 67399653d4eSeschrock pvd->vdev_guid_sum -= cvd->vdev_guid; 67499653d4eSeschrock cvd->vdev_guid_sum -= cvd->vdev_guid; 67599653d4eSeschrock cvd->vdev_guid = mvd->vdev_guid; 67699653d4eSeschrock cvd->vdev_guid_sum += mvd->vdev_guid; 67799653d4eSeschrock pvd->vdev_guid_sum += cvd->vdev_guid; 67899653d4eSeschrock } 679fa9e4066Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 680fa9e4066Sahrens 681fa9e4066Sahrens if (cvd == cvd->vdev_top) 682fa9e4066Sahrens vdev_top_transfer(mvd, cvd); 683fa9e4066Sahrens 684fa9e4066Sahrens ASSERT(mvd->vdev_children == 0); 685fa9e4066Sahrens vdev_free(mvd); 686fa9e4066Sahrens } 687fa9e4066Sahrens 688ea8dc4b6Seschrock int 689fa9e4066Sahrens vdev_metaslab_init(vdev_t *vd, uint64_t txg) 690fa9e4066Sahrens { 691fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 692ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 693fa9e4066Sahrens metaslab_class_t *mc = spa_metaslab_class_select(spa); 694ecc2d604Sbonwick uint64_t m; 695fa9e4066Sahrens uint64_t oldc = vd->vdev_ms_count; 696fa9e4066Sahrens uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 697ecc2d604Sbonwick metaslab_t **mspp; 698ecc2d604Sbonwick int error; 699fa9e4066Sahrens 7000e34b6a7Sbonwick if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ 7010e34b6a7Sbonwick return (0); 7020e34b6a7Sbonwick 703fa9e4066Sahrens dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); 704fa9e4066Sahrens 705fa9e4066Sahrens ASSERT(oldc <= newc); 706fa9e4066Sahrens 707ecc2d604Sbonwick if (vd->vdev_mg == NULL) 708ecc2d604Sbonwick vd->vdev_mg = metaslab_group_create(mc, vd); 709fa9e4066Sahrens 710ecc2d604Sbonwick mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 711fa9e4066Sahrens 712ecc2d604Sbonwick if (oldc != 0) { 713ecc2d604Sbonwick bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 714ecc2d604Sbonwick kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 715ecc2d604Sbonwick } 716fa9e4066Sahrens 717ecc2d604Sbonwick vd->vdev_ms = mspp; 718ecc2d604Sbonwick vd->vdev_ms_count = newc; 719fa9e4066Sahrens 720ecc2d604Sbonwick for (m = oldc; m < newc; m++) { 721ecc2d604Sbonwick space_map_obj_t smo = { 0, 0, 0 }; 722ecc2d604Sbonwick if (txg == 0) { 723ecc2d604Sbonwick uint64_t object = 0; 724ecc2d604Sbonwick error = dmu_read(mos, vd->vdev_ms_array, 725ecc2d604Sbonwick m * sizeof (uint64_t), sizeof (uint64_t), &object); 726ecc2d604Sbonwick if (error) 727ecc2d604Sbonwick return (error); 728ecc2d604Sbonwick if (object != 0) { 729ecc2d604Sbonwick dmu_buf_t *db; 730ecc2d604Sbonwick error = dmu_bonus_hold(mos, object, FTAG, &db); 731ecc2d604Sbonwick if (error) 732ecc2d604Sbonwick return (error); 733ecc2d604Sbonwick ASSERT3U(db->db_size, ==, sizeof (smo)); 734ecc2d604Sbonwick bcopy(db->db_data, &smo, db->db_size); 735ecc2d604Sbonwick ASSERT3U(smo.smo_object, ==, object); 736ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 737fa9e4066Sahrens } 738fa9e4066Sahrens } 739ecc2d604Sbonwick vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, 740ecc2d604Sbonwick m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); 741fa9e4066Sahrens } 742fa9e4066Sahrens 743ea8dc4b6Seschrock return (0); 744fa9e4066Sahrens } 745fa9e4066Sahrens 746fa9e4066Sahrens void 747fa9e4066Sahrens vdev_metaslab_fini(vdev_t *vd) 748fa9e4066Sahrens { 749fa9e4066Sahrens uint64_t m; 750fa9e4066Sahrens uint64_t count = vd->vdev_ms_count; 751fa9e4066Sahrens 752fa9e4066Sahrens if (vd->vdev_ms != NULL) { 753fa9e4066Sahrens for (m = 0; m < count; m++) 754ecc2d604Sbonwick if (vd->vdev_ms[m] != NULL) 755ecc2d604Sbonwick metaslab_fini(vd->vdev_ms[m]); 756fa9e4066Sahrens kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 757fa9e4066Sahrens vd->vdev_ms = NULL; 758fa9e4066Sahrens } 759fa9e4066Sahrens } 760fa9e4066Sahrens 761fa9e4066Sahrens /* 762fa9e4066Sahrens * Prepare a virtual device for access. 763fa9e4066Sahrens */ 764fa9e4066Sahrens int 765fa9e4066Sahrens vdev_open(vdev_t *vd) 766fa9e4066Sahrens { 767fa9e4066Sahrens int error; 768fa9e4066Sahrens int c; 769fa9e4066Sahrens uint64_t osize = 0; 770fa9e4066Sahrens uint64_t asize, psize; 771ecc2d604Sbonwick uint64_t ashift = 0; 772fa9e4066Sahrens 773fa9e4066Sahrens ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 774fa9e4066Sahrens vd->vdev_state == VDEV_STATE_CANT_OPEN || 775fa9e4066Sahrens vd->vdev_state == VDEV_STATE_OFFLINE); 776fa9e4066Sahrens 777fa9e4066Sahrens if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) 778fa9e4066Sahrens vd->vdev_fault_arg >>= 1; 779fa9e4066Sahrens else 780fa9e4066Sahrens vd->vdev_fault_mode = VDEV_FAULT_NONE; 781fa9e4066Sahrens 782fa9e4066Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 783fa9e4066Sahrens 784fa9e4066Sahrens if (vd->vdev_ops->vdev_op_leaf) { 785fa9e4066Sahrens vdev_cache_init(vd); 786fa9e4066Sahrens vdev_queue_init(vd); 787fa9e4066Sahrens vd->vdev_cache_active = B_TRUE; 788fa9e4066Sahrens } 789fa9e4066Sahrens 790fa9e4066Sahrens if (vd->vdev_offline) { 791fa9e4066Sahrens ASSERT(vd->vdev_children == 0); 792ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 793fa9e4066Sahrens return (ENXIO); 794fa9e4066Sahrens } 795fa9e4066Sahrens 796fa9e4066Sahrens error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); 797fa9e4066Sahrens 798ea8dc4b6Seschrock if (zio_injection_enabled && error == 0) 799ea8dc4b6Seschrock error = zio_handle_device_injection(vd, ENXIO); 800ea8dc4b6Seschrock 801fa9e4066Sahrens dprintf("%s = %d, osize %llu, state = %d\n", 802fa9e4066Sahrens vdev_description(vd), error, osize, vd->vdev_state); 803fa9e4066Sahrens 804fa9e4066Sahrens if (error) { 805ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 806fa9e4066Sahrens vd->vdev_stat.vs_aux); 807fa9e4066Sahrens return (error); 808fa9e4066Sahrens } 809fa9e4066Sahrens 810fa9e4066Sahrens vd->vdev_state = VDEV_STATE_HEALTHY; 811fa9e4066Sahrens 812fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 813ea8dc4b6Seschrock if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 814ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 815ea8dc4b6Seschrock VDEV_AUX_NONE); 816ea8dc4b6Seschrock break; 817ea8dc4b6Seschrock } 818fa9e4066Sahrens 819fa9e4066Sahrens osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 820fa9e4066Sahrens 821fa9e4066Sahrens if (vd->vdev_children == 0) { 822fa9e4066Sahrens if (osize < SPA_MINDEVSIZE) { 823ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 824ea8dc4b6Seschrock VDEV_AUX_TOO_SMALL); 825fa9e4066Sahrens return (EOVERFLOW); 826fa9e4066Sahrens } 827fa9e4066Sahrens psize = osize; 828fa9e4066Sahrens asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 829fa9e4066Sahrens } else { 830ecc2d604Sbonwick if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 831fa9e4066Sahrens (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 832ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 833ea8dc4b6Seschrock VDEV_AUX_TOO_SMALL); 834fa9e4066Sahrens return (EOVERFLOW); 835fa9e4066Sahrens } 836fa9e4066Sahrens psize = 0; 837fa9e4066Sahrens asize = osize; 838fa9e4066Sahrens } 839fa9e4066Sahrens 840fa9e4066Sahrens vd->vdev_psize = psize; 841fa9e4066Sahrens 842fa9e4066Sahrens if (vd->vdev_asize == 0) { 843fa9e4066Sahrens /* 844fa9e4066Sahrens * This is the first-ever open, so use the computed values. 845ecc2d604Sbonwick * For testing purposes, a higher ashift can be requested. 846fa9e4066Sahrens */ 847fa9e4066Sahrens vd->vdev_asize = asize; 848ecc2d604Sbonwick vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 849fa9e4066Sahrens } else { 850fa9e4066Sahrens /* 851fa9e4066Sahrens * Make sure the alignment requirement hasn't increased. 852fa9e4066Sahrens */ 853ecc2d604Sbonwick if (ashift > vd->vdev_top->vdev_ashift) { 854ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 855ea8dc4b6Seschrock VDEV_AUX_BAD_LABEL); 856fa9e4066Sahrens return (EINVAL); 857fa9e4066Sahrens } 858fa9e4066Sahrens 859fa9e4066Sahrens /* 860fa9e4066Sahrens * Make sure the device hasn't shrunk. 861fa9e4066Sahrens */ 862fa9e4066Sahrens if (asize < vd->vdev_asize) { 863ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 864ea8dc4b6Seschrock VDEV_AUX_BAD_LABEL); 865fa9e4066Sahrens return (EINVAL); 866fa9e4066Sahrens } 867fa9e4066Sahrens 868fa9e4066Sahrens /* 869fa9e4066Sahrens * If all children are healthy and the asize has increased, 870fa9e4066Sahrens * then we've experienced dynamic LUN growth. 871fa9e4066Sahrens */ 872fa9e4066Sahrens if (vd->vdev_state == VDEV_STATE_HEALTHY && 873fa9e4066Sahrens asize > vd->vdev_asize) { 874fa9e4066Sahrens vd->vdev_asize = asize; 875fa9e4066Sahrens } 876fa9e4066Sahrens } 877fa9e4066Sahrens 87899653d4eSeschrock /* 87999653d4eSeschrock * If this is a top-level vdev, compute the raidz-deflation 88099653d4eSeschrock * ratio. Note, we hard-code in 128k (1<<17) because it is the 88199653d4eSeschrock * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE 88299653d4eSeschrock * changes, this algorithm must never change, or we will 88399653d4eSeschrock * inconsistently account for existing bp's. 88499653d4eSeschrock */ 88599653d4eSeschrock if (vd->vdev_top == vd) { 88699653d4eSeschrock vd->vdev_deflate_ratio = (1<<17) / 88799653d4eSeschrock (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); 88899653d4eSeschrock } 88999653d4eSeschrock 890ea8dc4b6Seschrock /* 891ea8dc4b6Seschrock * This allows the ZFS DE to close cases appropriately. If a device 892ea8dc4b6Seschrock * goes away and later returns, we want to close the associated case. 893ea8dc4b6Seschrock * But it's not enough to simply post this only when a device goes from 894ea8dc4b6Seschrock * CANT_OPEN -> HEALTHY. If we reboot the system and the device is 895ea8dc4b6Seschrock * back, we also need to close the case (otherwise we will try to replay 896ea8dc4b6Seschrock * it). So we have to post this notifier every time. Since this only 897ea8dc4b6Seschrock * occurs during pool open or error recovery, this should not be an 898ea8dc4b6Seschrock * issue. 899ea8dc4b6Seschrock */ 900ea8dc4b6Seschrock zfs_post_ok(vd->vdev_spa, vd); 901ea8dc4b6Seschrock 902fa9e4066Sahrens return (0); 903fa9e4066Sahrens } 904fa9e4066Sahrens 905560e6e96Seschrock /* 906560e6e96Seschrock * Called once the vdevs are all opened, this routine validates the label 907560e6e96Seschrock * contents. This needs to be done before vdev_load() so that we don't 908560e6e96Seschrock * inadvertently do repair I/Os to the wrong device, and so that vdev_reopen() 909560e6e96Seschrock * won't succeed if the device has been changed underneath. 910560e6e96Seschrock * 911560e6e96Seschrock * This function will only return failure if one of the vdevs indicates that it 912560e6e96Seschrock * has since been destroyed or exported. This is only possible if 913560e6e96Seschrock * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 914560e6e96Seschrock * will be updated but the function will return 0. 915560e6e96Seschrock */ 916560e6e96Seschrock int 917560e6e96Seschrock vdev_validate(vdev_t *vd) 918560e6e96Seschrock { 919560e6e96Seschrock spa_t *spa = vd->vdev_spa; 920560e6e96Seschrock int c; 921560e6e96Seschrock nvlist_t *label; 922560e6e96Seschrock uint64_t guid; 923560e6e96Seschrock uint64_t state; 924560e6e96Seschrock 925560e6e96Seschrock for (c = 0; c < vd->vdev_children; c++) 926560e6e96Seschrock if (vdev_validate(vd->vdev_child[c]) != 0) 927560e6e96Seschrock return (-1); 928560e6e96Seschrock 929b5989ec7Seschrock /* 930b5989ec7Seschrock * If the device has already failed, or was marked offline, don't do 931b5989ec7Seschrock * any further validation. Otherwise, label I/O will fail and we will 932b5989ec7Seschrock * overwrite the previous state. 933b5989ec7Seschrock */ 934b5989ec7Seschrock if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) { 935560e6e96Seschrock 936560e6e96Seschrock if ((label = vdev_label_read_config(vd)) == NULL) { 937560e6e96Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 938560e6e96Seschrock VDEV_AUX_BAD_LABEL); 939560e6e96Seschrock return (0); 940560e6e96Seschrock } 941560e6e96Seschrock 942560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, 943560e6e96Seschrock &guid) != 0 || guid != spa_guid(spa)) { 944560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 945560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 946560e6e96Seschrock nvlist_free(label); 947560e6e96Seschrock return (0); 948560e6e96Seschrock } 949560e6e96Seschrock 950560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 951560e6e96Seschrock &guid) != 0 || guid != vd->vdev_guid) { 952560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 953560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 954560e6e96Seschrock nvlist_free(label); 955560e6e96Seschrock return (0); 956560e6e96Seschrock } 957560e6e96Seschrock 958560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 959560e6e96Seschrock &state) != 0) { 960560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 961560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 962560e6e96Seschrock nvlist_free(label); 963560e6e96Seschrock return (0); 964560e6e96Seschrock } 965560e6e96Seschrock 966560e6e96Seschrock nvlist_free(label); 967560e6e96Seschrock 968560e6e96Seschrock if (spa->spa_load_state == SPA_LOAD_OPEN && 969560e6e96Seschrock state != POOL_STATE_ACTIVE) 970560e6e96Seschrock return (-1); 971560e6e96Seschrock } 972560e6e96Seschrock 973560e6e96Seschrock /* 974560e6e96Seschrock * If we were able to open and validate a vdev that was previously 975560e6e96Seschrock * marked permanently unavailable, clear that state now. 976560e6e96Seschrock */ 977560e6e96Seschrock if (vd->vdev_not_present) 978560e6e96Seschrock vd->vdev_not_present = 0; 979560e6e96Seschrock 980560e6e96Seschrock return (0); 981560e6e96Seschrock } 982560e6e96Seschrock 983fa9e4066Sahrens /* 984fa9e4066Sahrens * Close a virtual device. 985fa9e4066Sahrens */ 986fa9e4066Sahrens void 987fa9e4066Sahrens vdev_close(vdev_t *vd) 988fa9e4066Sahrens { 989fa9e4066Sahrens vd->vdev_ops->vdev_op_close(vd); 990fa9e4066Sahrens 991fa9e4066Sahrens if (vd->vdev_cache_active) { 992fa9e4066Sahrens vdev_cache_fini(vd); 993fa9e4066Sahrens vdev_queue_fini(vd); 994fa9e4066Sahrens vd->vdev_cache_active = B_FALSE; 995fa9e4066Sahrens } 996fa9e4066Sahrens 997560e6e96Seschrock /* 998560e6e96Seschrock * We record the previous state before we close it, so that if we are 999560e6e96Seschrock * doing a reopen(), we don't generate FMA ereports if we notice that 1000560e6e96Seschrock * it's still faulted. 1001560e6e96Seschrock */ 1002560e6e96Seschrock vd->vdev_prevstate = vd->vdev_state; 1003560e6e96Seschrock 1004fa9e4066Sahrens if (vd->vdev_offline) 1005fa9e4066Sahrens vd->vdev_state = VDEV_STATE_OFFLINE; 1006fa9e4066Sahrens else 1007fa9e4066Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 1008ea8dc4b6Seschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1009fa9e4066Sahrens } 1010fa9e4066Sahrens 1011fa9e4066Sahrens void 1012ea8dc4b6Seschrock vdev_reopen(vdev_t *vd) 1013fa9e4066Sahrens { 1014ea8dc4b6Seschrock spa_t *spa = vd->vdev_spa; 1015fa9e4066Sahrens 1016ea8dc4b6Seschrock ASSERT(spa_config_held(spa, RW_WRITER)); 1017ea8dc4b6Seschrock 1018fa9e4066Sahrens vdev_close(vd); 1019fa9e4066Sahrens (void) vdev_open(vd); 1020fa9e4066Sahrens 102139c23413Seschrock /* 102239c23413Seschrock * Call vdev_validate() here to make sure we have the same device. 102339c23413Seschrock * Otherwise, a device with an invalid label could be successfully 102439c23413Seschrock * opened in response to vdev_reopen(). 102539c23413Seschrock * 102639c23413Seschrock * The downside to this is that if the user is simply experimenting by 102739c23413Seschrock * overwriting an entire disk, we'll fault the device rather than 102839c23413Seschrock * demonstrate self-healing capabilities. On the other hand, with 102939c23413Seschrock * proper FMA integration, the series of errors we'd see from the device 103039c23413Seschrock * would result in a faulted device anyway. Given that this doesn't 103139c23413Seschrock * model any real-world corruption, it's better to catch this here and 103239c23413Seschrock * correctly identify that the device has either changed beneath us, or 103339c23413Seschrock * is corrupted beyond recognition. 103439c23413Seschrock */ 103539c23413Seschrock (void) vdev_validate(vd); 103639c23413Seschrock 1037fa9e4066Sahrens /* 1038fa9e4066Sahrens * Reassess root vdev's health. 1039fa9e4066Sahrens */ 104044cd46caSbillm vdev_propagate_state(spa->spa_root_vdev); 1041fa9e4066Sahrens } 1042fa9e4066Sahrens 1043fa9e4066Sahrens int 104499653d4eSeschrock vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1045fa9e4066Sahrens { 1046fa9e4066Sahrens int error; 1047fa9e4066Sahrens 1048fa9e4066Sahrens /* 1049fa9e4066Sahrens * Normally, partial opens (e.g. of a mirror) are allowed. 1050fa9e4066Sahrens * For a create, however, we want to fail the request if 1051fa9e4066Sahrens * there are any components we can't open. 1052fa9e4066Sahrens */ 1053fa9e4066Sahrens error = vdev_open(vd); 1054fa9e4066Sahrens 1055fa9e4066Sahrens if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1056fa9e4066Sahrens vdev_close(vd); 1057fa9e4066Sahrens return (error ? error : ENXIO); 1058fa9e4066Sahrens } 1059fa9e4066Sahrens 1060fa9e4066Sahrens /* 1061fa9e4066Sahrens * Recursively initialize all labels. 1062fa9e4066Sahrens */ 106339c23413Seschrock if ((error = vdev_label_init(vd, txg, isreplacing ? 106439c23413Seschrock VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1065fa9e4066Sahrens vdev_close(vd); 1066fa9e4066Sahrens return (error); 1067fa9e4066Sahrens } 1068fa9e4066Sahrens 1069fa9e4066Sahrens return (0); 1070fa9e4066Sahrens } 1071fa9e4066Sahrens 1072fa9e4066Sahrens /* 1073fa9e4066Sahrens * The is the latter half of vdev_create(). It is distinct because it 1074fa9e4066Sahrens * involves initiating transactions in order to do metaslab creation. 1075fa9e4066Sahrens * For creation, we want to try to create all vdevs at once and then undo it 1076fa9e4066Sahrens * if anything fails; this is much harder if we have pending transactions. 1077fa9e4066Sahrens */ 10780e34b6a7Sbonwick void 1079fa9e4066Sahrens vdev_init(vdev_t *vd, uint64_t txg) 1080fa9e4066Sahrens { 1081fa9e4066Sahrens /* 1082fa9e4066Sahrens * Aim for roughly 200 metaslabs per vdev. 1083fa9e4066Sahrens */ 1084fa9e4066Sahrens vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); 1085fa9e4066Sahrens vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1086fa9e4066Sahrens 1087fa9e4066Sahrens /* 10880e34b6a7Sbonwick * Initialize the vdev's metaslabs. This can't fail because 10890e34b6a7Sbonwick * there's nothing to read when creating all new metaslabs. 1090fa9e4066Sahrens */ 10910e34b6a7Sbonwick VERIFY(vdev_metaslab_init(vd, txg) == 0); 1092fa9e4066Sahrens } 1093fa9e4066Sahrens 1094fa9e4066Sahrens void 1095ecc2d604Sbonwick vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1096fa9e4066Sahrens { 1097ecc2d604Sbonwick ASSERT(vd == vd->vdev_top); 1098ecc2d604Sbonwick ASSERT(ISP2(flags)); 1099fa9e4066Sahrens 1100ecc2d604Sbonwick if (flags & VDD_METASLAB) 1101ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1102ecc2d604Sbonwick 1103ecc2d604Sbonwick if (flags & VDD_DTL) 1104ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1105ecc2d604Sbonwick 1106ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1107fa9e4066Sahrens } 1108fa9e4066Sahrens 1109fa9e4066Sahrens void 1110fa9e4066Sahrens vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) 1111fa9e4066Sahrens { 1112fa9e4066Sahrens mutex_enter(sm->sm_lock); 1113fa9e4066Sahrens if (!space_map_contains(sm, txg, size)) 1114fa9e4066Sahrens space_map_add(sm, txg, size); 1115fa9e4066Sahrens mutex_exit(sm->sm_lock); 1116fa9e4066Sahrens } 1117fa9e4066Sahrens 1118fa9e4066Sahrens int 1119fa9e4066Sahrens vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) 1120fa9e4066Sahrens { 1121fa9e4066Sahrens int dirty; 1122fa9e4066Sahrens 1123fa9e4066Sahrens /* 1124fa9e4066Sahrens * Quick test without the lock -- covers the common case that 1125fa9e4066Sahrens * there are no dirty time segments. 1126fa9e4066Sahrens */ 1127fa9e4066Sahrens if (sm->sm_space == 0) 1128fa9e4066Sahrens return (0); 1129fa9e4066Sahrens 1130fa9e4066Sahrens mutex_enter(sm->sm_lock); 1131fa9e4066Sahrens dirty = space_map_contains(sm, txg, size); 1132fa9e4066Sahrens mutex_exit(sm->sm_lock); 1133fa9e4066Sahrens 1134fa9e4066Sahrens return (dirty); 1135fa9e4066Sahrens } 1136fa9e4066Sahrens 1137fa9e4066Sahrens /* 1138fa9e4066Sahrens * Reassess DTLs after a config change or scrub completion. 1139fa9e4066Sahrens */ 1140fa9e4066Sahrens void 1141fa9e4066Sahrens vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 1142fa9e4066Sahrens { 1143ea8dc4b6Seschrock spa_t *spa = vd->vdev_spa; 1144fa9e4066Sahrens int c; 1145fa9e4066Sahrens 1146ea8dc4b6Seschrock ASSERT(spa_config_held(spa, RW_WRITER)); 1147fa9e4066Sahrens 1148fa9e4066Sahrens if (vd->vdev_children == 0) { 1149fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1150fa9e4066Sahrens /* 1151fa9e4066Sahrens * We're successfully scrubbed everything up to scrub_txg. 1152fa9e4066Sahrens * Therefore, excise all old DTLs up to that point, then 1153fa9e4066Sahrens * fold in the DTLs for everything we couldn't scrub. 1154fa9e4066Sahrens */ 1155fa9e4066Sahrens if (scrub_txg != 0) { 1156fa9e4066Sahrens space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); 1157fa9e4066Sahrens space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); 1158fa9e4066Sahrens } 1159fa9e4066Sahrens if (scrub_done) 1160fa9e4066Sahrens space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 1161fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1162ecc2d604Sbonwick if (txg != 0) 1163ecc2d604Sbonwick vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1164fa9e4066Sahrens return; 1165fa9e4066Sahrens } 1166fa9e4066Sahrens 1167ea8dc4b6Seschrock /* 1168ea8dc4b6Seschrock * Make sure the DTLs are always correct under the scrub lock. 1169ea8dc4b6Seschrock */ 1170ea8dc4b6Seschrock if (vd == spa->spa_root_vdev) 1171ea8dc4b6Seschrock mutex_enter(&spa->spa_scrub_lock); 1172ea8dc4b6Seschrock 1173fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1174fa9e4066Sahrens space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); 1175fa9e4066Sahrens space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 1176fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1177fa9e4066Sahrens 1178fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) { 1179fa9e4066Sahrens vdev_t *cvd = vd->vdev_child[c]; 1180fa9e4066Sahrens vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); 1181fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1182fa9e4066Sahrens space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); 1183fa9e4066Sahrens space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); 1184fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1185fa9e4066Sahrens } 1186ea8dc4b6Seschrock 1187ea8dc4b6Seschrock if (vd == spa->spa_root_vdev) 1188ea8dc4b6Seschrock mutex_exit(&spa->spa_scrub_lock); 1189fa9e4066Sahrens } 1190fa9e4066Sahrens 1191fa9e4066Sahrens static int 1192fa9e4066Sahrens vdev_dtl_load(vdev_t *vd) 1193fa9e4066Sahrens { 1194fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 1195fa9e4066Sahrens space_map_obj_t *smo = &vd->vdev_dtl; 1196ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 1197fa9e4066Sahrens dmu_buf_t *db; 1198fa9e4066Sahrens int error; 1199fa9e4066Sahrens 1200fa9e4066Sahrens ASSERT(vd->vdev_children == 0); 1201fa9e4066Sahrens 1202fa9e4066Sahrens if (smo->smo_object == 0) 1203fa9e4066Sahrens return (0); 1204fa9e4066Sahrens 1205ecc2d604Sbonwick if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) 1206ea8dc4b6Seschrock return (error); 1207ecc2d604Sbonwick 1208fa9e4066Sahrens ASSERT3U(db->db_size, ==, sizeof (*smo)); 1209fa9e4066Sahrens bcopy(db->db_data, smo, db->db_size); 1210ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 1211fa9e4066Sahrens 1212fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1213ecc2d604Sbonwick error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); 1214fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1215fa9e4066Sahrens 1216fa9e4066Sahrens return (error); 1217fa9e4066Sahrens } 1218fa9e4066Sahrens 1219fa9e4066Sahrens void 1220fa9e4066Sahrens vdev_dtl_sync(vdev_t *vd, uint64_t txg) 1221fa9e4066Sahrens { 1222fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 1223fa9e4066Sahrens space_map_obj_t *smo = &vd->vdev_dtl; 1224fa9e4066Sahrens space_map_t *sm = &vd->vdev_dtl_map; 1225ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 1226fa9e4066Sahrens space_map_t smsync; 1227fa9e4066Sahrens kmutex_t smlock; 1228fa9e4066Sahrens dmu_buf_t *db; 1229fa9e4066Sahrens dmu_tx_t *tx; 1230fa9e4066Sahrens 1231fa9e4066Sahrens dprintf("%s in txg %llu pass %d\n", 1232fa9e4066Sahrens vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); 1233fa9e4066Sahrens 1234fa9e4066Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1235fa9e4066Sahrens 1236fa9e4066Sahrens if (vd->vdev_detached) { 1237fa9e4066Sahrens if (smo->smo_object != 0) { 1238ecc2d604Sbonwick int err = dmu_object_free(mos, smo->smo_object, tx); 1239fa9e4066Sahrens ASSERT3U(err, ==, 0); 1240fa9e4066Sahrens smo->smo_object = 0; 1241fa9e4066Sahrens } 1242fa9e4066Sahrens dmu_tx_commit(tx); 1243ecc2d604Sbonwick dprintf("detach %s committed in txg %llu\n", 1244ecc2d604Sbonwick vdev_description(vd), txg); 1245fa9e4066Sahrens return; 1246fa9e4066Sahrens } 1247fa9e4066Sahrens 1248fa9e4066Sahrens if (smo->smo_object == 0) { 1249fa9e4066Sahrens ASSERT(smo->smo_objsize == 0); 1250fa9e4066Sahrens ASSERT(smo->smo_alloc == 0); 1251ecc2d604Sbonwick smo->smo_object = dmu_object_alloc(mos, 1252fa9e4066Sahrens DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 1253fa9e4066Sahrens DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 1254fa9e4066Sahrens ASSERT(smo->smo_object != 0); 1255fa9e4066Sahrens vdev_config_dirty(vd->vdev_top); 1256fa9e4066Sahrens } 1257fa9e4066Sahrens 1258fa9e4066Sahrens mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); 1259fa9e4066Sahrens 1260fa9e4066Sahrens space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, 1261fa9e4066Sahrens &smlock); 1262fa9e4066Sahrens 1263fa9e4066Sahrens mutex_enter(&smlock); 1264fa9e4066Sahrens 1265fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1266ecc2d604Sbonwick space_map_walk(sm, space_map_add, &smsync); 1267fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 1268fa9e4066Sahrens 1269ecc2d604Sbonwick space_map_truncate(smo, mos, tx); 1270ecc2d604Sbonwick space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); 1271fa9e4066Sahrens 1272fa9e4066Sahrens space_map_destroy(&smsync); 1273fa9e4066Sahrens 1274fa9e4066Sahrens mutex_exit(&smlock); 1275fa9e4066Sahrens mutex_destroy(&smlock); 1276fa9e4066Sahrens 1277ecc2d604Sbonwick VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1278fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 1279fa9e4066Sahrens ASSERT3U(db->db_size, ==, sizeof (*smo)); 1280fa9e4066Sahrens bcopy(smo, db->db_data, db->db_size); 1281ea8dc4b6Seschrock dmu_buf_rele(db, FTAG); 1282fa9e4066Sahrens 1283fa9e4066Sahrens dmu_tx_commit(tx); 1284fa9e4066Sahrens } 1285fa9e4066Sahrens 1286560e6e96Seschrock void 1287ea8dc4b6Seschrock vdev_load(vdev_t *vd) 1288fa9e4066Sahrens { 1289560e6e96Seschrock int c; 1290fa9e4066Sahrens 1291fa9e4066Sahrens /* 1292fa9e4066Sahrens * Recursively load all children. 1293fa9e4066Sahrens */ 1294fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 1295560e6e96Seschrock vdev_load(vd->vdev_child[c]); 1296fa9e4066Sahrens 1297fa9e4066Sahrens /* 12980e34b6a7Sbonwick * If this is a top-level vdev, initialize its metaslabs. 1299fa9e4066Sahrens */ 1300560e6e96Seschrock if (vd == vd->vdev_top && 1301560e6e96Seschrock (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 1302560e6e96Seschrock vdev_metaslab_init(vd, 0) != 0)) 1303560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1304560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1305fa9e4066Sahrens 1306fa9e4066Sahrens /* 1307fa9e4066Sahrens * If this is a leaf vdev, load its DTL. 1308fa9e4066Sahrens */ 1309560e6e96Seschrock if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 1310560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1311560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1312fa9e4066Sahrens } 1313fa9e4066Sahrens 131499653d4eSeschrock /* 131599653d4eSeschrock * This special case of vdev_spare() is used for hot spares. It's sole purpose 131699653d4eSeschrock * it to set the vdev state for the associated vdev. To do this, we make sure 131799653d4eSeschrock * that we can open the underlying device, then try to read the label, and make 131899653d4eSeschrock * sure that the label is sane and that it hasn't been repurposed to another 131999653d4eSeschrock * pool. 132099653d4eSeschrock */ 132199653d4eSeschrock int 132299653d4eSeschrock vdev_validate_spare(vdev_t *vd) 132399653d4eSeschrock { 132499653d4eSeschrock nvlist_t *label; 132599653d4eSeschrock uint64_t guid, version; 132699653d4eSeschrock uint64_t state; 132799653d4eSeschrock 132899653d4eSeschrock if ((label = vdev_label_read_config(vd)) == NULL) { 132999653d4eSeschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 133099653d4eSeschrock VDEV_AUX_CORRUPT_DATA); 133199653d4eSeschrock return (-1); 133299653d4eSeschrock } 133399653d4eSeschrock 133499653d4eSeschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 133599653d4eSeschrock version > ZFS_VERSION || 133699653d4eSeschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 133799653d4eSeschrock guid != vd->vdev_guid || 133899653d4eSeschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 133999653d4eSeschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 134099653d4eSeschrock VDEV_AUX_CORRUPT_DATA); 134199653d4eSeschrock nvlist_free(label); 134299653d4eSeschrock return (-1); 134399653d4eSeschrock } 134499653d4eSeschrock 134539c23413Seschrock spa_spare_add(vd); 134639c23413Seschrock 134799653d4eSeschrock /* 134899653d4eSeschrock * We don't actually check the pool state here. If it's in fact in 134999653d4eSeschrock * use by another pool, we update this fact on the fly when requested. 135099653d4eSeschrock */ 135199653d4eSeschrock nvlist_free(label); 135299653d4eSeschrock return (0); 135399653d4eSeschrock } 135499653d4eSeschrock 1355fa9e4066Sahrens void 1356fa9e4066Sahrens vdev_sync_done(vdev_t *vd, uint64_t txg) 1357fa9e4066Sahrens { 1358fa9e4066Sahrens metaslab_t *msp; 1359fa9e4066Sahrens 1360fa9e4066Sahrens dprintf("%s txg %llu\n", vdev_description(vd), txg); 1361fa9e4066Sahrens 1362fa9e4066Sahrens while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 1363fa9e4066Sahrens metaslab_sync_done(msp, txg); 1364fa9e4066Sahrens } 1365fa9e4066Sahrens 1366fa9e4066Sahrens void 1367fa9e4066Sahrens vdev_sync(vdev_t *vd, uint64_t txg) 1368fa9e4066Sahrens { 1369fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 1370fa9e4066Sahrens vdev_t *lvd; 1371fa9e4066Sahrens metaslab_t *msp; 1372ecc2d604Sbonwick dmu_tx_t *tx; 1373fa9e4066Sahrens 1374fa9e4066Sahrens dprintf("%s txg %llu pass %d\n", 1375fa9e4066Sahrens vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); 1376fa9e4066Sahrens 1377ecc2d604Sbonwick if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 1378ecc2d604Sbonwick ASSERT(vd == vd->vdev_top); 1379ecc2d604Sbonwick tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1380ecc2d604Sbonwick vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 1381ecc2d604Sbonwick DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 1382ecc2d604Sbonwick ASSERT(vd->vdev_ms_array != 0); 1383ecc2d604Sbonwick vdev_config_dirty(vd); 1384ecc2d604Sbonwick dmu_tx_commit(tx); 1385ecc2d604Sbonwick } 1386fa9e4066Sahrens 1387ecc2d604Sbonwick while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 1388fa9e4066Sahrens metaslab_sync(msp, txg); 1389ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 1390ecc2d604Sbonwick } 1391fa9e4066Sahrens 1392fa9e4066Sahrens while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 1393fa9e4066Sahrens vdev_dtl_sync(lvd, txg); 1394fa9e4066Sahrens 1395fa9e4066Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 1396fa9e4066Sahrens } 1397fa9e4066Sahrens 1398fa9e4066Sahrens uint64_t 1399fa9e4066Sahrens vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 1400fa9e4066Sahrens { 1401fa9e4066Sahrens return (vd->vdev_ops->vdev_op_asize(vd, psize)); 1402fa9e4066Sahrens } 1403fa9e4066Sahrens 1404fa9e4066Sahrens void 1405fa9e4066Sahrens vdev_io_start(zio_t *zio) 1406fa9e4066Sahrens { 1407fa9e4066Sahrens zio->io_vd->vdev_ops->vdev_op_io_start(zio); 1408fa9e4066Sahrens } 1409fa9e4066Sahrens 1410fa9e4066Sahrens void 1411fa9e4066Sahrens vdev_io_done(zio_t *zio) 1412fa9e4066Sahrens { 1413fa9e4066Sahrens zio->io_vd->vdev_ops->vdev_op_io_done(zio); 1414fa9e4066Sahrens } 1415fa9e4066Sahrens 1416fa9e4066Sahrens const char * 1417fa9e4066Sahrens vdev_description(vdev_t *vd) 1418fa9e4066Sahrens { 1419fa9e4066Sahrens if (vd == NULL || vd->vdev_ops == NULL) 1420fa9e4066Sahrens return ("<unknown>"); 1421fa9e4066Sahrens 1422fa9e4066Sahrens if (vd->vdev_path != NULL) 1423fa9e4066Sahrens return (vd->vdev_path); 1424fa9e4066Sahrens 1425fa9e4066Sahrens if (vd->vdev_parent == NULL) 1426fa9e4066Sahrens return (spa_name(vd->vdev_spa)); 1427fa9e4066Sahrens 1428fa9e4066Sahrens return (vd->vdev_ops->vdev_op_type); 1429fa9e4066Sahrens } 1430fa9e4066Sahrens 1431fa9e4066Sahrens int 1432ea8dc4b6Seschrock vdev_online(spa_t *spa, uint64_t guid) 1433fa9e4066Sahrens { 1434441d80aaSlling vdev_t *rvd, *vd; 1435441d80aaSlling uint64_t txg; 1436fa9e4066Sahrens 1437441d80aaSlling txg = spa_vdev_enter(spa); 1438fa9e4066Sahrens 1439441d80aaSlling rvd = spa->spa_root_vdev; 14400e34b6a7Sbonwick 1441ea8dc4b6Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1442441d80aaSlling return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1443fa9e4066Sahrens 14440e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 14450e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 14460e34b6a7Sbonwick 1447fa9e4066Sahrens dprintf("ONLINE: %s\n", vdev_description(vd)); 1448fa9e4066Sahrens 1449fa9e4066Sahrens vd->vdev_offline = B_FALSE; 1450441d80aaSlling vd->vdev_tmpoffline = B_FALSE; 1451ea8dc4b6Seschrock vdev_reopen(vd->vdev_top); 1452fa9e4066Sahrens 1453441d80aaSlling vdev_config_dirty(vd->vdev_top); 1454441d80aaSlling 1455441d80aaSlling (void) spa_vdev_exit(spa, NULL, txg, 0); 1456fa9e4066Sahrens 1457fa9e4066Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1458fa9e4066Sahrens 1459fa9e4066Sahrens return (0); 1460fa9e4066Sahrens } 1461fa9e4066Sahrens 1462fa9e4066Sahrens int 1463ea8dc4b6Seschrock vdev_offline(spa_t *spa, uint64_t guid, int istmp) 1464fa9e4066Sahrens { 1465441d80aaSlling vdev_t *rvd, *vd; 1466441d80aaSlling uint64_t txg; 1467fa9e4066Sahrens 1468441d80aaSlling txg = spa_vdev_enter(spa); 1469fa9e4066Sahrens 1470441d80aaSlling rvd = spa->spa_root_vdev; 14710e34b6a7Sbonwick 1472ea8dc4b6Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 1473441d80aaSlling return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1474fa9e4066Sahrens 14750e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 14760e34b6a7Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 14770e34b6a7Sbonwick 1478fa9e4066Sahrens dprintf("OFFLINE: %s\n", vdev_description(vd)); 1479fa9e4066Sahrens 1480fa9e4066Sahrens /* 1481ecc2d604Sbonwick * If the device isn't already offline, try to offline it. 1482fa9e4066Sahrens */ 1483ecc2d604Sbonwick if (!vd->vdev_offline) { 1484ecc2d604Sbonwick /* 1485ecc2d604Sbonwick * If this device's top-level vdev has a non-empty DTL, 1486ecc2d604Sbonwick * don't allow the device to be offlined. 1487ecc2d604Sbonwick * 1488ecc2d604Sbonwick * XXX -- make this more precise by allowing the offline 1489ecc2d604Sbonwick * as long as the remaining devices don't have any DTL holes. 1490ecc2d604Sbonwick */ 1491ecc2d604Sbonwick if (vd->vdev_top->vdev_dtl_map.sm_space != 0) 1492ecc2d604Sbonwick return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1493fa9e4066Sahrens 1494ecc2d604Sbonwick /* 1495ecc2d604Sbonwick * Offline this device and reopen its top-level vdev. 1496ecc2d604Sbonwick * If this action results in the top-level vdev becoming 1497ecc2d604Sbonwick * unusable, undo it and fail the request. 1498ecc2d604Sbonwick */ 1499ecc2d604Sbonwick vd->vdev_offline = B_TRUE; 1500ea8dc4b6Seschrock vdev_reopen(vd->vdev_top); 1501ecc2d604Sbonwick if (vdev_is_dead(vd->vdev_top)) { 1502ecc2d604Sbonwick vd->vdev_offline = B_FALSE; 1503ecc2d604Sbonwick vdev_reopen(vd->vdev_top); 1504ecc2d604Sbonwick return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1505ecc2d604Sbonwick } 1506fa9e4066Sahrens } 1507fa9e4066Sahrens 1508441d80aaSlling vd->vdev_tmpoffline = istmp; 1509ecc2d604Sbonwick 1510ecc2d604Sbonwick vdev_config_dirty(vd->vdev_top); 1511441d80aaSlling 1512441d80aaSlling return (spa_vdev_exit(spa, NULL, txg, 0)); 1513fa9e4066Sahrens } 1514fa9e4066Sahrens 1515ea8dc4b6Seschrock /* 1516ea8dc4b6Seschrock * Clear the error counts associated with this vdev. Unlike vdev_online() and 1517ea8dc4b6Seschrock * vdev_offline(), we assume the spa config is locked. We also clear all 1518ea8dc4b6Seschrock * children. If 'vd' is NULL, then the user wants to clear all vdevs. 1519ea8dc4b6Seschrock */ 1520ea8dc4b6Seschrock void 1521ea8dc4b6Seschrock vdev_clear(spa_t *spa, vdev_t *vd) 1522fa9e4066Sahrens { 1523ea8dc4b6Seschrock int c; 1524fa9e4066Sahrens 1525ea8dc4b6Seschrock if (vd == NULL) 1526ea8dc4b6Seschrock vd = spa->spa_root_vdev; 1527fa9e4066Sahrens 1528ea8dc4b6Seschrock vd->vdev_stat.vs_read_errors = 0; 1529ea8dc4b6Seschrock vd->vdev_stat.vs_write_errors = 0; 1530ea8dc4b6Seschrock vd->vdev_stat.vs_checksum_errors = 0; 1531fa9e4066Sahrens 1532ea8dc4b6Seschrock for (c = 0; c < vd->vdev_children; c++) 1533ea8dc4b6Seschrock vdev_clear(spa, vd->vdev_child[c]); 1534fa9e4066Sahrens } 1535fa9e4066Sahrens 1536fa9e4066Sahrens int 1537fa9e4066Sahrens vdev_is_dead(vdev_t *vd) 1538fa9e4066Sahrens { 1539fa9e4066Sahrens return (vd->vdev_state <= VDEV_STATE_CANT_OPEN); 1540fa9e4066Sahrens } 1541fa9e4066Sahrens 1542fa9e4066Sahrens int 1543fa9e4066Sahrens vdev_error_inject(vdev_t *vd, zio_t *zio) 1544fa9e4066Sahrens { 1545fa9e4066Sahrens int error = 0; 1546fa9e4066Sahrens 1547fa9e4066Sahrens if (vd->vdev_fault_mode == VDEV_FAULT_NONE) 1548fa9e4066Sahrens return (0); 1549fa9e4066Sahrens 1550fa9e4066Sahrens if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) 1551fa9e4066Sahrens return (0); 1552fa9e4066Sahrens 1553fa9e4066Sahrens switch (vd->vdev_fault_mode) { 1554fa9e4066Sahrens case VDEV_FAULT_RANDOM: 1555fa9e4066Sahrens if (spa_get_random(vd->vdev_fault_arg) == 0) 1556fa9e4066Sahrens error = EIO; 1557fa9e4066Sahrens break; 1558fa9e4066Sahrens 1559fa9e4066Sahrens case VDEV_FAULT_COUNT: 1560fa9e4066Sahrens if ((int64_t)--vd->vdev_fault_arg <= 0) 1561fa9e4066Sahrens vd->vdev_fault_mode = VDEV_FAULT_NONE; 1562fa9e4066Sahrens error = EIO; 1563fa9e4066Sahrens break; 1564fa9e4066Sahrens } 1565fa9e4066Sahrens 1566fa9e4066Sahrens if (error != 0) { 1567fa9e4066Sahrens dprintf("returning %d for type %d on %s state %d offset %llx\n", 1568fa9e4066Sahrens error, zio->io_type, vdev_description(vd), 1569fa9e4066Sahrens vd->vdev_state, zio->io_offset); 1570fa9e4066Sahrens } 1571fa9e4066Sahrens 1572fa9e4066Sahrens return (error); 1573fa9e4066Sahrens } 1574fa9e4066Sahrens 1575fa9e4066Sahrens /* 1576fa9e4066Sahrens * Get statistics for the given vdev. 1577fa9e4066Sahrens */ 1578fa9e4066Sahrens void 1579fa9e4066Sahrens vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 1580fa9e4066Sahrens { 1581fa9e4066Sahrens vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 1582fa9e4066Sahrens int c, t; 1583fa9e4066Sahrens 1584fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1585fa9e4066Sahrens bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 1586fa9e4066Sahrens vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 1587fa9e4066Sahrens vs->vs_state = vd->vdev_state; 15882a79c5feSlling vs->vs_rsize = vdev_get_rsize(vd); 1589fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1590fa9e4066Sahrens 1591fa9e4066Sahrens /* 1592fa9e4066Sahrens * If we're getting stats on the root vdev, aggregate the I/O counts 1593fa9e4066Sahrens * over all top-level vdevs (i.e. the direct children of the root). 1594fa9e4066Sahrens */ 1595fa9e4066Sahrens if (vd == rvd) { 1596fa9e4066Sahrens for (c = 0; c < rvd->vdev_children; c++) { 1597fa9e4066Sahrens vdev_t *cvd = rvd->vdev_child[c]; 1598fa9e4066Sahrens vdev_stat_t *cvs = &cvd->vdev_stat; 1599fa9e4066Sahrens 1600fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1601fa9e4066Sahrens for (t = 0; t < ZIO_TYPES; t++) { 1602fa9e4066Sahrens vs->vs_ops[t] += cvs->vs_ops[t]; 1603fa9e4066Sahrens vs->vs_bytes[t] += cvs->vs_bytes[t]; 1604fa9e4066Sahrens } 1605fa9e4066Sahrens vs->vs_read_errors += cvs->vs_read_errors; 1606fa9e4066Sahrens vs->vs_write_errors += cvs->vs_write_errors; 1607fa9e4066Sahrens vs->vs_checksum_errors += cvs->vs_checksum_errors; 1608fa9e4066Sahrens vs->vs_scrub_examined += cvs->vs_scrub_examined; 1609fa9e4066Sahrens vs->vs_scrub_errors += cvs->vs_scrub_errors; 1610fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1611fa9e4066Sahrens } 1612fa9e4066Sahrens } 1613fa9e4066Sahrens } 1614fa9e4066Sahrens 1615fa9e4066Sahrens void 1616fa9e4066Sahrens vdev_stat_update(zio_t *zio) 1617fa9e4066Sahrens { 1618fa9e4066Sahrens vdev_t *vd = zio->io_vd; 1619fa9e4066Sahrens vdev_t *pvd; 1620fa9e4066Sahrens uint64_t txg = zio->io_txg; 1621fa9e4066Sahrens vdev_stat_t *vs = &vd->vdev_stat; 1622fa9e4066Sahrens zio_type_t type = zio->io_type; 1623fa9e4066Sahrens int flags = zio->io_flags; 1624fa9e4066Sahrens 1625fa9e4066Sahrens if (zio->io_error == 0) { 1626fa9e4066Sahrens if (!(flags & ZIO_FLAG_IO_BYPASS)) { 1627fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1628fa9e4066Sahrens vs->vs_ops[type]++; 1629fa9e4066Sahrens vs->vs_bytes[type] += zio->io_size; 1630fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1631fa9e4066Sahrens } 1632fa9e4066Sahrens if ((flags & ZIO_FLAG_IO_REPAIR) && 1633fa9e4066Sahrens zio->io_delegate_list == NULL) { 1634fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1635d80c45e0Sbonwick if (flags & ZIO_FLAG_SCRUB_THREAD) 1636fa9e4066Sahrens vs->vs_scrub_repaired += zio->io_size; 1637fa9e4066Sahrens else 1638fa9e4066Sahrens vs->vs_self_healed += zio->io_size; 1639fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1640fa9e4066Sahrens } 1641fa9e4066Sahrens return; 1642fa9e4066Sahrens } 1643fa9e4066Sahrens 1644fa9e4066Sahrens if (flags & ZIO_FLAG_SPECULATIVE) 1645fa9e4066Sahrens return; 1646fa9e4066Sahrens 1647fa9e4066Sahrens if (!vdev_is_dead(vd)) { 1648fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1649fa9e4066Sahrens if (type == ZIO_TYPE_READ) { 1650fa9e4066Sahrens if (zio->io_error == ECKSUM) 1651fa9e4066Sahrens vs->vs_checksum_errors++; 1652fa9e4066Sahrens else 1653fa9e4066Sahrens vs->vs_read_errors++; 1654fa9e4066Sahrens } 1655fa9e4066Sahrens if (type == ZIO_TYPE_WRITE) 1656fa9e4066Sahrens vs->vs_write_errors++; 1657fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1658fa9e4066Sahrens } 1659fa9e4066Sahrens 1660fa9e4066Sahrens if (type == ZIO_TYPE_WRITE) { 1661fa9e4066Sahrens if (txg == 0 || vd->vdev_children != 0) 1662fa9e4066Sahrens return; 1663d80c45e0Sbonwick if (flags & ZIO_FLAG_SCRUB_THREAD) { 1664fa9e4066Sahrens ASSERT(flags & ZIO_FLAG_IO_REPAIR); 1665fa9e4066Sahrens for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) 1666fa9e4066Sahrens vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); 1667fa9e4066Sahrens } 1668fa9e4066Sahrens if (!(flags & ZIO_FLAG_IO_REPAIR)) { 1669fa9e4066Sahrens if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) 1670fa9e4066Sahrens return; 1671ecc2d604Sbonwick vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1672fa9e4066Sahrens for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) 1673fa9e4066Sahrens vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); 1674fa9e4066Sahrens } 1675fa9e4066Sahrens } 1676fa9e4066Sahrens } 1677fa9e4066Sahrens 1678fa9e4066Sahrens void 1679fa9e4066Sahrens vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) 1680fa9e4066Sahrens { 1681fa9e4066Sahrens int c; 1682fa9e4066Sahrens vdev_stat_t *vs = &vd->vdev_stat; 1683fa9e4066Sahrens 1684fa9e4066Sahrens for (c = 0; c < vd->vdev_children; c++) 1685fa9e4066Sahrens vdev_scrub_stat_update(vd->vdev_child[c], type, complete); 1686fa9e4066Sahrens 1687fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1688fa9e4066Sahrens 1689fa9e4066Sahrens if (type == POOL_SCRUB_NONE) { 1690fa9e4066Sahrens /* 1691fa9e4066Sahrens * Update completion and end time. Leave everything else alone 1692fa9e4066Sahrens * so we can report what happened during the previous scrub. 1693fa9e4066Sahrens */ 1694fa9e4066Sahrens vs->vs_scrub_complete = complete; 1695fa9e4066Sahrens vs->vs_scrub_end = gethrestime_sec(); 1696fa9e4066Sahrens } else { 1697fa9e4066Sahrens vs->vs_scrub_type = type; 1698fa9e4066Sahrens vs->vs_scrub_complete = 0; 1699fa9e4066Sahrens vs->vs_scrub_examined = 0; 1700fa9e4066Sahrens vs->vs_scrub_repaired = 0; 1701fa9e4066Sahrens vs->vs_scrub_errors = 0; 1702fa9e4066Sahrens vs->vs_scrub_start = gethrestime_sec(); 1703fa9e4066Sahrens vs->vs_scrub_end = 0; 1704fa9e4066Sahrens } 1705fa9e4066Sahrens 1706fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1707fa9e4066Sahrens } 1708fa9e4066Sahrens 1709fa9e4066Sahrens /* 1710fa9e4066Sahrens * Update the in-core space usage stats for this vdev and the root vdev. 1711fa9e4066Sahrens */ 1712fa9e4066Sahrens void 171399653d4eSeschrock vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) 1714fa9e4066Sahrens { 1715fa9e4066Sahrens ASSERT(vd == vd->vdev_top); 171699653d4eSeschrock int64_t dspace_delta = space_delta; 1717fa9e4066Sahrens 1718fa9e4066Sahrens do { 171999653d4eSeschrock if (vd->vdev_ms_count) { 172099653d4eSeschrock /* 172199653d4eSeschrock * If this is a top-level vdev, apply the 172299653d4eSeschrock * inverse of its psize-to-asize (ie. RAID-Z) 172399653d4eSeschrock * space-expansion factor. We must calculate 172499653d4eSeschrock * this here and not at the root vdev because 172599653d4eSeschrock * the root vdev's psize-to-asize is simply the 172699653d4eSeschrock * max of its childrens', thus not accurate 172799653d4eSeschrock * enough for us. 172899653d4eSeschrock */ 172999653d4eSeschrock ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 173099653d4eSeschrock dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 173199653d4eSeschrock vd->vdev_deflate_ratio; 173299653d4eSeschrock } 173399653d4eSeschrock 1734fa9e4066Sahrens mutex_enter(&vd->vdev_stat_lock); 1735fa9e4066Sahrens vd->vdev_stat.vs_space += space_delta; 1736fa9e4066Sahrens vd->vdev_stat.vs_alloc += alloc_delta; 173799653d4eSeschrock vd->vdev_stat.vs_dspace += dspace_delta; 1738fa9e4066Sahrens mutex_exit(&vd->vdev_stat_lock); 1739fa9e4066Sahrens } while ((vd = vd->vdev_parent) != NULL); 1740fa9e4066Sahrens } 1741fa9e4066Sahrens 1742fa9e4066Sahrens /* 1743fa9e4066Sahrens * Mark a top-level vdev's config as dirty, placing it on the dirty list 1744fa9e4066Sahrens * so that it will be written out next time the vdev configuration is synced. 1745fa9e4066Sahrens * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 1746fa9e4066Sahrens */ 1747fa9e4066Sahrens void 1748fa9e4066Sahrens vdev_config_dirty(vdev_t *vd) 1749fa9e4066Sahrens { 1750fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 1751fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 1752fa9e4066Sahrens int c; 1753fa9e4066Sahrens 17545dabedeeSbonwick /* 17555dabedeeSbonwick * The dirty list is protected by the config lock. The caller must 17565dabedeeSbonwick * either hold the config lock as writer, or must be the sync thread 17575dabedeeSbonwick * (which holds the lock as reader). There's only one sync thread, 17585dabedeeSbonwick * so this is sufficient to ensure mutual exclusion. 17595dabedeeSbonwick */ 17605dabedeeSbonwick ASSERT(spa_config_held(spa, RW_WRITER) || 17615dabedeeSbonwick dsl_pool_sync_context(spa_get_dsl(spa))); 17625dabedeeSbonwick 1763fa9e4066Sahrens if (vd == rvd) { 1764fa9e4066Sahrens for (c = 0; c < rvd->vdev_children; c++) 1765fa9e4066Sahrens vdev_config_dirty(rvd->vdev_child[c]); 1766fa9e4066Sahrens } else { 1767fa9e4066Sahrens ASSERT(vd == vd->vdev_top); 1768fa9e4066Sahrens 1769ecc2d604Sbonwick if (!list_link_active(&vd->vdev_dirty_node)) 1770fa9e4066Sahrens list_insert_head(&spa->spa_dirty_list, vd); 1771fa9e4066Sahrens } 1772fa9e4066Sahrens } 1773fa9e4066Sahrens 1774fa9e4066Sahrens void 1775fa9e4066Sahrens vdev_config_clean(vdev_t *vd) 1776fa9e4066Sahrens { 17775dabedeeSbonwick spa_t *spa = vd->vdev_spa; 17785dabedeeSbonwick 17795dabedeeSbonwick ASSERT(spa_config_held(spa, RW_WRITER) || 17805dabedeeSbonwick dsl_pool_sync_context(spa_get_dsl(spa))); 17815dabedeeSbonwick 1782ecc2d604Sbonwick ASSERT(list_link_active(&vd->vdev_dirty_node)); 17835dabedeeSbonwick list_remove(&spa->spa_dirty_list, vd); 1784fa9e4066Sahrens } 1785fa9e4066Sahrens 178644cd46caSbillm void 178744cd46caSbillm vdev_propagate_state(vdev_t *vd) 178844cd46caSbillm { 178944cd46caSbillm vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 179044cd46caSbillm int degraded = 0, faulted = 0; 179144cd46caSbillm int corrupted = 0; 179244cd46caSbillm int c; 179344cd46caSbillm vdev_t *child; 179444cd46caSbillm 179544cd46caSbillm for (c = 0; c < vd->vdev_children; c++) { 179644cd46caSbillm child = vd->vdev_child[c]; 179744cd46caSbillm if (child->vdev_state <= VDEV_STATE_CANT_OPEN) 179844cd46caSbillm faulted++; 179944cd46caSbillm else if (child->vdev_state == VDEV_STATE_DEGRADED) 180044cd46caSbillm degraded++; 180144cd46caSbillm 180244cd46caSbillm if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 180344cd46caSbillm corrupted++; 180444cd46caSbillm } 180544cd46caSbillm 180644cd46caSbillm vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 180744cd46caSbillm 180844cd46caSbillm /* 180944cd46caSbillm * Root special: if there is a toplevel vdev that cannot be 181044cd46caSbillm * opened due to corrupted metadata, then propagate the root 181144cd46caSbillm * vdev's aux state as 'corrupt' rather than 'insufficient 181244cd46caSbillm * replicas'. 181344cd46caSbillm */ 181444cd46caSbillm if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) 181544cd46caSbillm vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 181644cd46caSbillm VDEV_AUX_CORRUPT_DATA); 181744cd46caSbillm } 181844cd46caSbillm 1819fa9e4066Sahrens /* 1820ea8dc4b6Seschrock * Set a vdev's state. If this is during an open, we don't update the parent 1821ea8dc4b6Seschrock * state, because we're in the process of opening children depth-first. 1822ea8dc4b6Seschrock * Otherwise, we propagate the change to the parent. 1823ea8dc4b6Seschrock * 1824ea8dc4b6Seschrock * If this routine places a device in a faulted state, an appropriate ereport is 1825ea8dc4b6Seschrock * generated. 1826fa9e4066Sahrens */ 1827fa9e4066Sahrens void 1828ea8dc4b6Seschrock vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 1829fa9e4066Sahrens { 1830560e6e96Seschrock uint64_t save_state; 1831ea8dc4b6Seschrock 1832ea8dc4b6Seschrock if (state == vd->vdev_state) { 1833ea8dc4b6Seschrock vd->vdev_stat.vs_aux = aux; 1834fa9e4066Sahrens return; 1835ea8dc4b6Seschrock } 1836ea8dc4b6Seschrock 1837560e6e96Seschrock save_state = vd->vdev_state; 1838fa9e4066Sahrens 1839fa9e4066Sahrens vd->vdev_state = state; 1840fa9e4066Sahrens vd->vdev_stat.vs_aux = aux; 1841fa9e4066Sahrens 1842ea8dc4b6Seschrock if (state == VDEV_STATE_CANT_OPEN) { 1843ea8dc4b6Seschrock /* 1844ea8dc4b6Seschrock * If we fail to open a vdev during an import, we mark it as 1845ea8dc4b6Seschrock * "not available", which signifies that it was never there to 1846ea8dc4b6Seschrock * begin with. Failure to open such a device is not considered 1847ea8dc4b6Seschrock * an error. 1848ea8dc4b6Seschrock */ 1849560e6e96Seschrock if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT && 1850560e6e96Seschrock vd->vdev_ops->vdev_op_leaf) 1851560e6e96Seschrock vd->vdev_not_present = 1; 1852560e6e96Seschrock 1853560e6e96Seschrock /* 1854560e6e96Seschrock * Post the appropriate ereport. If the 'prevstate' field is 1855560e6e96Seschrock * set to something other than VDEV_STATE_UNKNOWN, it indicates 1856560e6e96Seschrock * that this is part of a vdev_reopen(). In this case, we don't 1857560e6e96Seschrock * want to post the ereport if the device was already in the 1858560e6e96Seschrock * CANT_OPEN state beforehand. 1859560e6e96Seschrock */ 1860560e6e96Seschrock if (vd->vdev_prevstate != state && !vd->vdev_not_present && 1861ea8dc4b6Seschrock vd != vd->vdev_spa->spa_root_vdev) { 1862ea8dc4b6Seschrock const char *class; 1863ea8dc4b6Seschrock 1864ea8dc4b6Seschrock switch (aux) { 1865ea8dc4b6Seschrock case VDEV_AUX_OPEN_FAILED: 1866ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 1867ea8dc4b6Seschrock break; 1868ea8dc4b6Seschrock case VDEV_AUX_CORRUPT_DATA: 1869ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 1870ea8dc4b6Seschrock break; 1871ea8dc4b6Seschrock case VDEV_AUX_NO_REPLICAS: 1872ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 1873ea8dc4b6Seschrock break; 1874ea8dc4b6Seschrock case VDEV_AUX_BAD_GUID_SUM: 1875ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 1876ea8dc4b6Seschrock break; 1877ea8dc4b6Seschrock case VDEV_AUX_TOO_SMALL: 1878ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 1879ea8dc4b6Seschrock break; 1880ea8dc4b6Seschrock case VDEV_AUX_BAD_LABEL: 1881ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 1882ea8dc4b6Seschrock break; 1883ea8dc4b6Seschrock default: 1884ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 1885ea8dc4b6Seschrock } 1886ea8dc4b6Seschrock 1887ea8dc4b6Seschrock zfs_ereport_post(class, vd->vdev_spa, 1888560e6e96Seschrock vd, NULL, save_state, 0); 1889ea8dc4b6Seschrock } 1890ea8dc4b6Seschrock } 1891ea8dc4b6Seschrock 1892ea8dc4b6Seschrock if (isopen) 1893ea8dc4b6Seschrock return; 1894ea8dc4b6Seschrock 189544cd46caSbillm if (vd->vdev_parent != NULL) 189644cd46caSbillm vdev_propagate_state(vd->vdev_parent); 1897fa9e4066Sahrens } 1898