1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5441d80aaSlling * Common Development and Distribution License (the "License"). 6441d80aaSlling * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 2199653d4eSeschrock 22fa9e4066Sahrens /* 2398d1cbfeSGeorge Wilson * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 245cabbc6bSPrashanth Sreenivasa * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 255f368aefSYuri Pankov * Copyright 2017 Nexenta Systems, Inc. 26c3d26abcSMatthew Ahrens * Copyright (c) 2014 Integros [integros.com] 27c8811bd3SToomas Soome * Copyright 2016 Toomas Soome <tsoome@me.com> 28ce1577b0SDave Eddy * Copyright 2017 Joyent, Inc. 
29fa9e4066Sahrens */ 30fa9e4066Sahrens 31fa9e4066Sahrens #include <sys/zfs_context.h> 32ea8dc4b6Seschrock #include <sys/fm/fs/zfs.h> 33fa9e4066Sahrens #include <sys/spa.h> 34fa9e4066Sahrens #include <sys/spa_impl.h> 355cabbc6bSPrashanth Sreenivasa #include <sys/bpobj.h> 36fa9e4066Sahrens #include <sys/dmu.h> 37fa9e4066Sahrens #include <sys/dmu_tx.h> 385cabbc6bSPrashanth Sreenivasa #include <sys/dsl_dir.h> 39fa9e4066Sahrens #include <sys/vdev_impl.h> 40fa9e4066Sahrens #include <sys/uberblock_impl.h> 41fa9e4066Sahrens #include <sys/metaslab.h> 42fa9e4066Sahrens #include <sys/metaslab_impl.h> 43fa9e4066Sahrens #include <sys/space_map.h> 440713e232SGeorge Wilson #include <sys/space_reftree.h> 45fa9e4066Sahrens #include <sys/zio.h> 46fa9e4066Sahrens #include <sys/zap.h> 47fa9e4066Sahrens #include <sys/fs/zfs.h> 48c5904d13Seschrock #include <sys/arc.h> 49e6ca193dSGeorge Wilson #include <sys/zil.h> 503f9d6ad7SLin Ling #include <sys/dsl_scan.h> 51770499e1SDan Kimmel #include <sys/abd.h> 52fa9e4066Sahrens 53fa9e4066Sahrens /* 54fa9e4066Sahrens * Virtual device management. 55fa9e4066Sahrens */ 56fa9e4066Sahrens 57fa9e4066Sahrens static vdev_ops_t *vdev_ops_table[] = { 58fa9e4066Sahrens &vdev_root_ops, 59fa9e4066Sahrens &vdev_raidz_ops, 60fa9e4066Sahrens &vdev_mirror_ops, 61fa9e4066Sahrens &vdev_replacing_ops, 6299653d4eSeschrock &vdev_spare_ops, 63fa9e4066Sahrens &vdev_disk_ops, 64fa9e4066Sahrens &vdev_file_ops, 65fa9e4066Sahrens &vdev_missing_ops, 6688ecc943SGeorge Wilson &vdev_hole_ops, 675cabbc6bSPrashanth Sreenivasa &vdev_indirect_ops, 68fa9e4066Sahrens NULL 69fa9e4066Sahrens }; 70fa9e4066Sahrens 71088f3894Sahrens /* maximum scrub/resilver I/O queue per leaf vdev */ 72088f3894Sahrens int zfs_scrub_limit = 10; 7305b2b3b8Smishra 74bf3e216cSMatthew Ahrens /* 75bf3e216cSMatthew Ahrens * When a vdev is added, it will be divided into approximately (but no 76bf3e216cSMatthew Ahrens * more than) this number of metaslabs. 
77bf3e216cSMatthew Ahrens */ 78bf3e216cSMatthew Ahrens int metaslabs_per_vdev = 200; 79bf3e216cSMatthew Ahrens 80*3ee8c80cSPavel Zakharov /*PRINTFLIKE2*/ 81*3ee8c80cSPavel Zakharov void 82*3ee8c80cSPavel Zakharov vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) 83*3ee8c80cSPavel Zakharov { 84*3ee8c80cSPavel Zakharov va_list adx; 85*3ee8c80cSPavel Zakharov char buf[256]; 86*3ee8c80cSPavel Zakharov 87*3ee8c80cSPavel Zakharov va_start(adx, fmt); 88*3ee8c80cSPavel Zakharov (void) vsnprintf(buf, sizeof (buf), fmt, adx); 89*3ee8c80cSPavel Zakharov va_end(adx); 90*3ee8c80cSPavel Zakharov 91*3ee8c80cSPavel Zakharov if (vd->vdev_path != NULL) { 92*3ee8c80cSPavel Zakharov zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, 93*3ee8c80cSPavel Zakharov vd->vdev_path, buf); 94*3ee8c80cSPavel Zakharov } else { 95*3ee8c80cSPavel Zakharov zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", 96*3ee8c80cSPavel Zakharov vd->vdev_ops->vdev_op_type, 97*3ee8c80cSPavel Zakharov (u_longlong_t)vd->vdev_id, 98*3ee8c80cSPavel Zakharov (u_longlong_t)vd->vdev_guid, buf); 99*3ee8c80cSPavel Zakharov } 100*3ee8c80cSPavel Zakharov } 101*3ee8c80cSPavel Zakharov 102fa9e4066Sahrens /* 103fa9e4066Sahrens * Given a vdev type, return the appropriate ops vector. 104fa9e4066Sahrens */ 105fa9e4066Sahrens static vdev_ops_t * 106fa9e4066Sahrens vdev_getops(const char *type) 107fa9e4066Sahrens { 108fa9e4066Sahrens vdev_ops_t *ops, **opspp; 109fa9e4066Sahrens 110fa9e4066Sahrens for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 111fa9e4066Sahrens if (strcmp(ops->vdev_op_type, type) == 0) 112fa9e4066Sahrens break; 113fa9e4066Sahrens 114fa9e4066Sahrens return (ops); 115fa9e4066Sahrens } 116fa9e4066Sahrens 117fa9e4066Sahrens /* 118fa9e4066Sahrens * Default asize function: return the MAX of psize with the asize of 119fa9e4066Sahrens * all children. This is what's used by anything other than RAID-Z. 
120fa9e4066Sahrens */ 121fa9e4066Sahrens uint64_t 122fa9e4066Sahrens vdev_default_asize(vdev_t *vd, uint64_t psize) 123fa9e4066Sahrens { 124ecc2d604Sbonwick uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 125fa9e4066Sahrens uint64_t csize; 126fa9e4066Sahrens 127573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) { 128fa9e4066Sahrens csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 129fa9e4066Sahrens asize = MAX(asize, csize); 130fa9e4066Sahrens } 131fa9e4066Sahrens 132fa9e4066Sahrens return (asize); 133fa9e4066Sahrens } 134fa9e4066Sahrens 1352a79c5feSlling /* 136573ca77eSGeorge Wilson * Get the minimum allocatable size. We define the allocatable size as 137573ca77eSGeorge Wilson * the vdev's asize rounded to the nearest metaslab. This allows us to 138573ca77eSGeorge Wilson * replace or attach devices which don't have the same physical size but 139573ca77eSGeorge Wilson * can still satisfy the same number of allocations. 1402a79c5feSlling */ 1412a79c5feSlling uint64_t 142573ca77eSGeorge Wilson vdev_get_min_asize(vdev_t *vd) 1432a79c5feSlling { 144573ca77eSGeorge Wilson vdev_t *pvd = vd->vdev_parent; 1452a79c5feSlling 146573ca77eSGeorge Wilson /* 1474263d13fSGeorge Wilson * If our parent is NULL (inactive spare or cache) or is the root, 148573ca77eSGeorge Wilson * just return our own asize. 149573ca77eSGeorge Wilson */ 150573ca77eSGeorge Wilson if (pvd == NULL) 151573ca77eSGeorge Wilson return (vd->vdev_asize); 1522a79c5feSlling 1532a79c5feSlling /* 154573ca77eSGeorge Wilson * The top-level vdev just returns the allocatable size rounded 155573ca77eSGeorge Wilson * to the nearest metaslab. 
1562a79c5feSlling */ 157573ca77eSGeorge Wilson if (vd == vd->vdev_top) 158573ca77eSGeorge Wilson return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 1592a79c5feSlling 160573ca77eSGeorge Wilson /* 161573ca77eSGeorge Wilson * The allocatable space for a raidz vdev is N * sizeof(smallest child), 162573ca77eSGeorge Wilson * so each child must provide at least 1/Nth of its asize. 163573ca77eSGeorge Wilson */ 164573ca77eSGeorge Wilson if (pvd->vdev_ops == &vdev_raidz_ops) 165c040c10cSSteven Hartland return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / 166c040c10cSSteven Hartland pvd->vdev_children); 1672a79c5feSlling 168573ca77eSGeorge Wilson return (pvd->vdev_min_asize); 169573ca77eSGeorge Wilson } 1702a79c5feSlling 171573ca77eSGeorge Wilson void 172573ca77eSGeorge Wilson vdev_set_min_asize(vdev_t *vd) 173573ca77eSGeorge Wilson { 174573ca77eSGeorge Wilson vd->vdev_min_asize = vdev_get_min_asize(vd); 175573ca77eSGeorge Wilson 176573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 177573ca77eSGeorge Wilson vdev_set_min_asize(vd->vdev_child[c]); 1782a79c5feSlling } 1792a79c5feSlling 180fa9e4066Sahrens vdev_t * 181fa9e4066Sahrens vdev_lookup_top(spa_t *spa, uint64_t vdev) 182fa9e4066Sahrens { 183fa9e4066Sahrens vdev_t *rvd = spa->spa_root_vdev; 184fa9e4066Sahrens 185e14bb325SJeff Bonwick ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 186e05725b1Sbonwick 187088f3894Sahrens if (vdev < rvd->vdev_children) { 188088f3894Sahrens ASSERT(rvd->vdev_child[vdev] != NULL); 189fa9e4066Sahrens return (rvd->vdev_child[vdev]); 190088f3894Sahrens } 191fa9e4066Sahrens 192fa9e4066Sahrens return (NULL); 193fa9e4066Sahrens } 194fa9e4066Sahrens 195fa9e4066Sahrens vdev_t * 196fa9e4066Sahrens vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 197fa9e4066Sahrens { 198fa9e4066Sahrens vdev_t *mvd; 199fa9e4066Sahrens 2000e34b6a7Sbonwick if (vd->vdev_guid == guid) 201fa9e4066Sahrens return (vd); 202fa9e4066Sahrens 203573ca77eSGeorge Wilson for (int c = 0; c < 
vd->vdev_children; c++) 204fa9e4066Sahrens if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 205fa9e4066Sahrens NULL) 206fa9e4066Sahrens return (mvd); 207fa9e4066Sahrens 208fa9e4066Sahrens return (NULL); 209fa9e4066Sahrens } 210fa9e4066Sahrens 21112380e1eSArne Jansen static int 21212380e1eSArne Jansen vdev_count_leaves_impl(vdev_t *vd) 21312380e1eSArne Jansen { 21412380e1eSArne Jansen int n = 0; 21512380e1eSArne Jansen 21612380e1eSArne Jansen if (vd->vdev_ops->vdev_op_leaf) 21712380e1eSArne Jansen return (1); 21812380e1eSArne Jansen 21912380e1eSArne Jansen for (int c = 0; c < vd->vdev_children; c++) 22012380e1eSArne Jansen n += vdev_count_leaves_impl(vd->vdev_child[c]); 22112380e1eSArne Jansen 22212380e1eSArne Jansen return (n); 22312380e1eSArne Jansen } 22412380e1eSArne Jansen 22512380e1eSArne Jansen int 22612380e1eSArne Jansen vdev_count_leaves(spa_t *spa) 22712380e1eSArne Jansen { 22812380e1eSArne Jansen return (vdev_count_leaves_impl(spa->spa_root_vdev)); 22912380e1eSArne Jansen } 23012380e1eSArne Jansen 231fa9e4066Sahrens void 232fa9e4066Sahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd) 233fa9e4066Sahrens { 234fa9e4066Sahrens size_t oldsize, newsize; 235fa9e4066Sahrens uint64_t id = cvd->vdev_id; 236fa9e4066Sahrens vdev_t **newchild; 23781cd5c55SMatthew Ahrens spa_t *spa = cvd->vdev_spa; 238fa9e4066Sahrens 23981cd5c55SMatthew Ahrens ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 240fa9e4066Sahrens ASSERT(cvd->vdev_parent == NULL); 241fa9e4066Sahrens 242fa9e4066Sahrens cvd->vdev_parent = pvd; 243fa9e4066Sahrens 244fa9e4066Sahrens if (pvd == NULL) 245fa9e4066Sahrens return; 246fa9e4066Sahrens 247fa9e4066Sahrens ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 248fa9e4066Sahrens 249fa9e4066Sahrens oldsize = pvd->vdev_children * sizeof (vdev_t *); 250fa9e4066Sahrens pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 251fa9e4066Sahrens newsize = pvd->vdev_children * sizeof (vdev_t *); 252fa9e4066Sahrens 253fa9e4066Sahrens 
newchild = kmem_zalloc(newsize, KM_SLEEP); 254fa9e4066Sahrens if (pvd->vdev_child != NULL) { 255fa9e4066Sahrens bcopy(pvd->vdev_child, newchild, oldsize); 256fa9e4066Sahrens kmem_free(pvd->vdev_child, oldsize); 257fa9e4066Sahrens } 258fa9e4066Sahrens 259fa9e4066Sahrens pvd->vdev_child = newchild; 260fa9e4066Sahrens pvd->vdev_child[id] = cvd; 261fa9e4066Sahrens 262fa9e4066Sahrens cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 263fa9e4066Sahrens ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 264fa9e4066Sahrens 265fa9e4066Sahrens /* 266fa9e4066Sahrens * Walk up all ancestors to update guid sum. 267fa9e4066Sahrens */ 268fa9e4066Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 269fa9e4066Sahrens pvd->vdev_guid_sum += cvd->vdev_guid_sum; 270fa9e4066Sahrens } 271fa9e4066Sahrens 272fa9e4066Sahrens void 273fa9e4066Sahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 274fa9e4066Sahrens { 275fa9e4066Sahrens int c; 276fa9e4066Sahrens uint_t id = cvd->vdev_id; 277fa9e4066Sahrens 278fa9e4066Sahrens ASSERT(cvd->vdev_parent == pvd); 279fa9e4066Sahrens 280fa9e4066Sahrens if (pvd == NULL) 281fa9e4066Sahrens return; 282fa9e4066Sahrens 283fa9e4066Sahrens ASSERT(id < pvd->vdev_children); 284fa9e4066Sahrens ASSERT(pvd->vdev_child[id] == cvd); 285fa9e4066Sahrens 286fa9e4066Sahrens pvd->vdev_child[id] = NULL; 287fa9e4066Sahrens cvd->vdev_parent = NULL; 288fa9e4066Sahrens 289fa9e4066Sahrens for (c = 0; c < pvd->vdev_children; c++) 290fa9e4066Sahrens if (pvd->vdev_child[c]) 291fa9e4066Sahrens break; 292fa9e4066Sahrens 293fa9e4066Sahrens if (c == pvd->vdev_children) { 294fa9e4066Sahrens kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 295fa9e4066Sahrens pvd->vdev_child = NULL; 296fa9e4066Sahrens pvd->vdev_children = 0; 297fa9e4066Sahrens } 298fa9e4066Sahrens 299fa9e4066Sahrens /* 300fa9e4066Sahrens * Walk up all ancestors to update guid sum. 
301fa9e4066Sahrens */ 302fa9e4066Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 303fa9e4066Sahrens pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 304fa9e4066Sahrens } 305fa9e4066Sahrens 306fa9e4066Sahrens /* 307fa9e4066Sahrens * Remove any holes in the child array. 308fa9e4066Sahrens */ 309fa9e4066Sahrens void 310fa9e4066Sahrens vdev_compact_children(vdev_t *pvd) 311fa9e4066Sahrens { 312fa9e4066Sahrens vdev_t **newchild, *cvd; 313fa9e4066Sahrens int oldc = pvd->vdev_children; 314573ca77eSGeorge Wilson int newc; 315fa9e4066Sahrens 316e14bb325SJeff Bonwick ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 317fa9e4066Sahrens 318573ca77eSGeorge Wilson for (int c = newc = 0; c < oldc; c++) 319fa9e4066Sahrens if (pvd->vdev_child[c]) 320fa9e4066Sahrens newc++; 321fa9e4066Sahrens 322fa9e4066Sahrens newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 323fa9e4066Sahrens 324573ca77eSGeorge Wilson for (int c = newc = 0; c < oldc; c++) { 325fa9e4066Sahrens if ((cvd = pvd->vdev_child[c]) != NULL) { 326fa9e4066Sahrens newchild[newc] = cvd; 327fa9e4066Sahrens cvd->vdev_id = newc++; 328fa9e4066Sahrens } 329fa9e4066Sahrens } 330fa9e4066Sahrens 331fa9e4066Sahrens kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 332fa9e4066Sahrens pvd->vdev_child = newchild; 333fa9e4066Sahrens pvd->vdev_children = newc; 334fa9e4066Sahrens } 335fa9e4066Sahrens 336fa9e4066Sahrens /* 337fa9e4066Sahrens * Allocate and minimally initialize a vdev_t. 
338fa9e4066Sahrens */ 33988ecc943SGeorge Wilson vdev_t * 340fa9e4066Sahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 341fa9e4066Sahrens { 342fa9e4066Sahrens vdev_t *vd; 3435cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic; 344fa9e4066Sahrens 345fa9e4066Sahrens vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 3465cabbc6bSPrashanth Sreenivasa vic = &vd->vdev_indirect_config; 347fa9e4066Sahrens 3480e34b6a7Sbonwick if (spa->spa_root_vdev == NULL) { 3490e34b6a7Sbonwick ASSERT(ops == &vdev_root_ops); 3500e34b6a7Sbonwick spa->spa_root_vdev = vd; 351e9103aaeSGarrett D'Amore spa->spa_load_guid = spa_generate_guid(NULL); 3520e34b6a7Sbonwick } 3530e34b6a7Sbonwick 35488ecc943SGeorge Wilson if (guid == 0 && ops != &vdev_hole_ops) { 3550e34b6a7Sbonwick if (spa->spa_root_vdev == vd) { 3560e34b6a7Sbonwick /* 3570e34b6a7Sbonwick * The root vdev's guid will also be the pool guid, 3580e34b6a7Sbonwick * which must be unique among all pools. 3590e34b6a7Sbonwick */ 3601195e687SMark J Musante guid = spa_generate_guid(NULL); 3610e34b6a7Sbonwick } else { 3620e34b6a7Sbonwick /* 3630e34b6a7Sbonwick * Any other vdev's guid must be unique within the pool. 
3640e34b6a7Sbonwick */ 3651195e687SMark J Musante guid = spa_generate_guid(spa); 3660e34b6a7Sbonwick } 3670e34b6a7Sbonwick ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 3680e34b6a7Sbonwick } 3690e34b6a7Sbonwick 370fa9e4066Sahrens vd->vdev_spa = spa; 371fa9e4066Sahrens vd->vdev_id = id; 372fa9e4066Sahrens vd->vdev_guid = guid; 373fa9e4066Sahrens vd->vdev_guid_sum = guid; 374fa9e4066Sahrens vd->vdev_ops = ops; 375fa9e4066Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 37688ecc943SGeorge Wilson vd->vdev_ishole = (ops == &vdev_hole_ops); 3775cabbc6bSPrashanth Sreenivasa vic->vic_prev_indirect_vdev = UINT64_MAX; 3785cabbc6bSPrashanth Sreenivasa 3795cabbc6bSPrashanth Sreenivasa rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); 3805cabbc6bSPrashanth Sreenivasa mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); 3815cabbc6bSPrashanth Sreenivasa vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); 382fa9e4066Sahrens 383fa9e4066Sahrens mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 3845ad82045Snd mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 385e14bb325SJeff Bonwick mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 3860f7643c7SGeorge Wilson mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); 3878ad4d6ddSJeff Bonwick for (int t = 0; t < DTL_TYPES; t++) { 3885cabbc6bSPrashanth Sreenivasa vd->vdev_dtl[t] = range_tree_create(NULL, NULL); 3898ad4d6ddSJeff Bonwick } 390b7b2590dSMatthew Ahrens txg_list_create(&vd->vdev_ms_list, spa, 391fa9e4066Sahrens offsetof(struct metaslab, ms_txg_node)); 392b7b2590dSMatthew Ahrens txg_list_create(&vd->vdev_dtl_list, spa, 393fa9e4066Sahrens offsetof(struct vdev, vdev_dtl_node)); 394fa9e4066Sahrens vd->vdev_stat.vs_timestamp = gethrtime(); 3953d7072f8Seschrock vdev_queue_init(vd); 3963d7072f8Seschrock vdev_cache_init(vd); 397fa9e4066Sahrens 398fa9e4066Sahrens return (vd); 399fa9e4066Sahrens } 400fa9e4066Sahrens 401fa9e4066Sahrens /* 402fa9e4066Sahrens * Allocate a new vdev. 
The 'alloctype' is used to control whether we are 403fa9e4066Sahrens * creating a new vdev or loading an existing one - the behavior is slightly 404fa9e4066Sahrens * different for each case. 405fa9e4066Sahrens */ 40699653d4eSeschrock int 40799653d4eSeschrock vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 40899653d4eSeschrock int alloctype) 409fa9e4066Sahrens { 410fa9e4066Sahrens vdev_ops_t *ops; 411fa9e4066Sahrens char *type; 4128654d025Sperrin uint64_t guid = 0, islog, nparity; 413fa9e4066Sahrens vdev_t *vd; 4145cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic; 415fa9e4066Sahrens 416e14bb325SJeff Bonwick ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 417fa9e4066Sahrens 418fa9e4066Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 419be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 420fa9e4066Sahrens 421fa9e4066Sahrens if ((ops = vdev_getops(type)) == NULL) 422be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 423fa9e4066Sahrens 424fa9e4066Sahrens /* 425fa9e4066Sahrens * If this is a load, get the vdev guid from the nvlist. 426fa9e4066Sahrens * Otherwise, vdev_alloc_common() will generate one for us. 
427fa9e4066Sahrens */ 428fa9e4066Sahrens if (alloctype == VDEV_ALLOC_LOAD) { 429fa9e4066Sahrens uint64_t label_id; 430fa9e4066Sahrens 431fa9e4066Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 432fa9e4066Sahrens label_id != id) 433be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 434fa9e4066Sahrens 435fa9e4066Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 436be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 43799653d4eSeschrock } else if (alloctype == VDEV_ALLOC_SPARE) { 43899653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 439be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 440fa94a07fSbrendan } else if (alloctype == VDEV_ALLOC_L2CACHE) { 441fa94a07fSbrendan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 442be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 44321ecdf64SLin Ling } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 44421ecdf64SLin Ling if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 445be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 446fa9e4066Sahrens } 447fa9e4066Sahrens 44899653d4eSeschrock /* 44999653d4eSeschrock * The first allocated vdev must be of type 'root'. 45099653d4eSeschrock */ 45199653d4eSeschrock if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 452be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 45399653d4eSeschrock 4548654d025Sperrin /* 4558654d025Sperrin * Determine whether we're a log vdev. 4568654d025Sperrin */ 4578654d025Sperrin islog = 0; 4588654d025Sperrin (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 459990b4856Slling if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 460be6fd75aSMatthew Ahrens return (SET_ERROR(ENOTSUP)); 461fa9e4066Sahrens 46288ecc943SGeorge Wilson if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 463be6fd75aSMatthew Ahrens return (SET_ERROR(ENOTSUP)); 46488ecc943SGeorge Wilson 46599653d4eSeschrock /* 4668654d025Sperrin * Set the nparity property for RAID-Z vdevs. 
46799653d4eSeschrock */ 4688654d025Sperrin nparity = -1ULL; 46999653d4eSeschrock if (ops == &vdev_raidz_ops) { 47099653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 4718654d025Sperrin &nparity) == 0) { 472b24ab676SJeff Bonwick if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 473be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 47499653d4eSeschrock /* 475f94275ceSAdam Leventhal * Previous versions could only support 1 or 2 parity 476f94275ceSAdam Leventhal * device. 47799653d4eSeschrock */ 478f94275ceSAdam Leventhal if (nparity > 1 && 479f94275ceSAdam Leventhal spa_version(spa) < SPA_VERSION_RAIDZ2) 480be6fd75aSMatthew Ahrens return (SET_ERROR(ENOTSUP)); 481f94275ceSAdam Leventhal if (nparity > 2 && 482f94275ceSAdam Leventhal spa_version(spa) < SPA_VERSION_RAIDZ3) 483be6fd75aSMatthew Ahrens return (SET_ERROR(ENOTSUP)); 48499653d4eSeschrock } else { 48599653d4eSeschrock /* 48699653d4eSeschrock * We require the parity to be specified for SPAs that 48799653d4eSeschrock * support multiple parity levels. 48899653d4eSeschrock */ 489f94275ceSAdam Leventhal if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 490be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 49199653d4eSeschrock /* 49299653d4eSeschrock * Otherwise, we default to 1 parity device for RAID-Z. 
49399653d4eSeschrock */ 4948654d025Sperrin nparity = 1; 49599653d4eSeschrock } 49699653d4eSeschrock } else { 4978654d025Sperrin nparity = 0; 49899653d4eSeschrock } 4998654d025Sperrin ASSERT(nparity != -1ULL); 5008654d025Sperrin 5018654d025Sperrin vd = vdev_alloc_common(spa, id, guid, ops); 5025cabbc6bSPrashanth Sreenivasa vic = &vd->vdev_indirect_config; 5038654d025Sperrin 5048654d025Sperrin vd->vdev_islog = islog; 5058654d025Sperrin vd->vdev_nparity = nparity; 5068654d025Sperrin 5078654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 5088654d025Sperrin vd->vdev_path = spa_strdup(vd->vdev_path); 5098654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 5108654d025Sperrin vd->vdev_devid = spa_strdup(vd->vdev_devid); 5118654d025Sperrin if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 5128654d025Sperrin &vd->vdev_physpath) == 0) 5138654d025Sperrin vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 5146809eb4eSEric Schrock if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 5156809eb4eSEric Schrock vd->vdev_fru = spa_strdup(vd->vdev_fru); 51699653d4eSeschrock 517afefbcddSeschrock /* 518afefbcddSeschrock * Set the whole_disk property. If it's not specified, leave the value 519afefbcddSeschrock * as -1. 
520afefbcddSeschrock */ 521afefbcddSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 522afefbcddSeschrock &vd->vdev_wholedisk) != 0) 523afefbcddSeschrock vd->vdev_wholedisk = -1ULL; 524afefbcddSeschrock 5255cabbc6bSPrashanth Sreenivasa ASSERT0(vic->vic_mapping_object); 5265cabbc6bSPrashanth Sreenivasa (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, 5275cabbc6bSPrashanth Sreenivasa &vic->vic_mapping_object); 5285cabbc6bSPrashanth Sreenivasa ASSERT0(vic->vic_births_object); 5295cabbc6bSPrashanth Sreenivasa (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, 5305cabbc6bSPrashanth Sreenivasa &vic->vic_births_object); 5315cabbc6bSPrashanth Sreenivasa ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); 5325cabbc6bSPrashanth Sreenivasa (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, 5335cabbc6bSPrashanth Sreenivasa &vic->vic_prev_indirect_vdev); 5345cabbc6bSPrashanth Sreenivasa 535ea8dc4b6Seschrock /* 536ea8dc4b6Seschrock * Look for the 'not present' flag. This will only be set if the device 537ea8dc4b6Seschrock * was not present at the time of import. 538ea8dc4b6Seschrock */ 5396809eb4eSEric Schrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 5406809eb4eSEric Schrock &vd->vdev_not_present); 541ea8dc4b6Seschrock 542ecc2d604Sbonwick /* 543ecc2d604Sbonwick * Get the alignment requirement. 544ecc2d604Sbonwick */ 545ecc2d604Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 546ecc2d604Sbonwick 54788ecc943SGeorge Wilson /* 54888ecc943SGeorge Wilson * Retrieve the vdev creation time. 54988ecc943SGeorge Wilson */ 55088ecc943SGeorge Wilson (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 55188ecc943SGeorge Wilson &vd->vdev_crtxg); 55288ecc943SGeorge Wilson 553fa9e4066Sahrens /* 554fa9e4066Sahrens * If we're a top-level vdev, try to load the allocation parameters. 
555fa9e4066Sahrens */ 5561195e687SMark J Musante if (parent && !parent->vdev_parent && 5571195e687SMark J Musante (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 558fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 559fa9e4066Sahrens &vd->vdev_ms_array); 560fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 561fa9e4066Sahrens &vd->vdev_ms_shift); 562fa9e4066Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 563fa9e4066Sahrens &vd->vdev_asize); 5643f9d6ad7SLin Ling (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 5653f9d6ad7SLin Ling &vd->vdev_removing); 566215198a6SJoe Stein (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, 567215198a6SJoe Stein &vd->vdev_top_zap); 568215198a6SJoe Stein } else { 569215198a6SJoe Stein ASSERT0(vd->vdev_top_zap); 570fa9e4066Sahrens } 571fa9e4066Sahrens 572cd0837ccSGeorge Wilson if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { 573a1521560SJeff Bonwick ASSERT(alloctype == VDEV_ALLOC_LOAD || 5749f4ab4d8SGeorge Wilson alloctype == VDEV_ALLOC_ADD || 5751195e687SMark J Musante alloctype == VDEV_ALLOC_SPLIT || 5769f4ab4d8SGeorge Wilson alloctype == VDEV_ALLOC_ROOTPOOL); 577a1521560SJeff Bonwick vd->vdev_mg = metaslab_group_create(islog ? 578a1521560SJeff Bonwick spa_log_class(spa) : spa_normal_class(spa), vd); 579a1521560SJeff Bonwick } 580a1521560SJeff Bonwick 581215198a6SJoe Stein if (vd->vdev_ops->vdev_op_leaf && 582215198a6SJoe Stein (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 583215198a6SJoe Stein (void) nvlist_lookup_uint64(nv, 584215198a6SJoe Stein ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); 585215198a6SJoe Stein } else { 586215198a6SJoe Stein ASSERT0(vd->vdev_leaf_zap); 587215198a6SJoe Stein } 588215198a6SJoe Stein 589fa9e4066Sahrens /* 5903d7072f8Seschrock * If we're a leaf vdev, try to load the DTL object and other state. 
591fa9e4066Sahrens */ 592215198a6SJoe Stein 593c5904d13Seschrock if (vd->vdev_ops->vdev_op_leaf && 59421ecdf64SLin Ling (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 59521ecdf64SLin Ling alloctype == VDEV_ALLOC_ROOTPOOL)) { 596c5904d13Seschrock if (alloctype == VDEV_ALLOC_LOAD) { 597c5904d13Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 5980713e232SGeorge Wilson &vd->vdev_dtl_object); 599c5904d13Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 600c5904d13Seschrock &vd->vdev_unspare); 601c5904d13Seschrock } 60221ecdf64SLin Ling 60321ecdf64SLin Ling if (alloctype == VDEV_ALLOC_ROOTPOOL) { 60421ecdf64SLin Ling uint64_t spare = 0; 60521ecdf64SLin Ling 60621ecdf64SLin Ling if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 60721ecdf64SLin Ling &spare) == 0 && spare) 60821ecdf64SLin Ling spa_spare_add(vd); 60921ecdf64SLin Ling } 61021ecdf64SLin Ling 611ecc2d604Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 612ecc2d604Sbonwick &vd->vdev_offline); 613c5904d13Seschrock 614b4952e17SGeorge Wilson (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, 615b4952e17SGeorge Wilson &vd->vdev_resilver_txg); 616cb04b873SMark J Musante 6173d7072f8Seschrock /* 6183d7072f8Seschrock * When importing a pool, we want to ignore the persistent fault 6193d7072f8Seschrock * state, as the diagnosis made on another system may not be 620069f55e2SEric Schrock * valid in the current context. Local vdevs will 621069f55e2SEric Schrock * remain in the faulted state. 
6223d7072f8Seschrock */ 623b16da2e2SGeorge Wilson if (spa_load_state(spa) == SPA_LOAD_OPEN) { 6243d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 6253d7072f8Seschrock &vd->vdev_faulted); 6263d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 6273d7072f8Seschrock &vd->vdev_degraded); 6283d7072f8Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 6293d7072f8Seschrock &vd->vdev_removed); 630069f55e2SEric Schrock 631069f55e2SEric Schrock if (vd->vdev_faulted || vd->vdev_degraded) { 632069f55e2SEric Schrock char *aux; 633069f55e2SEric Schrock 634069f55e2SEric Schrock vd->vdev_label_aux = 635069f55e2SEric Schrock VDEV_AUX_ERR_EXCEEDED; 636069f55e2SEric Schrock if (nvlist_lookup_string(nv, 637069f55e2SEric Schrock ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 638069f55e2SEric Schrock strcmp(aux, "external") == 0) 639069f55e2SEric Schrock vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 640069f55e2SEric Schrock } 6413d7072f8Seschrock } 642fa9e4066Sahrens } 643fa9e4066Sahrens 644fa9e4066Sahrens /* 645fa9e4066Sahrens * Add ourselves to the parent's list of children. 646fa9e4066Sahrens */ 647fa9e4066Sahrens vdev_add_child(parent, vd); 648fa9e4066Sahrens 64999653d4eSeschrock *vdp = vd; 65099653d4eSeschrock 65199653d4eSeschrock return (0); 652fa9e4066Sahrens } 653fa9e4066Sahrens 654fa9e4066Sahrens void 655fa9e4066Sahrens vdev_free(vdev_t *vd) 656fa9e4066Sahrens { 6573d7072f8Seschrock spa_t *spa = vd->vdev_spa; 658fa9e4066Sahrens 659fa9e4066Sahrens /* 660fa9e4066Sahrens * vdev_free() implies closing the vdev first. This is simpler than 661fa9e4066Sahrens * trying to ensure complicated semantics for all callers. 662fa9e4066Sahrens */ 663fa9e4066Sahrens vdev_close(vd); 664fa9e4066Sahrens 665e14bb325SJeff Bonwick ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 666b24ab676SJeff Bonwick ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 667fa9e4066Sahrens 668fa9e4066Sahrens /* 669fa9e4066Sahrens * Free all children. 
670fa9e4066Sahrens */ 671573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 672fa9e4066Sahrens vdev_free(vd->vdev_child[c]); 673fa9e4066Sahrens 674fa9e4066Sahrens ASSERT(vd->vdev_child == NULL); 675fa9e4066Sahrens ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 676fa9e4066Sahrens 677fa9e4066Sahrens /* 678fa9e4066Sahrens * Discard allocation state. 679fa9e4066Sahrens */ 680a1521560SJeff Bonwick if (vd->vdev_mg != NULL) { 681fa9e4066Sahrens vdev_metaslab_fini(vd); 682a1521560SJeff Bonwick metaslab_group_destroy(vd->vdev_mg); 683a1521560SJeff Bonwick } 684fa9e4066Sahrens 685fb09f5aaSMadhav Suresh ASSERT0(vd->vdev_stat.vs_space); 686fb09f5aaSMadhav Suresh ASSERT0(vd->vdev_stat.vs_dspace); 687fb09f5aaSMadhav Suresh ASSERT0(vd->vdev_stat.vs_alloc); 688fa9e4066Sahrens 689fa9e4066Sahrens /* 690fa9e4066Sahrens * Remove this vdev from its parent's child list. 691fa9e4066Sahrens */ 692fa9e4066Sahrens vdev_remove_child(vd->vdev_parent, vd); 693fa9e4066Sahrens 694fa9e4066Sahrens ASSERT(vd->vdev_parent == NULL); 695fa9e4066Sahrens 6963d7072f8Seschrock /* 6973d7072f8Seschrock * Clean up vdev structure. 
6983d7072f8Seschrock */ 6993d7072f8Seschrock vdev_queue_fini(vd); 7003d7072f8Seschrock vdev_cache_fini(vd); 7013d7072f8Seschrock 7023d7072f8Seschrock if (vd->vdev_path) 7033d7072f8Seschrock spa_strfree(vd->vdev_path); 7043d7072f8Seschrock if (vd->vdev_devid) 7053d7072f8Seschrock spa_strfree(vd->vdev_devid); 7063d7072f8Seschrock if (vd->vdev_physpath) 7073d7072f8Seschrock spa_strfree(vd->vdev_physpath); 7086809eb4eSEric Schrock if (vd->vdev_fru) 7096809eb4eSEric Schrock spa_strfree(vd->vdev_fru); 7103d7072f8Seschrock 7113d7072f8Seschrock if (vd->vdev_isspare) 7123d7072f8Seschrock spa_spare_remove(vd); 713fa94a07fSbrendan if (vd->vdev_isl2cache) 714fa94a07fSbrendan spa_l2cache_remove(vd); 7153d7072f8Seschrock 7163d7072f8Seschrock txg_list_destroy(&vd->vdev_ms_list); 7173d7072f8Seschrock txg_list_destroy(&vd->vdev_dtl_list); 7188ad4d6ddSJeff Bonwick 7193d7072f8Seschrock mutex_enter(&vd->vdev_dtl_lock); 7200713e232SGeorge Wilson space_map_close(vd->vdev_dtl_sm); 7218ad4d6ddSJeff Bonwick for (int t = 0; t < DTL_TYPES; t++) { 7220713e232SGeorge Wilson range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); 7230713e232SGeorge Wilson range_tree_destroy(vd->vdev_dtl[t]); 7248ad4d6ddSJeff Bonwick } 7253d7072f8Seschrock mutex_exit(&vd->vdev_dtl_lock); 7268ad4d6ddSJeff Bonwick 7275cabbc6bSPrashanth Sreenivasa EQUIV(vd->vdev_indirect_births != NULL, 7285cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_mapping != NULL); 7295cabbc6bSPrashanth Sreenivasa if (vd->vdev_indirect_births != NULL) { 7305cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 7315cabbc6bSPrashanth Sreenivasa vdev_indirect_births_close(vd->vdev_indirect_births); 7325cabbc6bSPrashanth Sreenivasa } 7335cabbc6bSPrashanth Sreenivasa 7345cabbc6bSPrashanth Sreenivasa if (vd->vdev_obsolete_sm != NULL) { 7355cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_removing || 7365cabbc6bSPrashanth Sreenivasa vd->vdev_ops == &vdev_indirect_ops); 7375cabbc6bSPrashanth Sreenivasa 
space_map_close(vd->vdev_obsolete_sm); 7385cabbc6bSPrashanth Sreenivasa vd->vdev_obsolete_sm = NULL; 7395cabbc6bSPrashanth Sreenivasa } 7405cabbc6bSPrashanth Sreenivasa range_tree_destroy(vd->vdev_obsolete_segments); 7415cabbc6bSPrashanth Sreenivasa rw_destroy(&vd->vdev_indirect_rwlock); 7425cabbc6bSPrashanth Sreenivasa mutex_destroy(&vd->vdev_obsolete_lock); 7435cabbc6bSPrashanth Sreenivasa 7440f7643c7SGeorge Wilson mutex_destroy(&vd->vdev_queue_lock); 7453d7072f8Seschrock mutex_destroy(&vd->vdev_dtl_lock); 7463d7072f8Seschrock mutex_destroy(&vd->vdev_stat_lock); 747e14bb325SJeff Bonwick mutex_destroy(&vd->vdev_probe_lock); 7483d7072f8Seschrock 7493d7072f8Seschrock if (vd == spa->spa_root_vdev) 7503d7072f8Seschrock spa->spa_root_vdev = NULL; 7513d7072f8Seschrock 7523d7072f8Seschrock kmem_free(vd, sizeof (vdev_t)); 753fa9e4066Sahrens } 754fa9e4066Sahrens 755fa9e4066Sahrens /* 756fa9e4066Sahrens * Transfer top-level vdev state from svd to tvd. 757fa9e4066Sahrens */ 758fa9e4066Sahrens static void 759fa9e4066Sahrens vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 760fa9e4066Sahrens { 761fa9e4066Sahrens spa_t *spa = svd->vdev_spa; 762fa9e4066Sahrens metaslab_t *msp; 763fa9e4066Sahrens vdev_t *vd; 764fa9e4066Sahrens int t; 765fa9e4066Sahrens 766fa9e4066Sahrens ASSERT(tvd == tvd->vdev_top); 767fa9e4066Sahrens 768fa9e4066Sahrens tvd->vdev_ms_array = svd->vdev_ms_array; 769fa9e4066Sahrens tvd->vdev_ms_shift = svd->vdev_ms_shift; 770fa9e4066Sahrens tvd->vdev_ms_count = svd->vdev_ms_count; 771215198a6SJoe Stein tvd->vdev_top_zap = svd->vdev_top_zap; 772fa9e4066Sahrens 773fa9e4066Sahrens svd->vdev_ms_array = 0; 774fa9e4066Sahrens svd->vdev_ms_shift = 0; 775fa9e4066Sahrens svd->vdev_ms_count = 0; 776215198a6SJoe Stein svd->vdev_top_zap = 0; 777fa9e4066Sahrens 778cd0837ccSGeorge Wilson if (tvd->vdev_mg) 779cd0837ccSGeorge Wilson ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); 780fa9e4066Sahrens tvd->vdev_mg = svd->vdev_mg; 781fa9e4066Sahrens tvd->vdev_ms = svd->vdev_ms; 
782fa9e4066Sahrens 783fa9e4066Sahrens svd->vdev_mg = NULL; 784fa9e4066Sahrens svd->vdev_ms = NULL; 785ecc2d604Sbonwick 786ecc2d604Sbonwick if (tvd->vdev_mg != NULL) 787ecc2d604Sbonwick tvd->vdev_mg->mg_vd = tvd; 788fa9e4066Sahrens 789fa9e4066Sahrens tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 790fa9e4066Sahrens tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 79199653d4eSeschrock tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 792fa9e4066Sahrens 793fa9e4066Sahrens svd->vdev_stat.vs_alloc = 0; 794fa9e4066Sahrens svd->vdev_stat.vs_space = 0; 79599653d4eSeschrock svd->vdev_stat.vs_dspace = 0; 796fa9e4066Sahrens 797fa9e4066Sahrens for (t = 0; t < TXG_SIZE; t++) { 798fa9e4066Sahrens while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 799fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 800fa9e4066Sahrens while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 801fa9e4066Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 802fa9e4066Sahrens if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 803fa9e4066Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 804fa9e4066Sahrens } 805fa9e4066Sahrens 806e14bb325SJeff Bonwick if (list_link_active(&svd->vdev_config_dirty_node)) { 807fa9e4066Sahrens vdev_config_clean(svd); 808fa9e4066Sahrens vdev_config_dirty(tvd); 809fa9e4066Sahrens } 810fa9e4066Sahrens 811e14bb325SJeff Bonwick if (list_link_active(&svd->vdev_state_dirty_node)) { 812e14bb325SJeff Bonwick vdev_state_clean(svd); 813e14bb325SJeff Bonwick vdev_state_dirty(tvd); 814e14bb325SJeff Bonwick } 815e14bb325SJeff Bonwick 81699653d4eSeschrock tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 81799653d4eSeschrock svd->vdev_deflate_ratio = 0; 8188654d025Sperrin 8198654d025Sperrin tvd->vdev_islog = svd->vdev_islog; 8208654d025Sperrin svd->vdev_islog = 0; 821fa9e4066Sahrens } 822fa9e4066Sahrens 823fa9e4066Sahrens static void 824fa9e4066Sahrens vdev_top_update(vdev_t *tvd, vdev_t *vd) 825fa9e4066Sahrens { 
826fa9e4066Sahrens if (vd == NULL) 827fa9e4066Sahrens return; 828fa9e4066Sahrens 829fa9e4066Sahrens vd->vdev_top = tvd; 830fa9e4066Sahrens 831573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 832fa9e4066Sahrens vdev_top_update(tvd, vd->vdev_child[c]); 833fa9e4066Sahrens } 834fa9e4066Sahrens 835fa9e4066Sahrens /* 836fa9e4066Sahrens * Add a mirror/replacing vdev above an existing vdev. 837fa9e4066Sahrens */ 838fa9e4066Sahrens vdev_t * 839fa9e4066Sahrens vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 840fa9e4066Sahrens { 841fa9e4066Sahrens spa_t *spa = cvd->vdev_spa; 842fa9e4066Sahrens vdev_t *pvd = cvd->vdev_parent; 843fa9e4066Sahrens vdev_t *mvd; 844fa9e4066Sahrens 845e14bb325SJeff Bonwick ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 846fa9e4066Sahrens 847fa9e4066Sahrens mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 848ecc2d604Sbonwick 849ecc2d604Sbonwick mvd->vdev_asize = cvd->vdev_asize; 850573ca77eSGeorge Wilson mvd->vdev_min_asize = cvd->vdev_min_asize; 8514263d13fSGeorge Wilson mvd->vdev_max_asize = cvd->vdev_max_asize; 8525cabbc6bSPrashanth Sreenivasa mvd->vdev_psize = cvd->vdev_psize; 853ecc2d604Sbonwick mvd->vdev_ashift = cvd->vdev_ashift; 854ecc2d604Sbonwick mvd->vdev_state = cvd->vdev_state; 85588ecc943SGeorge Wilson mvd->vdev_crtxg = cvd->vdev_crtxg; 856ecc2d604Sbonwick 857fa9e4066Sahrens vdev_remove_child(pvd, cvd); 858fa9e4066Sahrens vdev_add_child(pvd, mvd); 859fa9e4066Sahrens cvd->vdev_id = mvd->vdev_children; 860fa9e4066Sahrens vdev_add_child(mvd, cvd); 861fa9e4066Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 862fa9e4066Sahrens 863fa9e4066Sahrens if (mvd == mvd->vdev_top) 864fa9e4066Sahrens vdev_top_transfer(cvd, mvd); 865fa9e4066Sahrens 866fa9e4066Sahrens return (mvd); 867fa9e4066Sahrens } 868fa9e4066Sahrens 869fa9e4066Sahrens /* 870fa9e4066Sahrens * Remove a 1-way mirror/replacing vdev from the tree. 
871fa9e4066Sahrens */ 872fa9e4066Sahrens void 873fa9e4066Sahrens vdev_remove_parent(vdev_t *cvd) 874fa9e4066Sahrens { 875fa9e4066Sahrens vdev_t *mvd = cvd->vdev_parent; 876fa9e4066Sahrens vdev_t *pvd = mvd->vdev_parent; 877fa9e4066Sahrens 878e14bb325SJeff Bonwick ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 879fa9e4066Sahrens 880fa9e4066Sahrens ASSERT(mvd->vdev_children == 1); 881fa9e4066Sahrens ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 88299653d4eSeschrock mvd->vdev_ops == &vdev_replacing_ops || 88399653d4eSeschrock mvd->vdev_ops == &vdev_spare_ops); 884ecc2d604Sbonwick cvd->vdev_ashift = mvd->vdev_ashift; 885fa9e4066Sahrens 886fa9e4066Sahrens vdev_remove_child(mvd, cvd); 887fa9e4066Sahrens vdev_remove_child(pvd, mvd); 8888ad4d6ddSJeff Bonwick 88999653d4eSeschrock /* 890e14bb325SJeff Bonwick * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 891e14bb325SJeff Bonwick * Otherwise, we could have detached an offline device, and when we 892e14bb325SJeff Bonwick * go to import the pool we'll think we have two top-level vdevs, 893e14bb325SJeff Bonwick * instead of a different version of the same top-level vdev. 
89499653d4eSeschrock */ 8958ad4d6ddSJeff Bonwick if (mvd->vdev_top == mvd) { 8968ad4d6ddSJeff Bonwick uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 8971195e687SMark J Musante cvd->vdev_orig_guid = cvd->vdev_guid; 8988ad4d6ddSJeff Bonwick cvd->vdev_guid += guid_delta; 8998ad4d6ddSJeff Bonwick cvd->vdev_guid_sum += guid_delta; 9008ad4d6ddSJeff Bonwick } 901e14bb325SJeff Bonwick cvd->vdev_id = mvd->vdev_id; 902e14bb325SJeff Bonwick vdev_add_child(pvd, cvd); 903fa9e4066Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 904fa9e4066Sahrens 905fa9e4066Sahrens if (cvd == cvd->vdev_top) 906fa9e4066Sahrens vdev_top_transfer(mvd, cvd); 907fa9e4066Sahrens 908fa9e4066Sahrens ASSERT(mvd->vdev_children == 0); 909fa9e4066Sahrens vdev_free(mvd); 910fa9e4066Sahrens } 911fa9e4066Sahrens 912ea8dc4b6Seschrock int 913fa9e4066Sahrens vdev_metaslab_init(vdev_t *vd, uint64_t txg) 914fa9e4066Sahrens { 915fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 916ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 917ecc2d604Sbonwick uint64_t m; 918fa9e4066Sahrens uint64_t oldc = vd->vdev_ms_count; 919fa9e4066Sahrens uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 920ecc2d604Sbonwick metaslab_t **mspp; 921ecc2d604Sbonwick int error; 922fa9e4066Sahrens 923a1521560SJeff Bonwick ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 924a1521560SJeff Bonwick 92588ecc943SGeorge Wilson /* 92688ecc943SGeorge Wilson * This vdev is not being allocated from yet or is a hole. 
92788ecc943SGeorge Wilson */ 92888ecc943SGeorge Wilson if (vd->vdev_ms_shift == 0) 9290e34b6a7Sbonwick return (0); 9300e34b6a7Sbonwick 93188ecc943SGeorge Wilson ASSERT(!vd->vdev_ishole); 93288ecc943SGeorge Wilson 933fa9e4066Sahrens ASSERT(oldc <= newc); 934fa9e4066Sahrens 935ecc2d604Sbonwick mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 936fa9e4066Sahrens 937ecc2d604Sbonwick if (oldc != 0) { 938ecc2d604Sbonwick bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 939ecc2d604Sbonwick kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 940ecc2d604Sbonwick } 941fa9e4066Sahrens 942ecc2d604Sbonwick vd->vdev_ms = mspp; 943ecc2d604Sbonwick vd->vdev_ms_count = newc; 944fa9e4066Sahrens 945ecc2d604Sbonwick for (m = oldc; m < newc; m++) { 9460713e232SGeorge Wilson uint64_t object = 0; 9470713e232SGeorge Wilson 9485cabbc6bSPrashanth Sreenivasa /* 9495cabbc6bSPrashanth Sreenivasa * vdev_ms_array may be 0 if we are creating the "fake" 9505cabbc6bSPrashanth Sreenivasa * metaslabs for an indirect vdev for zdb's leak detection. 9515cabbc6bSPrashanth Sreenivasa * See zdb_leak_init(). 
9525cabbc6bSPrashanth Sreenivasa */ 9535cabbc6bSPrashanth Sreenivasa if (txg == 0 && vd->vdev_ms_array != 0) { 954ecc2d604Sbonwick error = dmu_read(mos, vd->vdev_ms_array, 9557bfdf011SNeil Perrin m * sizeof (uint64_t), sizeof (uint64_t), &object, 9567bfdf011SNeil Perrin DMU_READ_PREFETCH); 957*3ee8c80cSPavel Zakharov if (error != 0) { 958*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "unable to read the metaslab " 959*3ee8c80cSPavel Zakharov "array [error=%d]", error); 960ecc2d604Sbonwick return (error); 961*3ee8c80cSPavel Zakharov } 962fa9e4066Sahrens } 9631e9bd7ecSPrakash Surya 9641e9bd7ecSPrakash Surya error = metaslab_init(vd->vdev_mg, m, object, txg, 9651e9bd7ecSPrakash Surya &(vd->vdev_ms[m])); 966*3ee8c80cSPavel Zakharov if (error != 0) { 967*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", 968*3ee8c80cSPavel Zakharov error); 9691e9bd7ecSPrakash Surya return (error); 970*3ee8c80cSPavel Zakharov } 971fa9e4066Sahrens } 972fa9e4066Sahrens 973a1521560SJeff Bonwick if (txg == 0) 974a1521560SJeff Bonwick spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 975a1521560SJeff Bonwick 9763f9d6ad7SLin Ling /* 9773f9d6ad7SLin Ling * If the vdev is being removed we don't activate 9783f9d6ad7SLin Ling * the metaslabs since we want to ensure that no new 9793f9d6ad7SLin Ling * allocations are performed on this device. 
9803f9d6ad7SLin Ling */ 9813f9d6ad7SLin Ling if (oldc == 0 && !vd->vdev_removing) 982a1521560SJeff Bonwick metaslab_group_activate(vd->vdev_mg); 983a1521560SJeff Bonwick 984a1521560SJeff Bonwick if (txg == 0) 985a1521560SJeff Bonwick spa_config_exit(spa, SCL_ALLOC, FTAG); 986a1521560SJeff Bonwick 987ea8dc4b6Seschrock return (0); 988fa9e4066Sahrens } 989fa9e4066Sahrens 990fa9e4066Sahrens void 991fa9e4066Sahrens vdev_metaslab_fini(vdev_t *vd) 992fa9e4066Sahrens { 993fa9e4066Sahrens if (vd->vdev_ms != NULL) { 9945cabbc6bSPrashanth Sreenivasa uint64_t count = vd->vdev_ms_count; 9955cabbc6bSPrashanth Sreenivasa 996a1521560SJeff Bonwick metaslab_group_passivate(vd->vdev_mg); 9975cabbc6bSPrashanth Sreenivasa for (uint64_t m = 0; m < count; m++) { 9980713e232SGeorge Wilson metaslab_t *msp = vd->vdev_ms[m]; 9990713e232SGeorge Wilson 10000713e232SGeorge Wilson if (msp != NULL) 10010713e232SGeorge Wilson metaslab_fini(msp); 10020713e232SGeorge Wilson } 1003fa9e4066Sahrens kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 1004fa9e4066Sahrens vd->vdev_ms = NULL; 10055cabbc6bSPrashanth Sreenivasa 10065cabbc6bSPrashanth Sreenivasa vd->vdev_ms_count = 0; 1007fa9e4066Sahrens } 10085cabbc6bSPrashanth Sreenivasa ASSERT0(vd->vdev_ms_count); 1009fa9e4066Sahrens } 1010fa9e4066Sahrens 1011e14bb325SJeff Bonwick typedef struct vdev_probe_stats { 1012e14bb325SJeff Bonwick boolean_t vps_readable; 1013e14bb325SJeff Bonwick boolean_t vps_writeable; 1014e14bb325SJeff Bonwick int vps_flags; 1015e14bb325SJeff Bonwick } vdev_probe_stats_t; 1016e14bb325SJeff Bonwick 1017e14bb325SJeff Bonwick static void 1018e14bb325SJeff Bonwick vdev_probe_done(zio_t *zio) 10190a4e9518Sgw { 10208ad4d6ddSJeff Bonwick spa_t *spa = zio->io_spa; 1021a3f829aeSBill Moore vdev_t *vd = zio->io_vd; 1022e14bb325SJeff Bonwick vdev_probe_stats_t *vps = zio->io_private; 1023a3f829aeSBill Moore 1024a3f829aeSBill Moore ASSERT(vd->vdev_probe_zio != NULL); 1025e14bb325SJeff Bonwick 1026e14bb325SJeff Bonwick if (zio->io_type == 
ZIO_TYPE_READ) { 1027e14bb325SJeff Bonwick if (zio->io_error == 0) 1028e14bb325SJeff Bonwick vps->vps_readable = 1; 10298ad4d6ddSJeff Bonwick if (zio->io_error == 0 && spa_writeable(spa)) { 1030a3f829aeSBill Moore zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 1031770499e1SDan Kimmel zio->io_offset, zio->io_size, zio->io_abd, 1032e14bb325SJeff Bonwick ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1033e14bb325SJeff Bonwick ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 1034e14bb325SJeff Bonwick } else { 1035770499e1SDan Kimmel abd_free(zio->io_abd); 1036e14bb325SJeff Bonwick } 1037e14bb325SJeff Bonwick } else if (zio->io_type == ZIO_TYPE_WRITE) { 1038e14bb325SJeff Bonwick if (zio->io_error == 0) 1039e14bb325SJeff Bonwick vps->vps_writeable = 1; 1040770499e1SDan Kimmel abd_free(zio->io_abd); 1041e14bb325SJeff Bonwick } else if (zio->io_type == ZIO_TYPE_NULL) { 1042a3f829aeSBill Moore zio_t *pio; 1043e14bb325SJeff Bonwick 1044e14bb325SJeff Bonwick vd->vdev_cant_read |= !vps->vps_readable; 1045e14bb325SJeff Bonwick vd->vdev_cant_write |= !vps->vps_writeable; 1046e14bb325SJeff Bonwick 1047e14bb325SJeff Bonwick if (vdev_readable(vd) && 10488ad4d6ddSJeff Bonwick (vdev_writeable(vd) || !spa_writeable(spa))) { 1049e14bb325SJeff Bonwick zio->io_error = 0; 1050e14bb325SJeff Bonwick } else { 1051e14bb325SJeff Bonwick ASSERT(zio->io_error != 0); 1052*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "failed probe"); 1053e14bb325SJeff Bonwick zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 10548ad4d6ddSJeff Bonwick spa, vd, NULL, 0, 0); 1055be6fd75aSMatthew Ahrens zio->io_error = SET_ERROR(ENXIO); 1056e14bb325SJeff Bonwick } 1057a3f829aeSBill Moore 1058a3f829aeSBill Moore mutex_enter(&vd->vdev_probe_lock); 1059a3f829aeSBill Moore ASSERT(vd->vdev_probe_zio == zio); 1060a3f829aeSBill Moore vd->vdev_probe_zio = NULL; 1061a3f829aeSBill Moore mutex_exit(&vd->vdev_probe_lock); 1062a3f829aeSBill Moore 10630f7643c7SGeorge Wilson zio_link_t *zl = NULL; 10640f7643c7SGeorge Wilson while ((pio = 
zio_walk_parents(zio, &zl)) != NULL) 1065a3f829aeSBill Moore if (!vdev_accessible(vd, pio)) 1066be6fd75aSMatthew Ahrens pio->io_error = SET_ERROR(ENXIO); 1067a3f829aeSBill Moore 1068e14bb325SJeff Bonwick kmem_free(vps, sizeof (*vps)); 1069e14bb325SJeff Bonwick } 1070e14bb325SJeff Bonwick } 10710a4e9518Sgw 1072e14bb325SJeff Bonwick /* 1073f7170741SWill Andrews * Determine whether this device is accessible. 1074f7170741SWill Andrews * 1075f7170741SWill Andrews * Read and write to several known locations: the pad regions of each 1076f7170741SWill Andrews * vdev label but the first, which we leave alone in case it contains 1077f7170741SWill Andrews * a VTOC. 1078e14bb325SJeff Bonwick */ 1079e14bb325SJeff Bonwick zio_t * 1080a3f829aeSBill Moore vdev_probe(vdev_t *vd, zio_t *zio) 1081e14bb325SJeff Bonwick { 1082e14bb325SJeff Bonwick spa_t *spa = vd->vdev_spa; 1083a3f829aeSBill Moore vdev_probe_stats_t *vps = NULL; 1084a3f829aeSBill Moore zio_t *pio; 1085a3f829aeSBill Moore 1086a3f829aeSBill Moore ASSERT(vd->vdev_ops->vdev_op_leaf); 10870a4e9518Sgw 1088a3f829aeSBill Moore /* 1089a3f829aeSBill Moore * Don't probe the probe. 1090a3f829aeSBill Moore */ 1091a3f829aeSBill Moore if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 1092a3f829aeSBill Moore return (NULL); 1093e14bb325SJeff Bonwick 1094a3f829aeSBill Moore /* 1095a3f829aeSBill Moore * To prevent 'probe storms' when a device fails, we create 1096a3f829aeSBill Moore * just one probe i/o at a time. All zios that want to probe 1097a3f829aeSBill Moore * this vdev will become parents of the probe io. 
1098a3f829aeSBill Moore */ 1099a3f829aeSBill Moore mutex_enter(&vd->vdev_probe_lock); 1100e14bb325SJeff Bonwick 1101a3f829aeSBill Moore if ((pio = vd->vdev_probe_zio) == NULL) { 1102a3f829aeSBill Moore vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 1103a3f829aeSBill Moore 1104a3f829aeSBill Moore vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 1105a3f829aeSBill Moore ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | 11068956713aSEric Schrock ZIO_FLAG_TRYHARD; 1107a3f829aeSBill Moore 1108a3f829aeSBill Moore if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 1109a3f829aeSBill Moore /* 1110a3f829aeSBill Moore * vdev_cant_read and vdev_cant_write can only 1111a3f829aeSBill Moore * transition from TRUE to FALSE when we have the 1112a3f829aeSBill Moore * SCL_ZIO lock as writer; otherwise they can only 1113a3f829aeSBill Moore * transition from FALSE to TRUE. This ensures that 1114a3f829aeSBill Moore * any zio looking at these values can assume that 1115a3f829aeSBill Moore * failures persist for the life of the I/O. That's 1116a3f829aeSBill Moore * important because when a device has intermittent 1117a3f829aeSBill Moore * connectivity problems, we want to ensure that 1118a3f829aeSBill Moore * they're ascribed to the device (ENXIO) and not 1119a3f829aeSBill Moore * the zio (EIO). 1120a3f829aeSBill Moore * 1121a3f829aeSBill Moore * Since we hold SCL_ZIO as writer here, clear both 1122a3f829aeSBill Moore * values so the probe can reevaluate from first 1123a3f829aeSBill Moore * principles. 
1124a3f829aeSBill Moore */ 1125a3f829aeSBill Moore vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 1126a3f829aeSBill Moore vd->vdev_cant_read = B_FALSE; 1127a3f829aeSBill Moore vd->vdev_cant_write = B_FALSE; 1128a3f829aeSBill Moore } 1129a3f829aeSBill Moore 1130a3f829aeSBill Moore vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1131a3f829aeSBill Moore vdev_probe_done, vps, 1132a3f829aeSBill Moore vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); 1133a3f829aeSBill Moore 113498d1cbfeSGeorge Wilson /* 113598d1cbfeSGeorge Wilson * We can't change the vdev state in this context, so we 113698d1cbfeSGeorge Wilson * kick off an async task to do it on our behalf. 113798d1cbfeSGeorge Wilson */ 1138a3f829aeSBill Moore if (zio != NULL) { 1139a3f829aeSBill Moore vd->vdev_probe_wanted = B_TRUE; 1140a3f829aeSBill Moore spa_async_request(spa, SPA_ASYNC_PROBE); 1141a3f829aeSBill Moore } 1142e14bb325SJeff Bonwick } 1143e14bb325SJeff Bonwick 1144a3f829aeSBill Moore if (zio != NULL) 1145a3f829aeSBill Moore zio_add_child(zio, pio); 1146e14bb325SJeff Bonwick 1147a3f829aeSBill Moore mutex_exit(&vd->vdev_probe_lock); 1148e14bb325SJeff Bonwick 1149a3f829aeSBill Moore if (vps == NULL) { 1150a3f829aeSBill Moore ASSERT(zio != NULL); 1151a3f829aeSBill Moore return (NULL); 1152a3f829aeSBill Moore } 1153e14bb325SJeff Bonwick 1154e14bb325SJeff Bonwick for (int l = 1; l < VDEV_LABELS; l++) { 1155a3f829aeSBill Moore zio_nowait(zio_read_phys(pio, vd, 1156e14bb325SJeff Bonwick vdev_label_offset(vd->vdev_psize, l, 1157770499e1SDan Kimmel offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, 1158770499e1SDan Kimmel abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), 1159e14bb325SJeff Bonwick ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1160e14bb325SJeff Bonwick ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1161e14bb325SJeff Bonwick } 1162e14bb325SJeff Bonwick 1163a3f829aeSBill Moore if (zio == NULL) 1164a3f829aeSBill Moore return (pio); 1165a3f829aeSBill Moore 1166a3f829aeSBill Moore zio_nowait(pio); 1167a3f829aeSBill Moore 
return (NULL); 11680a4e9518Sgw } 11690a4e9518Sgw 1170f64c0e34SEric Taylor static void 1171f64c0e34SEric Taylor vdev_open_child(void *arg) 1172f64c0e34SEric Taylor { 1173f64c0e34SEric Taylor vdev_t *vd = arg; 1174f64c0e34SEric Taylor 1175f64c0e34SEric Taylor vd->vdev_open_thread = curthread; 1176f64c0e34SEric Taylor vd->vdev_open_error = vdev_open(vd); 1177f64c0e34SEric Taylor vd->vdev_open_thread = NULL; 1178f64c0e34SEric Taylor } 1179f64c0e34SEric Taylor 1180681d9761SEric Taylor boolean_t 1181681d9761SEric Taylor vdev_uses_zvols(vdev_t *vd) 1182681d9761SEric Taylor { 1183681d9761SEric Taylor if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, 1184681d9761SEric Taylor strlen(ZVOL_DIR)) == 0) 1185681d9761SEric Taylor return (B_TRUE); 1186681d9761SEric Taylor for (int c = 0; c < vd->vdev_children; c++) 1187681d9761SEric Taylor if (vdev_uses_zvols(vd->vdev_child[c])) 1188681d9761SEric Taylor return (B_TRUE); 1189681d9761SEric Taylor return (B_FALSE); 1190681d9761SEric Taylor } 1191681d9761SEric Taylor 1192f64c0e34SEric Taylor void 1193f64c0e34SEric Taylor vdev_open_children(vdev_t *vd) 1194f64c0e34SEric Taylor { 1195f64c0e34SEric Taylor taskq_t *tq; 1196f64c0e34SEric Taylor int children = vd->vdev_children; 1197f64c0e34SEric Taylor 1198681d9761SEric Taylor /* 1199681d9761SEric Taylor * in order to handle pools on top of zvols, do the opens 1200681d9761SEric Taylor * in a single thread so that the same thread holds the 1201681d9761SEric Taylor * spa_namespace_lock 1202681d9761SEric Taylor */ 1203681d9761SEric Taylor if (vdev_uses_zvols(vd)) { 1204681d9761SEric Taylor for (int c = 0; c < children; c++) 1205681d9761SEric Taylor vd->vdev_child[c]->vdev_open_error = 1206681d9761SEric Taylor vdev_open(vd->vdev_child[c]); 1207681d9761SEric Taylor return; 1208681d9761SEric Taylor } 1209f64c0e34SEric Taylor tq = taskq_create("vdev_open", children, minclsyspri, 1210f64c0e34SEric Taylor children, children, TASKQ_PREPOPULATE); 1211f64c0e34SEric Taylor 1212f64c0e34SEric Taylor 
for (int c = 0; c < children; c++) 1213f64c0e34SEric Taylor VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1214f64c0e34SEric Taylor TQ_SLEEP) != NULL); 1215f64c0e34SEric Taylor 1216f64c0e34SEric Taylor taskq_destroy(tq); 1217f64c0e34SEric Taylor } 1218f64c0e34SEric Taylor 12195cabbc6bSPrashanth Sreenivasa /* 12205cabbc6bSPrashanth Sreenivasa * Compute the raidz-deflation ratio. Note, we hard-code 12215cabbc6bSPrashanth Sreenivasa * in 128k (1 << 17) because it is the "typical" blocksize. 12225cabbc6bSPrashanth Sreenivasa * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, 12235cabbc6bSPrashanth Sreenivasa * otherwise it would inconsistently account for existing bp's. 12245cabbc6bSPrashanth Sreenivasa */ 12255cabbc6bSPrashanth Sreenivasa static void 12265cabbc6bSPrashanth Sreenivasa vdev_set_deflate_ratio(vdev_t *vd) 12275cabbc6bSPrashanth Sreenivasa { 12285cabbc6bSPrashanth Sreenivasa if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { 12295cabbc6bSPrashanth Sreenivasa vd->vdev_deflate_ratio = (1 << 17) / 12305cabbc6bSPrashanth Sreenivasa (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 12315cabbc6bSPrashanth Sreenivasa } 12325cabbc6bSPrashanth Sreenivasa } 12335cabbc6bSPrashanth Sreenivasa 1234fa9e4066Sahrens /* 1235fa9e4066Sahrens * Prepare a virtual device for access. 
1236fa9e4066Sahrens */ 1237fa9e4066Sahrens int 1238fa9e4066Sahrens vdev_open(vdev_t *vd) 1239fa9e4066Sahrens { 12408ad4d6ddSJeff Bonwick spa_t *spa = vd->vdev_spa; 1241fa9e4066Sahrens int error; 1242fa9e4066Sahrens uint64_t osize = 0; 12434263d13fSGeorge Wilson uint64_t max_osize = 0; 12444263d13fSGeorge Wilson uint64_t asize, max_asize, psize; 1245ecc2d604Sbonwick uint64_t ashift = 0; 1246fa9e4066Sahrens 1247f64c0e34SEric Taylor ASSERT(vd->vdev_open_thread == curthread || 1248f64c0e34SEric Taylor spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1249fa9e4066Sahrens ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 1250fa9e4066Sahrens vd->vdev_state == VDEV_STATE_CANT_OPEN || 1251fa9e4066Sahrens vd->vdev_state == VDEV_STATE_OFFLINE); 1252fa9e4066Sahrens 1253fa9e4066Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1254e6ca193dSGeorge Wilson vd->vdev_cant_read = B_FALSE; 1255e6ca193dSGeorge Wilson vd->vdev_cant_write = B_FALSE; 1256573ca77eSGeorge Wilson vd->vdev_min_asize = vdev_get_min_asize(vd); 1257fa9e4066Sahrens 1258069f55e2SEric Schrock /* 1259069f55e2SEric Schrock * If this vdev is not removed, check its fault status. If it's 1260069f55e2SEric Schrock * faulted, bail out of the open. 
1261069f55e2SEric Schrock */ 12623d7072f8Seschrock if (!vd->vdev_removed && vd->vdev_faulted) { 12633d7072f8Seschrock ASSERT(vd->vdev_children == 0); 1264069f55e2SEric Schrock ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1265069f55e2SEric Schrock vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 12663d7072f8Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1267069f55e2SEric Schrock vd->vdev_label_aux); 1268be6fd75aSMatthew Ahrens return (SET_ERROR(ENXIO)); 12693d7072f8Seschrock } else if (vd->vdev_offline) { 1270fa9e4066Sahrens ASSERT(vd->vdev_children == 0); 1271ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1272be6fd75aSMatthew Ahrens return (SET_ERROR(ENXIO)); 1273fa9e4066Sahrens } 1274fa9e4066Sahrens 12754263d13fSGeorge Wilson error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); 1276fa9e4066Sahrens 1277095bcd66SGeorge Wilson /* 1278095bcd66SGeorge Wilson * Reset the vdev_reopening flag so that we actually close 1279095bcd66SGeorge Wilson * the vdev on error. 1280095bcd66SGeorge Wilson */ 1281095bcd66SGeorge Wilson vd->vdev_reopening = B_FALSE; 1282ea8dc4b6Seschrock if (zio_injection_enabled && error == 0) 12838956713aSEric Schrock error = zio_handle_device_injection(vd, NULL, ENXIO); 1284ea8dc4b6Seschrock 1285fa9e4066Sahrens if (error) { 12863d7072f8Seschrock if (vd->vdev_removed && 12873d7072f8Seschrock vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 12883d7072f8Seschrock vd->vdev_removed = B_FALSE; 12893d7072f8Seschrock 1290ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1291fa9e4066Sahrens vd->vdev_stat.vs_aux); 1292fa9e4066Sahrens return (error); 1293fa9e4066Sahrens } 1294fa9e4066Sahrens 12953d7072f8Seschrock vd->vdev_removed = B_FALSE; 12963d7072f8Seschrock 1297096d22d4SEric Schrock /* 1298096d22d4SEric Schrock * Recheck the faulted flag now that we have confirmed that 1299096d22d4SEric Schrock * the vdev is accessible. If we're faulted, bail. 
1300096d22d4SEric Schrock */ 1301096d22d4SEric Schrock if (vd->vdev_faulted) { 1302096d22d4SEric Schrock ASSERT(vd->vdev_children == 0); 1303096d22d4SEric Schrock ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1304096d22d4SEric Schrock vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1305096d22d4SEric Schrock vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1306096d22d4SEric Schrock vd->vdev_label_aux); 1307be6fd75aSMatthew Ahrens return (SET_ERROR(ENXIO)); 1308096d22d4SEric Schrock } 1309096d22d4SEric Schrock 13103d7072f8Seschrock if (vd->vdev_degraded) { 13113d7072f8Seschrock ASSERT(vd->vdev_children == 0); 13123d7072f8Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 13133d7072f8Seschrock VDEV_AUX_ERR_EXCEEDED); 13143d7072f8Seschrock } else { 1315069f55e2SEric Schrock vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 13163d7072f8Seschrock } 1317fa9e4066Sahrens 131888ecc943SGeorge Wilson /* 131988ecc943SGeorge Wilson * For hole or missing vdevs we just return success. 132088ecc943SGeorge Wilson */ 132188ecc943SGeorge Wilson if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) 132288ecc943SGeorge Wilson return (0); 132388ecc943SGeorge Wilson 1324573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) { 1325ea8dc4b6Seschrock if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1326ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1327ea8dc4b6Seschrock VDEV_AUX_NONE); 1328ea8dc4b6Seschrock break; 1329ea8dc4b6Seschrock } 1330573ca77eSGeorge Wilson } 1331fa9e4066Sahrens 1332fa9e4066Sahrens osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 13334263d13fSGeorge Wilson max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); 1334fa9e4066Sahrens 1335fa9e4066Sahrens if (vd->vdev_children == 0) { 1336fa9e4066Sahrens if (osize < SPA_MINDEVSIZE) { 1337ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1338ea8dc4b6Seschrock VDEV_AUX_TOO_SMALL); 1339be6fd75aSMatthew Ahrens return (SET_ERROR(EOVERFLOW)); 
1340fa9e4066Sahrens } 1341fa9e4066Sahrens psize = osize; 1342fa9e4066Sahrens asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 13434263d13fSGeorge Wilson max_asize = max_osize - (VDEV_LABEL_START_SIZE + 13444263d13fSGeorge Wilson VDEV_LABEL_END_SIZE); 1345fa9e4066Sahrens } else { 1346ecc2d604Sbonwick if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1347fa9e4066Sahrens (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1348ea8dc4b6Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1349ea8dc4b6Seschrock VDEV_AUX_TOO_SMALL); 1350be6fd75aSMatthew Ahrens return (SET_ERROR(EOVERFLOW)); 1351fa9e4066Sahrens } 1352fa9e4066Sahrens psize = 0; 1353fa9e4066Sahrens asize = osize; 13544263d13fSGeorge Wilson max_asize = max_osize; 1355fa9e4066Sahrens } 1356fa9e4066Sahrens 1357fa9e4066Sahrens vd->vdev_psize = psize; 1358fa9e4066Sahrens 1359573ca77eSGeorge Wilson /* 1360c040c10cSSteven Hartland * Make sure the allocatable size hasn't shrunk too much. 1361573ca77eSGeorge Wilson */ 1362573ca77eSGeorge Wilson if (asize < vd->vdev_min_asize) { 1363573ca77eSGeorge Wilson vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1364573ca77eSGeorge Wilson VDEV_AUX_BAD_LABEL); 1365be6fd75aSMatthew Ahrens return (SET_ERROR(EINVAL)); 1366573ca77eSGeorge Wilson } 1367573ca77eSGeorge Wilson 1368fa9e4066Sahrens if (vd->vdev_asize == 0) { 1369fa9e4066Sahrens /* 1370fa9e4066Sahrens * This is the first-ever open, so use the computed values. 1371ecc2d604Sbonwick * For testing purposes, a higher ashift can be requested. 1372fa9e4066Sahrens */ 1373fa9e4066Sahrens vd->vdev_asize = asize; 13744263d13fSGeorge Wilson vd->vdev_max_asize = max_asize; 1375ecc2d604Sbonwick vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 1376fa9e4066Sahrens } else { 1377fa9e4066Sahrens /* 13782384d9f8SGeorge Wilson * Detect if the alignment requirement has increased. 13792384d9f8SGeorge Wilson * We don't want to make the pool unavailable, just 13802384d9f8SGeorge Wilson * issue a warning instead.
1381fa9e4066Sahrens */ 13822384d9f8SGeorge Wilson if (ashift > vd->vdev_top->vdev_ashift && 13832384d9f8SGeorge Wilson vd->vdev_ops->vdev_op_leaf) { 13842384d9f8SGeorge Wilson cmn_err(CE_WARN, 13852384d9f8SGeorge Wilson "Disk, '%s', has a block alignment that is " 13862384d9f8SGeorge Wilson "larger than the pool's alignment\n", 13872384d9f8SGeorge Wilson vd->vdev_path); 1388fa9e4066Sahrens } 13894263d13fSGeorge Wilson vd->vdev_max_asize = max_asize; 1390573ca77eSGeorge Wilson } 1391fa9e4066Sahrens 1392573ca77eSGeorge Wilson /* 1393c040c10cSSteven Hartland * If all children are healthy we update asize if either: 1394c040c10cSSteven Hartland * The asize has increased, due to a device expansion caused by dynamic 1395c040c10cSSteven Hartland * LUN growth or vdev replacement, and automatic expansion is enabled; 1396c040c10cSSteven Hartland * making the additional space available. 1397c040c10cSSteven Hartland * 1398c040c10cSSteven Hartland * The asize has decreased, due to a device shrink usually caused by a 1399c040c10cSSteven Hartland * vdev replace with a smaller device. This ensures that calculations 1400c040c10cSSteven Hartland * based of max_asize and asize e.g. esize are always valid. It's safe 1401c040c10cSSteven Hartland * to do this as we've already validated that asize is greater than 1402c040c10cSSteven Hartland * vdev_min_asize. 1403573ca77eSGeorge Wilson */ 1404c040c10cSSteven Hartland if (vd->vdev_state == VDEV_STATE_HEALTHY && 1405c040c10cSSteven Hartland ((asize > vd->vdev_asize && 1406c040c10cSSteven Hartland (vd->vdev_expanding || spa->spa_autoexpand)) || 1407c040c10cSSteven Hartland (asize < vd->vdev_asize))) 1408573ca77eSGeorge Wilson vd->vdev_asize = asize; 1409fa9e4066Sahrens 1410573ca77eSGeorge Wilson vdev_set_min_asize(vd); 1411fa9e4066Sahrens 14120a4e9518Sgw /* 14130a4e9518Sgw * Ensure we can issue some IO before declaring the 14140a4e9518Sgw * vdev open for business.
14150a4e9518Sgw */ 1416e14bb325SJeff Bonwick if (vd->vdev_ops->vdev_op_leaf && 1417e14bb325SJeff Bonwick (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 141898d1cbfeSGeorge Wilson vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 141998d1cbfeSGeorge Wilson VDEV_AUX_ERR_EXCEEDED); 14200a4e9518Sgw return (error); 14210a4e9518Sgw } 14220a4e9518Sgw 14235cabbc6bSPrashanth Sreenivasa if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 14245cabbc6bSPrashanth Sreenivasa !vd->vdev_isl2cache && !vd->vdev_islog) { 14255cabbc6bSPrashanth Sreenivasa if (vd->vdev_ashift > spa->spa_max_ashift) 14265cabbc6bSPrashanth Sreenivasa spa->spa_max_ashift = vd->vdev_ashift; 14275cabbc6bSPrashanth Sreenivasa if (vd->vdev_ashift < spa->spa_min_ashift) 14285cabbc6bSPrashanth Sreenivasa spa->spa_min_ashift = vd->vdev_ashift; 14295cabbc6bSPrashanth Sreenivasa } 14305cabbc6bSPrashanth Sreenivasa
/*
 * NOTE(review): the block below appears to duplicate the min/max ashift
 * tracking just above -- both update spa_max_ashift/spa_min_ashift for a
 * top-level vdev with a nonzero ashift, differing only in the exclusion
 * predicate (!isl2cache && !islog vs. !islog && aux == NULL).  This looks
 * like a merge artifact; confirm which predicate is intended and remove
 * the redundant copy.
 */
143181cd5c55SMatthew Ahrens /* 143281cd5c55SMatthew Ahrens * Track the min and max ashift values for normal data devices. 143381cd5c55SMatthew Ahrens */ 143481cd5c55SMatthew Ahrens if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 143581cd5c55SMatthew Ahrens !vd->vdev_islog && vd->vdev_aux == NULL) { 143681cd5c55SMatthew Ahrens if (vd->vdev_ashift > spa->spa_max_ashift) 143781cd5c55SMatthew Ahrens spa->spa_max_ashift = vd->vdev_ashift; 143881cd5c55SMatthew Ahrens if (vd->vdev_ashift < spa->spa_min_ashift) 143981cd5c55SMatthew Ahrens spa->spa_min_ashift = vd->vdev_ashift; 144081cd5c55SMatthew Ahrens } 144181cd5c55SMatthew Ahrens 1442088f3894Sahrens /* 1443088f3894Sahrens * If a leaf vdev has a DTL, and seems healthy, then kick off a 14448ad4d6ddSJeff Bonwick * resilver. But don't do this if we are doing a reopen for a scrub, 14458ad4d6ddSJeff Bonwick * since this would just restart the scrub we are already doing.
1446088f3894Sahrens */ 14478ad4d6ddSJeff Bonwick if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 14488ad4d6ddSJeff Bonwick vdev_resilver_needed(vd, NULL, NULL)) 14498ad4d6ddSJeff Bonwick spa_async_request(spa, SPA_ASYNC_RESILVER); 1450088f3894Sahrens 1451fa9e4066Sahrens return (0); 1452fa9e4066Sahrens } 1453fa9e4066Sahrens 1454560e6e96Seschrock /* 1455560e6e96Seschrock * Called once the vdevs are all opened, this routine validates the label 1456560e6e96Seschrock * contents. This needs to be done before vdev_load() so that we don't 14573d7072f8Seschrock * inadvertently do repair I/Os to the wrong device. 1458560e6e96Seschrock * 1459d7f601efSGeorge Wilson * If 'strict' is false ignore the spa guid check. This is necessary because 1460d7f601efSGeorge Wilson * if the machine crashed during a re-guid the new guid might have been written 1461d7f601efSGeorge Wilson * to all of the vdev labels, but not the cached config. The strict check 1462d7f601efSGeorge Wilson * will be performed when the pool is opened again using the mos config. 1463d7f601efSGeorge Wilson * 1464560e6e96Seschrock * This function will only return failure if one of the vdevs indicates that it 1465560e6e96Seschrock * has since been destroyed or exported. This is only possible if 1466560e6e96Seschrock * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1467560e6e96Seschrock * will be updated but the function will return 0.
1468560e6e96Seschrock */ 1469560e6e96Seschrock int 1470d7f601efSGeorge Wilson vdev_validate(vdev_t *vd, boolean_t strict) 1471560e6e96Seschrock { 1472560e6e96Seschrock spa_t *spa = vd->vdev_spa; 1473560e6e96Seschrock nvlist_t *label; 14741195e687SMark J Musante uint64_t guid = 0, top_guid; 1475560e6e96Seschrock uint64_t state; 1476560e6e96Seschrock 1477573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 1478d7f601efSGeorge Wilson if (vdev_validate(vd->vdev_child[c], strict) != 0) 1479be6fd75aSMatthew Ahrens return (SET_ERROR(EBADF)); 1480560e6e96Seschrock 1481b5989ec7Seschrock /* 1482b5989ec7Seschrock * If the device has already failed, or was marked offline, don't do 1483b5989ec7Seschrock * any further validation. Otherwise, label I/O will fail and we will 1484b5989ec7Seschrock * overwrite the previous state. 1485b5989ec7Seschrock */ 1486e14bb325SJeff Bonwick if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 14871195e687SMark J Musante uint64_t aux_guid = 0; 14881195e687SMark J Musante nvlist_t *nvl;
/*
 * Read the label as of the last synced txg when one exists;
 * -1ULL presumably selects the best available label txg --
 * confirm against vdev_label_read_config().
 */
1489bda88194SGeorge Wilson uint64_t txg = spa_last_synced_txg(spa) != 0 ? 1490bda88194SGeorge Wilson spa_last_synced_txg(spa) : -1ULL; 1491560e6e96Seschrock 1492dfbb9432SGeorge Wilson if ((label = vdev_label_read_config(vd, txg)) == NULL) { 1493560e6e96Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1494560e6e96Seschrock VDEV_AUX_BAD_LABEL); 1495*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_validate: failed reading config"); 1496560e6e96Seschrock return (0); 1497560e6e96Seschrock } 1498560e6e96Seschrock 14991195e687SMark J Musante /* 15001195e687SMark J Musante * Determine if this vdev has been split off into another 15011195e687SMark J Musante * pool. If so, then refuse to open it.
15021195e687SMark J Musante */ 15031195e687SMark J Musante if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 15041195e687SMark J Musante &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 15051195e687SMark J Musante vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 15061195e687SMark J Musante VDEV_AUX_SPLIT_POOL); 15071195e687SMark J Musante nvlist_free(label); 1508*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_validate: vdev split into other " 1509*3ee8c80cSPavel Zakharov "pool"); 15101195e687SMark J Musante return (0); 15111195e687SMark J Musante } 15121195e687SMark J Musante 1513d7f601efSGeorge Wilson if (strict && (nvlist_lookup_uint64(label, 1514d7f601efSGeorge Wilson ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || 1515d7f601efSGeorge Wilson guid != spa_guid(spa))) { 1516560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1517560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1518560e6e96Seschrock nvlist_free(label); 1519*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid " 1520*3ee8c80cSPavel Zakharov "doesn't match config (%llu != %llu)", 1521*3ee8c80cSPavel Zakharov (u_longlong_t)guid, 1522*3ee8c80cSPavel Zakharov (u_longlong_t)spa_guid(spa)); 1523560e6e96Seschrock return (0); 1524560e6e96Seschrock } 1525560e6e96Seschrock 15261195e687SMark J Musante if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 15271195e687SMark J Musante != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 15281195e687SMark J Musante &aux_guid) != 0) 15291195e687SMark J Musante aux_guid = 0; 15301195e687SMark J Musante 1531e14bb325SJeff Bonwick /* 1532e14bb325SJeff Bonwick * If this vdev just became a top-level vdev because its 1533e14bb325SJeff Bonwick * sibling was detached, it will have adopted the parent's 1534e14bb325SJeff Bonwick * vdev guid -- but the label may or may not be on disk yet.
1535e14bb325SJeff Bonwick * Fortunately, either version of the label will have the 1536e14bb325SJeff Bonwick * same top guid, so if we're a top-level vdev, we can 1537e14bb325SJeff Bonwick * safely compare to that instead. 15381195e687SMark J Musante * 15391195e687SMark J Musante * If we split this vdev off instead, then we also check the 15401195e687SMark J Musante * original pool's guid. We don't want to consider the vdev 15411195e687SMark J Musante * corrupt if it is partway through a split operation. 1542e14bb325SJeff Bonwick */ 1543560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 1544e14bb325SJeff Bonwick &guid) != 0 || 1545e14bb325SJeff Bonwick nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, 1546e14bb325SJeff Bonwick &top_guid) != 0 || 15471195e687SMark J Musante ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && 1548e14bb325SJeff Bonwick (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { 1549560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1550560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1551560e6e96Seschrock nvlist_free(label); 1552*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_validate: config guid doesn't " 1553*3ee8c80cSPavel Zakharov "match label guid (%llu != %llu)", 1554*3ee8c80cSPavel Zakharov (u_longlong_t)vd->vdev_guid, (u_longlong_t)guid); 1555560e6e96Seschrock return (0); 1556560e6e96Seschrock } 1557560e6e96Seschrock 1558560e6e96Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1559560e6e96Seschrock &state) != 0) { 1560560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1561560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 1562560e6e96Seschrock nvlist_free(label); 1563*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_validate: '%s' missing", 1564*3ee8c80cSPavel Zakharov ZPOOL_CONFIG_POOL_STATE); 1565560e6e96Seschrock return (0); 1566560e6e96Seschrock } 1567560e6e96Seschrock 1568560e6e96Seschrock nvlist_free(label); 1569560e6e96Seschrock 1570bc758434SLin Ling /*
15714b964adaSGeorge Wilson * If this is a verbatim import, no need to check the 1572bc758434SLin Ling * state of the pool. 1573bc758434SLin Ling */ 15744b964adaSGeorge Wilson if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 1575b16da2e2SGeorge Wilson spa_load_state(spa) == SPA_LOAD_OPEN && 1576*3ee8c80cSPavel Zakharov state != POOL_STATE_ACTIVE) { 1577*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_validate: invalid pool state " 1578*3ee8c80cSPavel Zakharov "(%llu) for spa %s", (u_longlong_t)state, 1579*3ee8c80cSPavel Zakharov spa->spa_name); 1580be6fd75aSMatthew Ahrens return (SET_ERROR(EBADF)); 1581*3ee8c80cSPavel Zakharov } 1582560e6e96Seschrock 158351ece835Seschrock /* 158451ece835Seschrock * If we were able to open and validate a vdev that was 158551ece835Seschrock * previously marked permanently unavailable, clear that state 158651ece835Seschrock * now. 158751ece835Seschrock */ 158851ece835Seschrock if (vd->vdev_not_present) 158951ece835Seschrock vd->vdev_not_present = 0; 159051ece835Seschrock } 1591560e6e96Seschrock 1592560e6e96Seschrock return (0); 1593560e6e96Seschrock } 1594560e6e96Seschrock 1595fa9e4066Sahrens /* 1596fa9e4066Sahrens * Close a virtual device. 1597fa9e4066Sahrens */ 1598fa9e4066Sahrens void 1599fa9e4066Sahrens vdev_close(vdev_t *vd) 1600fa9e4066Sahrens { 16018ad4d6ddSJeff Bonwick spa_t *spa = vd->vdev_spa; 1602095bcd66SGeorge Wilson vdev_t *pvd = vd->vdev_parent; 16038ad4d6ddSJeff Bonwick 16048ad4d6ddSJeff Bonwick ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 16058ad4d6ddSJeff Bonwick 16061195e687SMark J Musante /* 16071195e687SMark J Musante * If our parent is reopening, then we are as well, unless we are 16081195e687SMark J Musante * going offline.
16091195e687SMark J Musante */ 1610095bcd66SGeorge Wilson if (pvd != NULL && pvd->vdev_reopening) 16111195e687SMark J Musante vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 1612095bcd66SGeorge Wilson 1613fa9e4066Sahrens vd->vdev_ops->vdev_op_close(vd); 1614fa9e4066Sahrens 16153d7072f8Seschrock vdev_cache_purge(vd); 1616fa9e4066Sahrens 1617560e6e96Seschrock /* 1618573ca77eSGeorge Wilson * We record the previous state before we close it, so that if we are 1619560e6e96Seschrock * doing a reopen(), we don't generate FMA ereports if we notice that 1620560e6e96Seschrock * it's still faulted. 1621560e6e96Seschrock */ 1622560e6e96Seschrock vd->vdev_prevstate = vd->vdev_state; 1623560e6e96Seschrock 1624fa9e4066Sahrens if (vd->vdev_offline) 1625fa9e4066Sahrens vd->vdev_state = VDEV_STATE_OFFLINE; 1626fa9e4066Sahrens else 1627fa9e4066Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 1628ea8dc4b6Seschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1629fa9e4066Sahrens } 1630fa9e4066Sahrens
/*
 * Recursively take a hold on every leaf vdev under 'vd' via its
 * vdev_op_hold method.  Only meaningful for the root pool; a no-op if
 * the pool is still uninitialized.
 */
1631dcba9f3fSGeorge Wilson void 1632dcba9f3fSGeorge Wilson vdev_hold(vdev_t *vd) 1633dcba9f3fSGeorge Wilson { 1634dcba9f3fSGeorge Wilson spa_t *spa = vd->vdev_spa; 1635dcba9f3fSGeorge Wilson 1636dcba9f3fSGeorge Wilson ASSERT(spa_is_root(spa)); 1637dcba9f3fSGeorge Wilson if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1638dcba9f3fSGeorge Wilson return; 1639dcba9f3fSGeorge Wilson 1640dcba9f3fSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 1641dcba9f3fSGeorge Wilson vdev_hold(vd->vdev_child[c]); 1642dcba9f3fSGeorge Wilson 1643dcba9f3fSGeorge Wilson if (vd->vdev_ops->vdev_op_leaf) 1644dcba9f3fSGeorge Wilson vd->vdev_ops->vdev_op_hold(vd); 1645dcba9f3fSGeorge Wilson } 1646dcba9f3fSGeorge Wilson
/*
 * Release the holds taken by vdev_hold() on every leaf vdev under 'vd'.
 * Root pool only.
 */
1647dcba9f3fSGeorge Wilson void 1648dcba9f3fSGeorge Wilson vdev_rele(vdev_t *vd) 1649dcba9f3fSGeorge Wilson { 1650dcba9f3fSGeorge Wilson spa_t *spa = vd->vdev_spa; 1651dcba9f3fSGeorge Wilson 1652dcba9f3fSGeorge Wilson ASSERT(spa_is_root(spa)); 1653dcba9f3fSGeorge Wilson for (int c = 0; c <
vd->vdev_children; c++) 1654dcba9f3fSGeorge Wilson vdev_rele(vd->vdev_child[c]); 1655dcba9f3fSGeorge Wilson 1656dcba9f3fSGeorge Wilson if (vd->vdev_ops->vdev_op_leaf) 1657dcba9f3fSGeorge Wilson vd->vdev_ops->vdev_op_rele(vd); 1658dcba9f3fSGeorge Wilson } 1659dcba9f3fSGeorge Wilson 1660095bcd66SGeorge Wilson /* 1661095bcd66SGeorge Wilson * Reopen all interior vdevs and any unopened leaves. We don't actually 1662095bcd66SGeorge Wilson * reopen leaf vdevs which had previously been opened as they might deadlock 1663095bcd66SGeorge Wilson * on the spa_config_lock. Instead we only obtain the leaf's physical size. 1664095bcd66SGeorge Wilson * If the leaf has never been opened then open it, as usual. 1665095bcd66SGeorge Wilson */ 1666fa9e4066Sahrens void 1667ea8dc4b6Seschrock vdev_reopen(vdev_t *vd) 1668fa9e4066Sahrens { 1669ea8dc4b6Seschrock spa_t *spa = vd->vdev_spa; 1670fa9e4066Sahrens 1671e14bb325SJeff Bonwick ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1672ea8dc4b6Seschrock 16731195e687SMark J Musante /* set the reopening flag unless we're taking the vdev offline */ 16741195e687SMark J Musante vd->vdev_reopening = !vd->vdev_offline; 1675fa9e4066Sahrens vdev_close(vd); 1676fa9e4066Sahrens (void) vdev_open(vd); 1677fa9e4066Sahrens 167839c23413Seschrock /* 167939c23413Seschrock * Call vdev_validate() here to make sure we have the same device. 168039c23413Seschrock * Otherwise, a device with an invalid label could be successfully 168139c23413Seschrock * opened in response to vdev_reopen().
168239c23413Seschrock */ 1683c5904d13Seschrock if (vd->vdev_aux) { 1684c5904d13Seschrock (void) vdev_validate_aux(vd); 1685e14bb325SJeff Bonwick if (vdev_readable(vd) && vdev_writeable(vd) && 16866809eb4eSEric Schrock vd->vdev_aux == &spa->spa_l2cache && 1687573ca77eSGeorge Wilson !l2arc_vdev_present(vd)) 1688573ca77eSGeorge Wilson l2arc_add_vdev(spa, vd); 1689c5904d13Seschrock } else { 1690bda88194SGeorge Wilson (void) vdev_validate(vd, B_TRUE); 1691c5904d13Seschrock } 169239c23413Seschrock 1693fa9e4066Sahrens /* 16943d7072f8Seschrock * Reassess parent vdev's health. 1695fa9e4066Sahrens */ 16963d7072f8Seschrock vdev_propagate_state(vd); 1697fa9e4066Sahrens } 1698fa9e4066Sahrens
/*
 * Open a newly-created (or replacing) vdev, then load its DTLs and
 * initialize its labels.  Unlike a normal open, any component that fails
 * to open fails the whole create.
 */
1699fa9e4066Sahrens int 170099653d4eSeschrock vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1701fa9e4066Sahrens { 1702fa9e4066Sahrens int error; 1703fa9e4066Sahrens 1704fa9e4066Sahrens /* 1705fa9e4066Sahrens * Normally, partial opens (e.g. of a mirror) are allowed. 1706fa9e4066Sahrens * For a create, however, we want to fail the request if 1707fa9e4066Sahrens * there are any components we can't open. 1708fa9e4066Sahrens */ 1709fa9e4066Sahrens error = vdev_open(vd); 1710fa9e4066Sahrens 1711fa9e4066Sahrens if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1712fa9e4066Sahrens vdev_close(vd); 1713fa9e4066Sahrens return (error ? error : ENXIO); 1714fa9e4066Sahrens } 1715fa9e4066Sahrens 1716fa9e4066Sahrens /* 17170713e232SGeorge Wilson * Recursively load DTLs and initialize all labels. 1718fa9e4066Sahrens */ 17190713e232SGeorge Wilson if ((error = vdev_dtl_load(vd)) != 0 || 17200713e232SGeorge Wilson (error = vdev_label_init(vd, txg, isreplacing ?
172139c23413Seschrock VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1722fa9e4066Sahrens vdev_close(vd); 1723fa9e4066Sahrens return (error); 1724fa9e4066Sahrens } 1725fa9e4066Sahrens 1726fa9e4066Sahrens return (0); 1727fa9e4066Sahrens } 1728fa9e4066Sahrens 17290e34b6a7Sbonwick void 1730573ca77eSGeorge Wilson vdev_metaslab_set_size(vdev_t *vd) 1731fa9e4066Sahrens { 1732fa9e4066Sahrens /* 1733bf3e216cSMatthew Ahrens * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. 1734fa9e4066Sahrens */ 1735bf3e216cSMatthew Ahrens vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); 1736fa9e4066Sahrens vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1737fa9e4066Sahrens } 1738fa9e4066Sahrens
/*
 * Mark a top-level vdev's metaslab and/or DTL state dirty for 'txg' by
 * linking 'arg' onto the corresponding per-txg lists.
 */
1739fa9e4066Sahrens void 1740ecc2d604Sbonwick vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1741fa9e4066Sahrens { 1742ecc2d604Sbonwick ASSERT(vd == vd->vdev_top); 17435cabbc6bSPrashanth Sreenivasa /* indirect vdevs don't have metaslabs or dtls */ 17445cabbc6bSPrashanth Sreenivasa ASSERT(vdev_is_concrete(vd) || flags == 0); 1745ecc2d604Sbonwick ASSERT(ISP2(flags)); 1746f9af39baSGeorge Wilson ASSERT(spa_writeable(vd->vdev_spa)); 1747fa9e4066Sahrens 1748ecc2d604Sbonwick if (flags & VDD_METASLAB) 1749ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1750ecc2d604Sbonwick 1751ecc2d604Sbonwick if (flags & VDD_DTL) 1752ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1753ecc2d604Sbonwick 1754ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1755fa9e4066Sahrens } 1756fa9e4066Sahrens 17570713e232SGeorge Wilson void 17580713e232SGeorge Wilson vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) 17590713e232SGeorge Wilson { 17600713e232SGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 17610713e232SGeorge Wilson vdev_dirty_leaves(vd->vdev_child[c], flags, txg); 17620713e232SGeorge Wilson 17630713e232SGeorge Wilson if (vd->vdev_ops->vdev_op_leaf) 17640713e232SGeorge
Wilson vdev_dirty(vd->vdev_top, flags, vd, txg); 17650713e232SGeorge Wilson } 17660713e232SGeorge Wilson 17678ad4d6ddSJeff Bonwick /* 17688ad4d6ddSJeff Bonwick * DTLs. 17698ad4d6ddSJeff Bonwick * 17708ad4d6ddSJeff Bonwick * A vdev's DTL (dirty time log) is the set of transaction groups for which 17719fb35debSEric Taylor * the vdev has less than perfect replication. There are four kinds of DTL: 17728ad4d6ddSJeff Bonwick * 17738ad4d6ddSJeff Bonwick * DTL_MISSING: txgs for which the vdev has no valid copies of the data 17748ad4d6ddSJeff Bonwick * 17758ad4d6ddSJeff Bonwick * DTL_PARTIAL: txgs for which data is available, but not fully replicated 17768ad4d6ddSJeff Bonwick * 17778ad4d6ddSJeff Bonwick * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon 17788ad4d6ddSJeff Bonwick * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of 17798ad4d6ddSJeff Bonwick * txgs that was scrubbed. 17808ad4d6ddSJeff Bonwick * 17818ad4d6ddSJeff Bonwick * DTL_OUTAGE: txgs which cannot currently be read, whether due to 17828ad4d6ddSJeff Bonwick * persistent errors or just some device being offline. 17838ad4d6ddSJeff Bonwick * Unlike the other three, the DTL_OUTAGE map is not generally 17848ad4d6ddSJeff Bonwick * maintained; it's only computed when needed, typically to 17858ad4d6ddSJeff Bonwick * determine whether a device can be detached. 17868ad4d6ddSJeff Bonwick * 17878ad4d6ddSJeff Bonwick * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device 17888ad4d6ddSJeff Bonwick * either has the data or it doesn't. 17898ad4d6ddSJeff Bonwick * 17908ad4d6ddSJeff Bonwick * For interior vdevs such as mirror and RAID-Z the picture is more complex. 17918ad4d6ddSJeff Bonwick * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because 17928ad4d6ddSJeff Bonwick * if any child is less than fully replicated, then so is its parent.
17938ad4d6ddSJeff Bonwick * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, 17948ad4d6ddSJeff Bonwick * comprising only those txgs which appear in 'maxfaults' or more children; 17958ad4d6ddSJeff Bonwick * those are the txgs we don't have enough replication to read. For example, 17968ad4d6ddSJeff Bonwick * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); 17978ad4d6ddSJeff Bonwick * thus, its DTL_MISSING consists of the set of txgs that appear in more than 17988ad4d6ddSJeff Bonwick * two child DTL_MISSING maps. 17998ad4d6ddSJeff Bonwick * 18008ad4d6ddSJeff Bonwick * It should be clear from the above that to compute the DTLs and outage maps 18018ad4d6ddSJeff Bonwick * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. 18028ad4d6ddSJeff Bonwick * Therefore, that is all we keep on disk. When loading the pool, or after 18038ad4d6ddSJeff Bonwick * a configuration change, we generate all other DTLs from first principles.
18048ad4d6ddSJeff Bonwick */
/*
 * Add the txg range [txg, txg + size) to DTL 't' of a leaf vdev, under
 * vdev_dtl_lock.
 */
1805fa9e4066Sahrens void 18068ad4d6ddSJeff Bonwick vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1807fa9e4066Sahrens { 18080713e232SGeorge Wilson range_tree_t *rt = vd->vdev_dtl[t]; 18098ad4d6ddSJeff Bonwick 18108ad4d6ddSJeff Bonwick ASSERT(t < DTL_TYPES); 18118ad4d6ddSJeff Bonwick ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1812f9af39baSGeorge Wilson ASSERT(spa_writeable(vd->vdev_spa)); 18138ad4d6ddSJeff Bonwick 18145cabbc6bSPrashanth Sreenivasa mutex_enter(&vd->vdev_dtl_lock); 18150713e232SGeorge Wilson if (!range_tree_contains(rt, txg, size)) 18160713e232SGeorge Wilson range_tree_add(rt, txg, size); 18175cabbc6bSPrashanth Sreenivasa mutex_exit(&vd->vdev_dtl_lock); 1818fa9e4066Sahrens } 1819fa9e4066Sahrens
/*
 * Returns B_TRUE if any txg in [txg, txg + size) appears in DTL 't'
 * (always B_FALSE while the pool is still loading -- see below).
 */
18208ad4d6ddSJeff Bonwick boolean_t 18218ad4d6ddSJeff Bonwick vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1822fa9e4066Sahrens { 18230713e232SGeorge Wilson range_tree_t *rt = vd->vdev_dtl[t]; 18248ad4d6ddSJeff Bonwick boolean_t dirty = B_FALSE; 1825fa9e4066Sahrens 18268ad4d6ddSJeff Bonwick ASSERT(t < DTL_TYPES); 18278ad4d6ddSJeff Bonwick ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1828fa9e4066Sahrens 18295cabbc6bSPrashanth Sreenivasa /* 18305cabbc6bSPrashanth Sreenivasa * While we are loading the pool, the DTLs have not been loaded yet. 18315cabbc6bSPrashanth Sreenivasa * Ignore the DTLs and try all devices. This avoids a recursive 18325cabbc6bSPrashanth Sreenivasa * mutex enter on the vdev_dtl_lock, and also makes us try hard 18335cabbc6bSPrashanth Sreenivasa * when loading the pool (relying on the checksum to ensure that 18345cabbc6bSPrashanth Sreenivasa * we get the right data -- note that we while loading, we are 18355cabbc6bSPrashanth Sreenivasa * only reading the MOS, which is always checksummed).
18365cabbc6bSPrashanth Sreenivasa */ 18375cabbc6bSPrashanth Sreenivasa if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) 18385cabbc6bSPrashanth Sreenivasa return (B_FALSE); 18395cabbc6bSPrashanth Sreenivasa 18405cabbc6bSPrashanth Sreenivasa mutex_enter(&vd->vdev_dtl_lock); 18410713e232SGeorge Wilson if (range_tree_space(rt) != 0) 18420713e232SGeorge Wilson dirty = range_tree_contains(rt, txg, size); 18435cabbc6bSPrashanth Sreenivasa mutex_exit(&vd->vdev_dtl_lock); 1844fa9e4066Sahrens 1845fa9e4066Sahrens return (dirty); 1846fa9e4066Sahrens } 1847fa9e4066Sahrens
/*
 * Returns B_TRUE if DTL 't' has no entries.
 */
18488ad4d6ddSJeff Bonwick boolean_t 18498ad4d6ddSJeff Bonwick vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) 18508ad4d6ddSJeff Bonwick { 18510713e232SGeorge Wilson range_tree_t *rt = vd->vdev_dtl[t]; 18528ad4d6ddSJeff Bonwick boolean_t empty; 18538ad4d6ddSJeff Bonwick 18545cabbc6bSPrashanth Sreenivasa mutex_enter(&vd->vdev_dtl_lock); 18550713e232SGeorge Wilson empty = (range_tree_space(rt) == 0); 18565cabbc6bSPrashanth Sreenivasa mutex_exit(&vd->vdev_dtl_lock); 18578ad4d6ddSJeff Bonwick 18588ad4d6ddSJeff Bonwick return (empty); 18598ad4d6ddSJeff Bonwick } 18608ad4d6ddSJeff Bonwick 1861b4952e17SGeorge Wilson /* 1862b4952e17SGeorge Wilson * Returns the lowest txg in the DTL range.
1863b4952e17SGeorge Wilson */ 1864b4952e17SGeorge Wilson static uint64_t 1865b4952e17SGeorge Wilson vdev_dtl_min(vdev_t *vd) 1866b4952e17SGeorge Wilson { 18670713e232SGeorge Wilson range_seg_t *rs; 1868b4952e17SGeorge Wilson 1869b4952e17SGeorge Wilson ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 18700713e232SGeorge Wilson ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1871b4952e17SGeorge Wilson ASSERT0(vd->vdev_children); 1872b4952e17SGeorge Wilson 18730713e232SGeorge Wilson rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); 18740713e232SGeorge Wilson return (rs->rs_start - 1); 1875b4952e17SGeorge Wilson } 1876b4952e17SGeorge Wilson 1877b4952e17SGeorge Wilson /* 1878b4952e17SGeorge Wilson * Returns the highest txg in the DTL. 1879b4952e17SGeorge Wilson */ 1880b4952e17SGeorge Wilson static uint64_t 1881b4952e17SGeorge Wilson vdev_dtl_max(vdev_t *vd) 1882b4952e17SGeorge Wilson { 18830713e232SGeorge Wilson range_seg_t *rs; 1884b4952e17SGeorge Wilson 1885b4952e17SGeorge Wilson ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 18860713e232SGeorge Wilson ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1887b4952e17SGeorge Wilson ASSERT0(vd->vdev_children); 1888b4952e17SGeorge Wilson 18890713e232SGeorge Wilson rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); 18900713e232SGeorge Wilson return (rs->rs_end); 1891b4952e17SGeorge Wilson } 1892b4952e17SGeorge Wilson 1893b4952e17SGeorge Wilson /* 1894b4952e17SGeorge Wilson * Determine if a resilvering vdev should remove any DTL entries from 1895b4952e17SGeorge Wilson * its range. If the vdev was resilvering for the entire duration of the 1896b4952e17SGeorge Wilson * scan then it should excise that range from its DTLs. Otherwise, this 1897b4952e17SGeorge Wilson * vdev is considered partially resilvered and should leave its DTL 1898b4952e17SGeorge Wilson * entries intact. The comment in vdev_dtl_reassess() describes how we 1899b4952e17SGeorge Wilson * excise the DTLs.
1900b4952e17SGeorge Wilson */ 1901b4952e17SGeorge Wilson static boolean_t 1902b4952e17SGeorge Wilson vdev_dtl_should_excise(vdev_t *vd) 1903b4952e17SGeorge Wilson { 1904b4952e17SGeorge Wilson spa_t *spa = vd->vdev_spa; 1905b4952e17SGeorge Wilson dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 1906b4952e17SGeorge Wilson 1907b4952e17SGeorge Wilson ASSERT0(scn->scn_phys.scn_errors); 1908b4952e17SGeorge Wilson ASSERT0(vd->vdev_children); 1909b4952e17SGeorge Wilson 19102d2f193aSMatthew Ahrens if (vd->vdev_state < VDEV_STATE_DEGRADED) 19112d2f193aSMatthew Ahrens return (B_FALSE); 19122d2f193aSMatthew Ahrens 1913b4952e17SGeorge Wilson if (vd->vdev_resilver_txg == 0 || 19140713e232SGeorge Wilson range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) 1915b4952e17SGeorge Wilson return (B_TRUE); 1916b4952e17SGeorge Wilson 1917b4952e17SGeorge Wilson /* 1918b4952e17SGeorge Wilson * When a resilver is initiated the scan will assign the scn_max_txg 1919b4952e17SGeorge Wilson * value to the highest txg value that exists in all DTLs. If this 1920b4952e17SGeorge Wilson * device's max DTL is not part of this scan (i.e. it is not in 1921b4952e17SGeorge Wilson * the range (scn_min_txg, scn_max_txg] then it is not eligible 1922b4952e17SGeorge Wilson * for excision. 1923b4952e17SGeorge Wilson */ 1924b4952e17SGeorge Wilson if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { 1925b4952e17SGeorge Wilson ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); 1926b4952e17SGeorge Wilson ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); 1927b4952e17SGeorge Wilson ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); 1928b4952e17SGeorge Wilson return (B_TRUE); 1929b4952e17SGeorge Wilson } 1930b4952e17SGeorge Wilson return (B_FALSE); 1931b4952e17SGeorge Wilson } 1932b4952e17SGeorge Wilson 1933fa9e4066Sahrens /* 1934fa9e4066Sahrens * Reassess DTLs after a config change or scrub completion.
1935fa9e4066Sahrens */ 1936fa9e4066Sahrens void 1937fa9e4066Sahrens vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 1938fa9e4066Sahrens { 1939ea8dc4b6Seschrock spa_t *spa = vd->vdev_spa; 19408ad4d6ddSJeff Bonwick avl_tree_t reftree; 19418ad4d6ddSJeff Bonwick int minref; 1942fa9e4066Sahrens 19438ad4d6ddSJeff Bonwick ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1944fa9e4066Sahrens 19458ad4d6ddSJeff Bonwick for (int c = 0; c < vd->vdev_children; c++) 19468ad4d6ddSJeff Bonwick vdev_dtl_reassess(vd->vdev_child[c], txg, 19478ad4d6ddSJeff Bonwick scrub_txg, scrub_done); 19488ad4d6ddSJeff Bonwick 19495cabbc6bSPrashanth Sreenivasa if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) 19508ad4d6ddSJeff Bonwick return; 19518ad4d6ddSJeff Bonwick 19528ad4d6ddSJeff Bonwick if (vd->vdev_ops->vdev_op_leaf) { 19533f9d6ad7SLin Ling dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 19543f9d6ad7SLin Ling 1955fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 1956b4952e17SGeorge Wilson 1957b4952e17SGeorge Wilson /* 1958b4952e17SGeorge Wilson * If we've completed a scan cleanly then determine 1959b4952e17SGeorge Wilson * if this vdev should remove any DTLs. We only want to 1960b4952e17SGeorge Wilson * excise regions on vdevs that were available during 1961b4952e17SGeorge Wilson * the entire duration of this scan. 1962b4952e17SGeorge Wilson */ 1963088f3894Sahrens if (scrub_txg != 0 && 19643f9d6ad7SLin Ling (spa->spa_scrub_started || 1965b4952e17SGeorge Wilson (scn != NULL && scn->scn_phys.scn_errors == 0)) && 1966b4952e17SGeorge Wilson vdev_dtl_should_excise(vd)) { 1967088f3894Sahrens /* 1968088f3894Sahrens * We completed a scrub up to scrub_txg. If we 1969088f3894Sahrens * did it without rebooting, then the scrub dtl 1970088f3894Sahrens * will be valid, so excise the old region and 1971088f3894Sahrens * fold in the scrub dtl. Otherwise, leave the 1972088f3894Sahrens * dtl as-is if there was an error.
19738ad4d6ddSJeff Bonwick * 19748ad4d6ddSJeff Bonwick * There's little trick here: to excise the beginning 19758ad4d6ddSJeff Bonwick * of the DTL_MISSING map, we put it into a reference 19768ad4d6ddSJeff Bonwick * tree and then add a segment with refcnt -1 that 19778ad4d6ddSJeff Bonwick * covers the range [0, scrub_txg). This means 19788ad4d6ddSJeff Bonwick * that each txg in that range has refcnt -1 or 0. 19798ad4d6ddSJeff Bonwick * We then add DTL_SCRUB with a refcnt of 2, so that 19808ad4d6ddSJeff Bonwick * entries in the range [0, scrub_txg) will have a 19818ad4d6ddSJeff Bonwick * positive refcnt -- either 1 or 2. We then convert 19828ad4d6ddSJeff Bonwick * the reference tree into the new DTL_MISSING map. 1983088f3894Sahrens */ 19840713e232SGeorge Wilson space_reftree_create(&reftree); 19850713e232SGeorge Wilson space_reftree_add_map(&reftree, 19860713e232SGeorge Wilson vd->vdev_dtl[DTL_MISSING], 1); 19870713e232SGeorge Wilson space_reftree_add_seg(&reftree, 0, scrub_txg, -1); 19880713e232SGeorge Wilson space_reftree_add_map(&reftree, 19890713e232SGeorge Wilson vd->vdev_dtl[DTL_SCRUB], 2); 19900713e232SGeorge Wilson space_reftree_generate_map(&reftree, 19910713e232SGeorge Wilson vd->vdev_dtl[DTL_MISSING], 1); 19920713e232SGeorge Wilson space_reftree_destroy(&reftree); 1993fa9e4066Sahrens } 19940713e232SGeorge Wilson range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 19950713e232SGeorge Wilson range_tree_walk(vd->vdev_dtl[DTL_MISSING], 19960713e232SGeorge Wilson range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); 1997fa9e4066Sahrens if (scrub_done) 19980713e232SGeorge Wilson range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 19990713e232SGeorge Wilson range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 20008ad4d6ddSJeff Bonwick if (!vdev_readable(vd)) 20010713e232SGeorge Wilson range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 20028ad4d6ddSJeff Bonwick else 20030713e232SGeorge Wilson range_tree_walk(vd->vdev_dtl[DTL_MISSING], 20040713e232SGeorge
Wilson range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); 2005b4952e17SGeorge Wilson 2006b4952e17SGeorge Wilson /* 2007b4952e17SGeorge Wilson * If the vdev was resilvering and no longer has any 2008b4952e17SGeorge Wilson * DTLs then reset its resilvering flag. 2009b4952e17SGeorge Wilson */ 2010b4952e17SGeorge Wilson if (vd->vdev_resilver_txg != 0 && 20110713e232SGeorge Wilson range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && 20120713e232SGeorge Wilson range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) 2013b4952e17SGeorge Wilson vd->vdev_resilver_txg = 0; 2014b4952e17SGeorge Wilson 2015fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 2016088f3894Sahrens 2017ecc2d604Sbonwick if (txg != 0) 2018ecc2d604Sbonwick vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 2019fa9e4066Sahrens return; 2020fa9e4066Sahrens } 2021fa9e4066Sahrens 2022fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 20238ad4d6ddSJeff Bonwick for (int t = 0; t < DTL_TYPES; t++) { 202499bb17e2SEric Taylor /* account for child's outage in parent's missing map */ 202599bb17e2SEric Taylor int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; 20268ad4d6ddSJeff Bonwick if (t == DTL_SCRUB) 20278ad4d6ddSJeff Bonwick continue; /* leaf vdevs only */ 20288ad4d6ddSJeff Bonwick if (t == DTL_PARTIAL) 20298ad4d6ddSJeff Bonwick minref = 1; /* i.e.
non-zero */ 20308ad4d6ddSJeff Bonwick else if (vd->vdev_nparity != 0) 20318ad4d6ddSJeff Bonwick minref = vd->vdev_nparity + 1; /* RAID-Z */ 20328ad4d6ddSJeff Bonwick else 20338ad4d6ddSJeff Bonwick minref = vd->vdev_children; /* any kind of mirror */ 20340713e232SGeorge Wilson space_reftree_create(&reftree); 20358ad4d6ddSJeff Bonwick for (int c = 0; c < vd->vdev_children; c++) { 20368ad4d6ddSJeff Bonwick vdev_t *cvd = vd->vdev_child[c]; 20378ad4d6ddSJeff Bonwick mutex_enter(&cvd->vdev_dtl_lock); 20380713e232SGeorge Wilson space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); 20398ad4d6ddSJeff Bonwick mutex_exit(&cvd->vdev_dtl_lock); 20408ad4d6ddSJeff Bonwick } 20410713e232SGeorge Wilson space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); 20420713e232SGeorge Wilson space_reftree_destroy(&reftree); 2043fa9e4066Sahrens } 20448ad4d6ddSJeff Bonwick mutex_exit(&vd->vdev_dtl_lock); 2045fa9e4066Sahrens } 2046fa9e4066Sahrens
/*
 * Load a leaf vdev's DTL_MISSING from its on-disk space map; for
 * interior vdevs, recurse over the children instead.
 */
20470713e232SGeorge Wilson int 2048fa9e4066Sahrens vdev_dtl_load(vdev_t *vd) 2049fa9e4066Sahrens { 2050fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 2051ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 20520713e232SGeorge Wilson int error = 0; 2053fa9e4066Sahrens 20540713e232SGeorge Wilson if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { 20555cabbc6bSPrashanth Sreenivasa ASSERT(vdev_is_concrete(vd)); 2056fa9e4066Sahrens 20570713e232SGeorge Wilson error = space_map_open(&vd->vdev_dtl_sm, mos, 20585cabbc6bSPrashanth Sreenivasa vd->vdev_dtl_object, 0, -1ULL, 0); 20590713e232SGeorge Wilson if (error) 20600713e232SGeorge Wilson return (error); 20610713e232SGeorge Wilson ASSERT(vd->vdev_dtl_sm != NULL); 2062fa9e4066Sahrens 20630713e232SGeorge Wilson mutex_enter(&vd->vdev_dtl_lock); 206488ecc943SGeorge Wilson 20650713e232SGeorge Wilson /* 20660713e232SGeorge Wilson * Now that we've opened the space_map we need to update 20670713e232SGeorge Wilson * the in-core DTL.
20680713e232SGeorge Wilson */ 20690713e232SGeorge Wilson space_map_update(vd->vdev_dtl_sm); 2070ecc2d604Sbonwick 20710713e232SGeorge Wilson error = space_map_load(vd->vdev_dtl_sm, 20720713e232SGeorge Wilson vd->vdev_dtl[DTL_MISSING], SM_ALLOC); 20730713e232SGeorge Wilson mutex_exit(&vd->vdev_dtl_lock); 2074fa9e4066Sahrens 20750713e232SGeorge Wilson return (error); 20760713e232SGeorge Wilson } 20770713e232SGeorge Wilson 20780713e232SGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) { 20790713e232SGeorge Wilson error = vdev_dtl_load(vd->vdev_child[c]); 20800713e232SGeorge Wilson if (error != 0) 20810713e232SGeorge Wilson break; 20820713e232SGeorge Wilson } 2083fa9e4066Sahrens 2084fa9e4066Sahrens return (error); 2085fa9e4066Sahrens } 2086fa9e4066Sahrens 2087215198a6SJoe Stein void 2088215198a6SJoe Stein vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) 2089215198a6SJoe Stein { 2090215198a6SJoe Stein spa_t *spa = vd->vdev_spa; 2091215198a6SJoe Stein 2092215198a6SJoe Stein VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); 2093215198a6SJoe Stein VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2094215198a6SJoe Stein zapobj, tx)); 2095215198a6SJoe Stein } 2096215198a6SJoe Stein 2097215198a6SJoe Stein uint64_t 2098215198a6SJoe Stein vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) 2099215198a6SJoe Stein { 2100215198a6SJoe Stein spa_t *spa = vd->vdev_spa; 2101215198a6SJoe Stein uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, 2102215198a6SJoe Stein DMU_OT_NONE, 0, tx); 2103215198a6SJoe Stein 2104215198a6SJoe Stein ASSERT(zap != 0); 2105215198a6SJoe Stein VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2106215198a6SJoe Stein zap, tx)); 2107215198a6SJoe Stein 2108215198a6SJoe Stein return (zap); 2109215198a6SJoe Stein } 2110215198a6SJoe Stein 2111215198a6SJoe Stein void 2112215198a6SJoe Stein vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) 2113215198a6SJoe Stein { 2114215198a6SJoe Stein if 
(vd->vdev_ops != &vdev_hole_ops && 2115215198a6SJoe Stein vd->vdev_ops != &vdev_missing_ops && 2116215198a6SJoe Stein vd->vdev_ops != &vdev_root_ops && 2117215198a6SJoe Stein !vd->vdev_top->vdev_removing) { 2118215198a6SJoe Stein if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { 2119215198a6SJoe Stein vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); 2120215198a6SJoe Stein } 2121215198a6SJoe Stein if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { 2122215198a6SJoe Stein vd->vdev_top_zap = vdev_create_link_zap(vd, tx); 2123215198a6SJoe Stein } 2124215198a6SJoe Stein } 2125215198a6SJoe Stein for (uint64_t i = 0; i < vd->vdev_children; i++) { 2126215198a6SJoe Stein vdev_construct_zaps(vd->vdev_child[i], tx); 2127215198a6SJoe Stein } 2128215198a6SJoe Stein } 2129215198a6SJoe Stein 2130fa9e4066Sahrens void 2131fa9e4066Sahrens vdev_dtl_sync(vdev_t *vd, uint64_t txg) 2132fa9e4066Sahrens { 2133fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 21340713e232SGeorge Wilson range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; 2135ecc2d604Sbonwick objset_t *mos = spa->spa_meta_objset; 21360713e232SGeorge Wilson range_tree_t *rtsync; 2137fa9e4066Sahrens dmu_tx_t *tx; 21380713e232SGeorge Wilson uint64_t object = space_map_object(vd->vdev_dtl_sm); 2139fa9e4066Sahrens 21405cabbc6bSPrashanth Sreenivasa ASSERT(vdev_is_concrete(vd)); 21410713e232SGeorge Wilson ASSERT(vd->vdev_ops->vdev_op_leaf); 214288ecc943SGeorge Wilson 2143fa9e4066Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2144fa9e4066Sahrens 21450713e232SGeorge Wilson if (vd->vdev_detached || vd->vdev_top->vdev_removing) { 21460713e232SGeorge Wilson mutex_enter(&vd->vdev_dtl_lock); 21470713e232SGeorge Wilson space_map_free(vd->vdev_dtl_sm, tx); 21480713e232SGeorge Wilson space_map_close(vd->vdev_dtl_sm); 21490713e232SGeorge Wilson vd->vdev_dtl_sm = NULL; 21500713e232SGeorge Wilson mutex_exit(&vd->vdev_dtl_lock); 2151215198a6SJoe Stein 2152215198a6SJoe Stein /* 2153215198a6SJoe Stein * We only destroy the leaf ZAP for 
detached leaves or for 2154215198a6SJoe Stein * removed log devices. Removed data devices handle leaf ZAP 2155215198a6SJoe Stein * cleanup later, once cancellation is no longer possible. 2156215198a6SJoe Stein */ 2157215198a6SJoe Stein if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || 2158215198a6SJoe Stein vd->vdev_top->vdev_islog)) { 2159215198a6SJoe Stein vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); 2160215198a6SJoe Stein vd->vdev_leaf_zap = 0; 2161215198a6SJoe Stein } 2162215198a6SJoe Stein 2163fa9e4066Sahrens dmu_tx_commit(tx); 2164fa9e4066Sahrens return; 2165fa9e4066Sahrens } 2166fa9e4066Sahrens 21670713e232SGeorge Wilson if (vd->vdev_dtl_sm == NULL) { 21680713e232SGeorge Wilson uint64_t new_object; 21690713e232SGeorge Wilson 21700713e232SGeorge Wilson new_object = space_map_alloc(mos, tx); 21710713e232SGeorge Wilson VERIFY3U(new_object, !=, 0); 21720713e232SGeorge Wilson 21730713e232SGeorge Wilson VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 21745cabbc6bSPrashanth Sreenivasa 0, -1ULL, 0)); 21750713e232SGeorge Wilson ASSERT(vd->vdev_dtl_sm != NULL); 2176fa9e4066Sahrens } 2177fa9e4066Sahrens 21785cabbc6bSPrashanth Sreenivasa rtsync = range_tree_create(NULL, NULL); 2179fa9e4066Sahrens 2180fa9e4066Sahrens mutex_enter(&vd->vdev_dtl_lock); 21810713e232SGeorge Wilson range_tree_walk(rt, range_tree_add, rtsync); 2182fa9e4066Sahrens mutex_exit(&vd->vdev_dtl_lock); 2183fa9e4066Sahrens 21840713e232SGeorge Wilson space_map_truncate(vd->vdev_dtl_sm, tx); 21850713e232SGeorge Wilson space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); 21860713e232SGeorge Wilson range_tree_vacate(rtsync, NULL, NULL); 2187fa9e4066Sahrens 21880713e232SGeorge Wilson range_tree_destroy(rtsync); 2189fa9e4066Sahrens 21900713e232SGeorge Wilson /* 21910713e232SGeorge Wilson * If the object for the space map has changed then dirty 21920713e232SGeorge Wilson * the top level so that we update the config. 
21930713e232SGeorge Wilson */ 21940713e232SGeorge Wilson if (object != space_map_object(vd->vdev_dtl_sm)) { 2195*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " 2196*3ee8c80cSPavel Zakharov "new object %llu", (u_longlong_t)txg, spa_name(spa), 2197*3ee8c80cSPavel Zakharov (u_longlong_t)object, 2198*3ee8c80cSPavel Zakharov (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); 21990713e232SGeorge Wilson vdev_config_dirty(vd->vdev_top); 22000713e232SGeorge Wilson } 2201fa9e4066Sahrens 2202fa9e4066Sahrens dmu_tx_commit(tx); 22030713e232SGeorge Wilson 22040713e232SGeorge Wilson mutex_enter(&vd->vdev_dtl_lock); 22050713e232SGeorge Wilson space_map_update(vd->vdev_dtl_sm); 22060713e232SGeorge Wilson mutex_exit(&vd->vdev_dtl_lock); 2207fa9e4066Sahrens } 2208fa9e4066Sahrens 22098ad4d6ddSJeff Bonwick /* 22108ad4d6ddSJeff Bonwick * Determine whether the specified vdev can be offlined/detached/removed 22118ad4d6ddSJeff Bonwick * without losing data. 22128ad4d6ddSJeff Bonwick */ 22138ad4d6ddSJeff Bonwick boolean_t 22148ad4d6ddSJeff Bonwick vdev_dtl_required(vdev_t *vd) 22158ad4d6ddSJeff Bonwick { 22168ad4d6ddSJeff Bonwick spa_t *spa = vd->vdev_spa; 22178ad4d6ddSJeff Bonwick vdev_t *tvd = vd->vdev_top; 22188ad4d6ddSJeff Bonwick uint8_t cant_read = vd->vdev_cant_read; 22198ad4d6ddSJeff Bonwick boolean_t required; 22208ad4d6ddSJeff Bonwick 22218ad4d6ddSJeff Bonwick ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 22228ad4d6ddSJeff Bonwick 22238ad4d6ddSJeff Bonwick if (vd == spa->spa_root_vdev || vd == tvd) 22248ad4d6ddSJeff Bonwick return (B_TRUE); 22258ad4d6ddSJeff Bonwick 22268ad4d6ddSJeff Bonwick /* 22278ad4d6ddSJeff Bonwick * Temporarily mark the device as unreadable, and then determine 22288ad4d6ddSJeff Bonwick * whether this results in any DTL outages in the top-level vdev. 22298ad4d6ddSJeff Bonwick * If not, we can safely offline/detach/remove the device. 
22308ad4d6ddSJeff Bonwick */ 22318ad4d6ddSJeff Bonwick vd->vdev_cant_read = B_TRUE; 22328ad4d6ddSJeff Bonwick vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 22338ad4d6ddSJeff Bonwick required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 22348ad4d6ddSJeff Bonwick vd->vdev_cant_read = cant_read; 22358ad4d6ddSJeff Bonwick vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 22368ad4d6ddSJeff Bonwick 2237cb04b873SMark J Musante if (!required && zio_injection_enabled) 2238cb04b873SMark J Musante required = !!zio_handle_device_injection(vd, NULL, ECHILD); 2239cb04b873SMark J Musante 22408ad4d6ddSJeff Bonwick return (required); 22418ad4d6ddSJeff Bonwick } 22428ad4d6ddSJeff Bonwick 2243088f3894Sahrens /* 2244088f3894Sahrens * Determine if resilver is needed, and if so the txg range. 2245088f3894Sahrens */ 2246088f3894Sahrens boolean_t 2247088f3894Sahrens vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 2248088f3894Sahrens { 2249088f3894Sahrens boolean_t needed = B_FALSE; 2250088f3894Sahrens uint64_t thismin = UINT64_MAX; 2251088f3894Sahrens uint64_t thismax = 0; 2252088f3894Sahrens 2253088f3894Sahrens if (vd->vdev_children == 0) { 2254088f3894Sahrens mutex_enter(&vd->vdev_dtl_lock); 22550713e232SGeorge Wilson if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && 22568ad4d6ddSJeff Bonwick vdev_writeable(vd)) { 2257088f3894Sahrens 2258b4952e17SGeorge Wilson thismin = vdev_dtl_min(vd); 2259b4952e17SGeorge Wilson thismax = vdev_dtl_max(vd); 2260088f3894Sahrens needed = B_TRUE; 2261088f3894Sahrens } 2262088f3894Sahrens mutex_exit(&vd->vdev_dtl_lock); 2263088f3894Sahrens } else { 22648ad4d6ddSJeff Bonwick for (int c = 0; c < vd->vdev_children; c++) { 2265088f3894Sahrens vdev_t *cvd = vd->vdev_child[c]; 2266088f3894Sahrens uint64_t cmin, cmax; 2267088f3894Sahrens 2268088f3894Sahrens if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 2269088f3894Sahrens thismin = MIN(thismin, cmin); 2270088f3894Sahrens thismax = MAX(thismax, cmax); 2271088f3894Sahrens needed = B_TRUE; 2272088f3894Sahrens } 
2273088f3894Sahrens } 2274088f3894Sahrens } 2275088f3894Sahrens 2276088f3894Sahrens if (needed && minp) { 2277088f3894Sahrens *minp = thismin; 2278088f3894Sahrens *maxp = thismax; 2279088f3894Sahrens } 2280088f3894Sahrens return (needed); 2281088f3894Sahrens } 2282088f3894Sahrens 22835cabbc6bSPrashanth Sreenivasa int 2284ea8dc4b6Seschrock vdev_load(vdev_t *vd) 2285fa9e4066Sahrens { 22865cabbc6bSPrashanth Sreenivasa int error = 0; 2287fa9e4066Sahrens /* 2288fa9e4066Sahrens * Recursively load all children. 2289fa9e4066Sahrens */ 22905cabbc6bSPrashanth Sreenivasa for (int c = 0; c < vd->vdev_children; c++) { 22915cabbc6bSPrashanth Sreenivasa error = vdev_load(vd->vdev_child[c]); 22925cabbc6bSPrashanth Sreenivasa if (error != 0) { 22935cabbc6bSPrashanth Sreenivasa return (error); 22945cabbc6bSPrashanth Sreenivasa } 22955cabbc6bSPrashanth Sreenivasa } 22965cabbc6bSPrashanth Sreenivasa 22975cabbc6bSPrashanth Sreenivasa vdev_set_deflate_ratio(vd); 2298fa9e4066Sahrens 2299fa9e4066Sahrens /* 23000e34b6a7Sbonwick * If this is a top-level vdev, initialize its metaslabs. 2301fa9e4066Sahrens */ 23025cabbc6bSPrashanth Sreenivasa if (vd == vd->vdev_top && vdev_is_concrete(vd)) { 23035cabbc6bSPrashanth Sreenivasa if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { 23045cabbc6bSPrashanth Sreenivasa vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 23055cabbc6bSPrashanth Sreenivasa VDEV_AUX_CORRUPT_DATA); 2306*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_load: invalid size. 
ashift=%llu, " 2307*3ee8c80cSPavel Zakharov "asize=%llu", (u_longlong_t)vd->vdev_ashift, 2308*3ee8c80cSPavel Zakharov (u_longlong_t)vd->vdev_asize); 23095cabbc6bSPrashanth Sreenivasa return (SET_ERROR(ENXIO)); 23105cabbc6bSPrashanth Sreenivasa } else if ((error = vdev_metaslab_init(vd, 0)) != 0) { 2311*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " 2312*3ee8c80cSPavel Zakharov "[error=%d]", error); 23135cabbc6bSPrashanth Sreenivasa vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 23145cabbc6bSPrashanth Sreenivasa VDEV_AUX_CORRUPT_DATA); 23155cabbc6bSPrashanth Sreenivasa return (error); 23165cabbc6bSPrashanth Sreenivasa } 23175cabbc6bSPrashanth Sreenivasa } 2318fa9e4066Sahrens 2319fa9e4066Sahrens /* 2320fa9e4066Sahrens * If this is a leaf vdev, load its DTL. 2321fa9e4066Sahrens */ 23225cabbc6bSPrashanth Sreenivasa if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { 2323560e6e96Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2324560e6e96Seschrock VDEV_AUX_CORRUPT_DATA); 2325*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " 2326*3ee8c80cSPavel Zakharov "[error=%d]", error); 23275cabbc6bSPrashanth Sreenivasa return (error); 23285cabbc6bSPrashanth Sreenivasa } 23295cabbc6bSPrashanth Sreenivasa 23305cabbc6bSPrashanth Sreenivasa uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); 23315cabbc6bSPrashanth Sreenivasa if (obsolete_sm_object != 0) { 23325cabbc6bSPrashanth Sreenivasa objset_t *mos = vd->vdev_spa->spa_meta_objset; 23335cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_asize != 0); 23345cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_obsolete_sm == NULL); 23355cabbc6bSPrashanth Sreenivasa 23365cabbc6bSPrashanth Sreenivasa if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, 23375cabbc6bSPrashanth Sreenivasa obsolete_sm_object, 0, vd->vdev_asize, 0))) { 23385cabbc6bSPrashanth Sreenivasa vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 23395cabbc6bSPrashanth Sreenivasa 
VDEV_AUX_CORRUPT_DATA); 2340*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " 2341*3ee8c80cSPavel Zakharov "obsolete spacemap (obj %llu) [error=%d]", 2342*3ee8c80cSPavel Zakharov (u_longlong_t)obsolete_sm_object, error); 23435cabbc6bSPrashanth Sreenivasa return (error); 23445cabbc6bSPrashanth Sreenivasa } 23455cabbc6bSPrashanth Sreenivasa space_map_update(vd->vdev_obsolete_sm); 23465cabbc6bSPrashanth Sreenivasa } 23475cabbc6bSPrashanth Sreenivasa 23485cabbc6bSPrashanth Sreenivasa return (0); 2349fa9e4066Sahrens } 2350fa9e4066Sahrens 235199653d4eSeschrock /* 2352fa94a07fSbrendan * The special vdev case is used for hot spares and l2cache devices. Its 2353fa94a07fSbrendan * sole purpose it to set the vdev state for the associated vdev. To do this, 2354fa94a07fSbrendan * we make sure that we can open the underlying device, then try to read the 2355fa94a07fSbrendan * label, and make sure that the label is sane and that it hasn't been 2356fa94a07fSbrendan * repurposed to another pool. 
235799653d4eSeschrock */ 235899653d4eSeschrock int 2359fa94a07fSbrendan vdev_validate_aux(vdev_t *vd) 236099653d4eSeschrock { 236199653d4eSeschrock nvlist_t *label; 236299653d4eSeschrock uint64_t guid, version; 236399653d4eSeschrock uint64_t state; 236499653d4eSeschrock 2365e14bb325SJeff Bonwick if (!vdev_readable(vd)) 2366c5904d13Seschrock return (0); 2367c5904d13Seschrock 2368dfbb9432SGeorge Wilson if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 236999653d4eSeschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 237099653d4eSeschrock VDEV_AUX_CORRUPT_DATA); 237199653d4eSeschrock return (-1); 237299653d4eSeschrock } 237399653d4eSeschrock 237499653d4eSeschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 2375ad135b5dSChristopher Siden !SPA_VERSION_IS_SUPPORTED(version) || 237699653d4eSeschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 237799653d4eSeschrock guid != vd->vdev_guid || 237899653d4eSeschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 237999653d4eSeschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 238099653d4eSeschrock VDEV_AUX_CORRUPT_DATA); 238199653d4eSeschrock nvlist_free(label); 238299653d4eSeschrock return (-1); 238399653d4eSeschrock } 238499653d4eSeschrock 238599653d4eSeschrock /* 238699653d4eSeschrock * We don't actually check the pool state here. If it's in fact in 238799653d4eSeschrock * use by another pool, we update this fact on the fly when requested. 238899653d4eSeschrock */ 238999653d4eSeschrock nvlist_free(label); 239099653d4eSeschrock return (0); 239199653d4eSeschrock } 239299653d4eSeschrock 23935cabbc6bSPrashanth Sreenivasa /* 23945cabbc6bSPrashanth Sreenivasa * Free the objects used to store this vdev's spacemaps, and the array 23955cabbc6bSPrashanth Sreenivasa * that points to them. 
23965cabbc6bSPrashanth Sreenivasa */ 239788ecc943SGeorge Wilson void 23985cabbc6bSPrashanth Sreenivasa vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) 23995cabbc6bSPrashanth Sreenivasa { 24005cabbc6bSPrashanth Sreenivasa if (vd->vdev_ms_array == 0) 24015cabbc6bSPrashanth Sreenivasa return; 24025cabbc6bSPrashanth Sreenivasa 24035cabbc6bSPrashanth Sreenivasa objset_t *mos = vd->vdev_spa->spa_meta_objset; 24045cabbc6bSPrashanth Sreenivasa uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; 24055cabbc6bSPrashanth Sreenivasa size_t array_bytes = array_count * sizeof (uint64_t); 24065cabbc6bSPrashanth Sreenivasa uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); 24075cabbc6bSPrashanth Sreenivasa VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, 24085cabbc6bSPrashanth Sreenivasa array_bytes, smobj_array, 0)); 24095cabbc6bSPrashanth Sreenivasa 24105cabbc6bSPrashanth Sreenivasa for (uint64_t i = 0; i < array_count; i++) { 24115cabbc6bSPrashanth Sreenivasa uint64_t smobj = smobj_array[i]; 24125cabbc6bSPrashanth Sreenivasa if (smobj == 0) 24135cabbc6bSPrashanth Sreenivasa continue; 24145cabbc6bSPrashanth Sreenivasa 24155cabbc6bSPrashanth Sreenivasa space_map_free_obj(mos, smobj, tx); 24165cabbc6bSPrashanth Sreenivasa } 24175cabbc6bSPrashanth Sreenivasa 24185cabbc6bSPrashanth Sreenivasa kmem_free(smobj_array, array_bytes); 24195cabbc6bSPrashanth Sreenivasa VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); 24205cabbc6bSPrashanth Sreenivasa vd->vdev_ms_array = 0; 24215cabbc6bSPrashanth Sreenivasa } 24225cabbc6bSPrashanth Sreenivasa 24235cabbc6bSPrashanth Sreenivasa static void 24245cabbc6bSPrashanth Sreenivasa vdev_remove_empty(vdev_t *vd, uint64_t txg) 242588ecc943SGeorge Wilson { 242688ecc943SGeorge Wilson spa_t *spa = vd->vdev_spa; 242788ecc943SGeorge Wilson dmu_tx_t *tx; 242888ecc943SGeorge Wilson 2429215198a6SJoe Stein ASSERT(vd == vd->vdev_top); 2430215198a6SJoe Stein ASSERT3U(txg, ==, spa_syncing_txg(spa)); 243188ecc943SGeorge Wilson 243288ecc943SGeorge 
Wilson if (vd->vdev_ms != NULL) { 24332e4c9986SGeorge Wilson metaslab_group_t *mg = vd->vdev_mg; 24342e4c9986SGeorge Wilson 24352e4c9986SGeorge Wilson metaslab_group_histogram_verify(mg); 24362e4c9986SGeorge Wilson metaslab_class_histogram_verify(mg->mg_class); 24372e4c9986SGeorge Wilson 243888ecc943SGeorge Wilson for (int m = 0; m < vd->vdev_ms_count; m++) { 243988ecc943SGeorge Wilson metaslab_t *msp = vd->vdev_ms[m]; 244088ecc943SGeorge Wilson 24410713e232SGeorge Wilson if (msp == NULL || msp->ms_sm == NULL) 244288ecc943SGeorge Wilson continue; 244388ecc943SGeorge Wilson 24440713e232SGeorge Wilson mutex_enter(&msp->ms_lock); 24452e4c9986SGeorge Wilson /* 24462e4c9986SGeorge Wilson * If the metaslab was not loaded when the vdev 24472e4c9986SGeorge Wilson * was removed then the histogram accounting may 24482e4c9986SGeorge Wilson * not be accurate. Update the histogram information 24492e4c9986SGeorge Wilson * here so that we ensure that the metaslab group 24502e4c9986SGeorge Wilson * and metaslab class are up-to-date. 
24512e4c9986SGeorge Wilson */ 24522e4c9986SGeorge Wilson metaslab_group_histogram_remove(mg, msp); 24532e4c9986SGeorge Wilson 24540713e232SGeorge Wilson VERIFY0(space_map_allocated(msp->ms_sm)); 24550713e232SGeorge Wilson space_map_close(msp->ms_sm); 24560713e232SGeorge Wilson msp->ms_sm = NULL; 24570713e232SGeorge Wilson mutex_exit(&msp->ms_lock); 245888ecc943SGeorge Wilson } 24592e4c9986SGeorge Wilson 24602e4c9986SGeorge Wilson metaslab_group_histogram_verify(mg); 24612e4c9986SGeorge Wilson metaslab_class_histogram_verify(mg->mg_class); 24622e4c9986SGeorge Wilson for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 24632e4c9986SGeorge Wilson ASSERT0(mg->mg_histogram[i]); 246488ecc943SGeorge Wilson } 246588ecc943SGeorge Wilson 24665cabbc6bSPrashanth Sreenivasa tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 24675cabbc6bSPrashanth Sreenivasa vdev_destroy_spacemaps(vd, tx); 2468215198a6SJoe Stein 2469215198a6SJoe Stein if (vd->vdev_islog && vd->vdev_top_zap != 0) { 2470215198a6SJoe Stein vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); 2471215198a6SJoe Stein vd->vdev_top_zap = 0; 2472215198a6SJoe Stein } 247388ecc943SGeorge Wilson dmu_tx_commit(tx); 247488ecc943SGeorge Wilson } 247588ecc943SGeorge Wilson 2476fa9e4066Sahrens void 2477fa9e4066Sahrens vdev_sync_done(vdev_t *vd, uint64_t txg) 2478fa9e4066Sahrens { 2479fa9e4066Sahrens metaslab_t *msp; 248080eb36f2SGeorge Wilson boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2481fa9e4066Sahrens 24825cabbc6bSPrashanth Sreenivasa ASSERT(vdev_is_concrete(vd)); 248388ecc943SGeorge Wilson 2484fa9e4066Sahrens while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2485fa9e4066Sahrens metaslab_sync_done(msp, txg); 248680eb36f2SGeorge Wilson 248780eb36f2SGeorge Wilson if (reassess) 248880eb36f2SGeorge Wilson metaslab_sync_reassess(vd->vdev_mg); 2489fa9e4066Sahrens } 2490fa9e4066Sahrens 2491fa9e4066Sahrens void 2492fa9e4066Sahrens vdev_sync(vdev_t *vd, uint64_t txg) 2493fa9e4066Sahrens { 
2494fa9e4066Sahrens spa_t *spa = vd->vdev_spa; 2495fa9e4066Sahrens vdev_t *lvd; 2496fa9e4066Sahrens metaslab_t *msp; 2497ecc2d604Sbonwick dmu_tx_t *tx; 2498fa9e4066Sahrens 24995cabbc6bSPrashanth Sreenivasa if (range_tree_space(vd->vdev_obsolete_segments) > 0) { 25005cabbc6bSPrashanth Sreenivasa dmu_tx_t *tx; 25015cabbc6bSPrashanth Sreenivasa 25025cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_removing || 25035cabbc6bSPrashanth Sreenivasa vd->vdev_ops == &vdev_indirect_ops); 250488ecc943SGeorge Wilson 25055cabbc6bSPrashanth Sreenivasa tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 25065cabbc6bSPrashanth Sreenivasa vdev_indirect_sync_obsolete(vd, tx); 25075cabbc6bSPrashanth Sreenivasa dmu_tx_commit(tx); 25085cabbc6bSPrashanth Sreenivasa 25095cabbc6bSPrashanth Sreenivasa /* 25105cabbc6bSPrashanth Sreenivasa * If the vdev is indirect, it can't have dirty 25115cabbc6bSPrashanth Sreenivasa * metaslabs or DTLs. 25125cabbc6bSPrashanth Sreenivasa */ 25135cabbc6bSPrashanth Sreenivasa if (vd->vdev_ops == &vdev_indirect_ops) { 25145cabbc6bSPrashanth Sreenivasa ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); 25155cabbc6bSPrashanth Sreenivasa ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); 25165cabbc6bSPrashanth Sreenivasa return; 25175cabbc6bSPrashanth Sreenivasa } 25185cabbc6bSPrashanth Sreenivasa } 25195cabbc6bSPrashanth Sreenivasa 25205cabbc6bSPrashanth Sreenivasa ASSERT(vdev_is_concrete(vd)); 25215cabbc6bSPrashanth Sreenivasa 25225cabbc6bSPrashanth Sreenivasa if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && 25235cabbc6bSPrashanth Sreenivasa !vd->vdev_removing) { 2524ecc2d604Sbonwick ASSERT(vd == vd->vdev_top); 25255cabbc6bSPrashanth Sreenivasa ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 2526ecc2d604Sbonwick tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2527ecc2d604Sbonwick vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2528ecc2d604Sbonwick DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 2529ecc2d604Sbonwick ASSERT(vd->vdev_ms_array != 
0); 2530ecc2d604Sbonwick vdev_config_dirty(vd); 2531ecc2d604Sbonwick dmu_tx_commit(tx); 2532ecc2d604Sbonwick } 2533fa9e4066Sahrens 2534ecc2d604Sbonwick while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2535fa9e4066Sahrens metaslab_sync(msp, txg); 2536ecc2d604Sbonwick (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2537ecc2d604Sbonwick } 2538fa9e4066Sahrens 2539fa9e4066Sahrens while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2540fa9e4066Sahrens vdev_dtl_sync(lvd, txg); 2541fa9e4066Sahrens 25425cabbc6bSPrashanth Sreenivasa /* 25435cabbc6bSPrashanth Sreenivasa * Remove the metadata associated with this vdev once it's empty. 25445cabbc6bSPrashanth Sreenivasa * Note that this is typically used for log/cache device removal; 25455cabbc6bSPrashanth Sreenivasa * we don't empty toplevel vdevs when removing them. But if 25465cabbc6bSPrashanth Sreenivasa * a toplevel happens to be emptied, this is not harmful. 25475cabbc6bSPrashanth Sreenivasa */ 25485cabbc6bSPrashanth Sreenivasa if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { 25495cabbc6bSPrashanth Sreenivasa vdev_remove_empty(vd, txg); 25505cabbc6bSPrashanth Sreenivasa } 25515cabbc6bSPrashanth Sreenivasa 2552fa9e4066Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2553fa9e4066Sahrens } 2554fa9e4066Sahrens 2555fa9e4066Sahrens uint64_t 2556fa9e4066Sahrens vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 2557fa9e4066Sahrens { 2558fa9e4066Sahrens return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2559fa9e4066Sahrens } 2560fa9e4066Sahrens 25613d7072f8Seschrock /* 25623d7072f8Seschrock * Mark the given vdev faulted. A faulted vdev behaves as if the device could 25633d7072f8Seschrock * not be opened, and no I/O is attempted. 
25643d7072f8Seschrock */ 2565fa9e4066Sahrens int 2566069f55e2SEric Schrock vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2567fa9e4066Sahrens { 25684b964adaSGeorge Wilson vdev_t *vd, *tvd; 2569fa9e4066Sahrens 25708f18d1faSGeorge Wilson spa_vdev_state_enter(spa, SCL_NONE); 2571fa9e4066Sahrens 2572c5904d13Seschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2573e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2574e14bb325SJeff Bonwick 25753d7072f8Seschrock if (!vd->vdev_ops->vdev_op_leaf) 2576e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2577fa9e4066Sahrens 25784b964adaSGeorge Wilson tvd = vd->vdev_top; 25794b964adaSGeorge Wilson 2580069f55e2SEric Schrock /* 2581069f55e2SEric Schrock * We don't directly use the aux state here, but if we do a 2582069f55e2SEric Schrock * vdev_reopen(), we need this value to be present to remember why we 2583069f55e2SEric Schrock * were faulted. 2584069f55e2SEric Schrock */ 2585069f55e2SEric Schrock vd->vdev_label_aux = aux; 2586069f55e2SEric Schrock 25873d7072f8Seschrock /* 25883d7072f8Seschrock * Faulted state takes precedence over degraded. 25893d7072f8Seschrock */ 259098d1cbfeSGeorge Wilson vd->vdev_delayed_close = B_FALSE; 25913d7072f8Seschrock vd->vdev_faulted = 1ULL; 25923d7072f8Seschrock vd->vdev_degraded = 0ULL; 2593069f55e2SEric Schrock vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 25943d7072f8Seschrock 25953d7072f8Seschrock /* 2596c79790bcSGeorge Wilson * If this device has the only valid copy of the data, then 2597c79790bcSGeorge Wilson * back off and simply mark the vdev as degraded instead. 
25983d7072f8Seschrock */ 25994b964adaSGeorge Wilson if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 26003d7072f8Seschrock vd->vdev_degraded = 1ULL; 26013d7072f8Seschrock vd->vdev_faulted = 0ULL; 26023d7072f8Seschrock 26033d7072f8Seschrock /* 26043d7072f8Seschrock * If we reopen the device and it's not dead, only then do we 26053d7072f8Seschrock * mark it degraded. 26063d7072f8Seschrock */ 26074b964adaSGeorge Wilson vdev_reopen(tvd); 26083d7072f8Seschrock 2609069f55e2SEric Schrock if (vdev_readable(vd)) 2610069f55e2SEric Schrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 26113d7072f8Seschrock } 26123d7072f8Seschrock 2613e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, vd, 0)); 26143d7072f8Seschrock } 26153d7072f8Seschrock 26163d7072f8Seschrock /* 26173d7072f8Seschrock * Mark the given vdev degraded. A degraded vdev is purely an indication to the 26183d7072f8Seschrock * user that something is wrong. The vdev continues to operate as normal as far 26193d7072f8Seschrock * as I/O is concerned. 26203d7072f8Seschrock */ 26213d7072f8Seschrock int 2622069f55e2SEric Schrock vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 26233d7072f8Seschrock { 2624c5904d13Seschrock vdev_t *vd; 26250a4e9518Sgw 26268f18d1faSGeorge Wilson spa_vdev_state_enter(spa, SCL_NONE); 26273d7072f8Seschrock 2628c5904d13Seschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2629e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2630e14bb325SJeff Bonwick 26310e34b6a7Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 2632e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 26330e34b6a7Sbonwick 26343d7072f8Seschrock /* 26353d7072f8Seschrock * If the vdev is already faulted, then don't do anything. 
26363d7072f8Seschrock */ 2637e14bb325SJeff Bonwick if (vd->vdev_faulted || vd->vdev_degraded) 2638e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, NULL, 0)); 26393d7072f8Seschrock 26403d7072f8Seschrock vd->vdev_degraded = 1ULL; 26413d7072f8Seschrock if (!vdev_is_dead(vd)) 26423d7072f8Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2643069f55e2SEric Schrock aux); 26443d7072f8Seschrock 2645e14bb325SJeff Bonwick return (spa_vdev_state_exit(spa, vd, 0)); 26463d7072f8Seschrock } 26473d7072f8Seschrock 26483d7072f8Seschrock /* 2649f7170741SWill Andrews * Online the given vdev. 2650f7170741SWill Andrews * 2651f7170741SWill Andrews * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 2652f7170741SWill Andrews * spare device should be detached when the device finishes resilvering. 2653f7170741SWill Andrews * Second, the online should be treated like a 'test' online case, so no FMA 2654f7170741SWill Andrews * events are generated if the device fails to open. 
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
	boolean_t wasoffline;
	vdev_state_t oldstate;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	/* Online only makes sense on leaf vdevs (actual devices). */
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	 * Remember the prior state so we can decide below whether this
	 * transition warrants an ESC_ZFS_VDEV_ONLINE event.
	 */
	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
	oldstate = vd->vdev_state;

	tvd = vd->vdev_top;
	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);

	/* XXX - L2ARC 1.0 does not support expansion */
	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
	}

	/* Reopen the top-level vdev so the new state takes effect. */
	vdev_reopen(tvd);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (!vd->vdev_aux) {
		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
			pvd->vdev_expanding = B_FALSE;
	}

	if (newstate)
		*newstate = vd->vdev_state;
	/*
	 * If requested, arrange for any hot spare currently standing in
	 * for this device to be detached once resilvering completes.
	 */
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {

		/* XXX - L2ARC 1.0 does not support expansion */
		if (vd->vdev_aux)
			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	/* Notify only on a genuine offline->online (or degraded->healthy). */
	if (wasoffline ||
	    (oldstate < VDEV_STATE_DEGRADED &&
	    vd->vdev_state >= VDEV_STATE_DEGRADED))
		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Take the leaf vdev identified by 'guid' offline.  Fails with EBUSY if the
 * device holds the only valid copy of some data (see vdev_dtl_required()),
 * or if offlining it would render its top-level vdev unusable.  Caller must
 * hold spa_vdev_top_lock (see vdev_offline() below).
 */
static int
vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd, *tvd;
	int error = 0;
	uint64_t generation;
	metaslab_group_t *mg;

top:
	spa_vdev_state_enter(spa, SCL_ALLOC);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	tvd = vd->vdev_top;
	mg = tvd->vdev_mg;
	generation = spa->spa_config_generation + 1;

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device has the only valid copy of some data,
		 * don't allow it to be offlined. Log devices are always
		 * expendable.
		 */
		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_dtl_required(vd))
			return (spa_vdev_state_exit(spa, NULL, EBUSY));

		/*
		 * If the top-level is a slog and it has had allocations
		 * then proceed.  We check that the vdev's metaslab group
		 * is not NULL since it's possible that we may have just
		 * added this vdev but not yet initialized its metaslabs.
		 */
		if (tvd->vdev_islog && mg != NULL) {
			/*
			 * Prevent any future allocations.
			 */
			metaslab_group_passivate(mg);
			(void) spa_vdev_state_exit(spa, vd, 0);

			/* Drop the state lock while emptying the log. */
			error = spa_reset_logs(spa);

			spa_vdev_state_enter(spa, SCL_ALLOC);

			/*
			 * Check to see if the config has changed.  If it
			 * has, retry the whole operation from the top.
			 */
			if (error || generation != spa->spa_config_generation) {
				metaslab_group_activate(mg);
				if (error)
					return (spa_vdev_state_exit(spa,
					    vd, error));
				(void) spa_vdev_state_exit(spa, vd, 0);
				goto top;
			}
			ASSERT0(tvd->vdev_stat.vs_alloc);
		}

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If the top-level vdev is a log device then just offline
		 * it. Otherwise, if this action results in the top-level
		 * vdev becoming unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(tvd);

		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
		    vdev_is_dead(tvd)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(tvd);
			return (spa_vdev_state_exit(spa, NULL, EBUSY));
		}

		/*
		 * Add the device back into the metaslab rotor so that
		 * once we online the device it's open for business.
		 */
		if (tvd->vdev_islog && mg != NULL)
			metaslab_group_activate(mg);
	}

	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);

	return (spa_vdev_state_exit(spa, vd, 0));
}

/*
 * Public entry point for offlining a vdev: serializes concurrent offline
 * requests via spa_vdev_top_lock and delegates to vdev_offline_locked().
 */
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	int error;

	mutex_enter(&spa->spa_vdev_top_lock);
	error = vdev_offline_locked(spa, guid, flags);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	if (vd == NULL)
		vd = rvd;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	/* Recursively clear all children before touching our own state. */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	/*
	 * It makes no sense to "clear" an indirect vdev.
	 */
	if (!vdev_is_concrete(vd))
		return;

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.  We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded ||
	    !vdev_readable(vd) || !vdev_writeable(vd)) {

		/*
		 * When reopening in response to a clear event, it may be due
		 * to a fmadm repair request.  In this case, if the device is
		 * still broken, we want to still post the ereport again.
		 */
		vd->vdev_forcefault = B_TRUE;

		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
		vd->vdev_cant_read = B_FALSE;
		vd->vdev_cant_write = B_FALSE;

		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);

		vd->vdev_forcefault = B_FALSE;

		if (vd != rvd && vdev_writeable(vd->vdev_top))
			vdev_state_dirty(vd->vdev_top);

		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
	}

	/*
	 * When clearing a FMA-diagnosed fault, we always want to
	 * unspare the device, as we assume that the original spare was
	 * done in response to the FMA fault.
	 */
	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;
}

boolean_t
vdev_is_dead(vdev_t *vd)
{
	/*
	 * Holes and missing devices are always considered "dead".
	 * This simplifies the code since we don't have to check for
	 * these types of devices in the various code paths.
	 * Instead we rely on the fact that we skip over dead devices
	 * before issuing I/O to them.
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED ||
	    vd->vdev_ops == &vdev_hole_ops ||
	    vd->vdev_ops == &vdev_missing_ops);
}

/*
 * A vdev is readable if it is alive and reads haven't been disabled.
 */
boolean_t
vdev_readable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
}

/*
 * A vdev is writeable if it is alive, writes haven't been disabled, and it
 * is concrete (i.e. not an indirect vdev left behind by device removal).
 */
boolean_t
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
	    vdev_is_concrete(vd));
}

boolean_t
vdev_allocatable(vdev_t *vd)
{
	uint64_t state = vd->vdev_state;

	/*
	 * We currently allow allocations from vdevs which may be in the
	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
	 * fails to reopen then we'll catch it later when we're holding
	 * the proper locks.  Note that we have to get the vdev state
	 * in a local variable because although it changes atomically,
	 * we're asking two separate questions about it.
	 */
	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
	    !vd->vdev_cant_write && vdev_is_concrete(vd) &&
	    vd->vdev_mg->mg_initialized);
}

/*
 * Decide whether the given zio can be issued to this vdev, based on the
 * vdev's liveness and its per-direction cant_read/cant_write flags.
 */
boolean_t
vdev_accessible(vdev_t *vd, zio_t *zio)
{
	ASSERT(zio->io_vd == vd);

	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
		return (B_FALSE);

	if (zio->io_type == ZIO_TYPE_READ)
		return (!vd->vdev_cant_read);

	if (zio->io_type == ZIO_TYPE_WRITE)
		return (!vd->vdev_cant_write);

	return (B_TRUE);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd = vd->vdev_top;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	/* Convert the stored timestamp into an age for the caller. */
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_min_asize(vd);
	if (vd->vdev_ops->vdev_op_leaf)
		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	/*
	 * Report expandable space on top-level, non-auxiliary devices only.
	 * The expandable space is reported in terms of metaslab sized units
	 * since that determines how much space the pool can expand.
	 */
	if (vd->vdev_aux == NULL && tvd != NULL) {
		vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
		    spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
	}
	if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
	    vdev_is_concrete(vd)) {
		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
	}

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			for (int t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			cvs->vs_scan_removing = cvd->vdev_removing;
		}
	}
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Reset this vdev's space accounting (alloc/space/dspace) to zero.
 */
void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Recursively zero the scan-progress counter on this vdev and all children,
 * in preparation for a new scrub/resilver pass.
 */
void
vdev_scan_stat_init(vdev_t *vd)
{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_scan_stat_init(vd->vdev_child[c]);

	mutex_enter(&vd->vdev_stat_lock);
	vs->vs_scan_processed = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Account for a completed zio against the vdev it was issued to: update the
 * per-vdev I/O and error counters, and on write failures record the missing
 * txg range in the appropriate DTLs so it can be resilvered later.
 */
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
	spa_t *spa = zio->io_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	/*
	 * If this i/o is a gang leader, it didn't do any actual work.
	 */
	if (zio->io_gang_tree)
		return;

	if (zio->io_error == 0) {
		/*
		 * If this is a root i/o, don't count it -- we've already
		 * counted the top-level vdevs, and vdev_get_stats() will
		 * aggregate them when asked.  This reduces contention on
		 * the root vdev_stat_lock and implicitly handles blocks
		 * that compress away to holes, for which there is no i/o.
		 * (Holes never create vdev children, so all the counters
		 * remain zero, which is what we want.)
		 *
		 * Note: this only applies to successful i/o (io_error == 0)
		 * because unlike i/o counts, errors are not additive.
		 * When reading a ditto block, for example, failure of
		 * one top-level vdev does not imply a root-level error.
		 */
		if (vd == rvd)
			return;

		ASSERT(vd == zio->io_vd);

		if (flags & ZIO_FLAG_IO_BYPASS)
			return;

		mutex_enter(&vd->vdev_stat_lock);

		if (flags & ZIO_FLAG_IO_REPAIR) {
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				dsl_scan_phys_t *scn_phys =
				    &spa->spa_dsl_pool->dp_scan->scn_phys;
				uint64_t *processed = &scn_phys->scn_processed;

				/* XXX cleanup? */
				if (vd->vdev_ops->vdev_op_leaf)
					atomic_add_64(processed, psize);
				vs->vs_scan_processed += psize;
			}

			if (flags & ZIO_FLAG_SELF_HEAL)
				vs->vs_self_healed += psize;
		}

		vs->vs_ops[type]++;
		vs->vs_bytes[type] += psize;

		mutex_exit(&vd->vdev_stat_lock);
		return;
	}

	/* Speculative (prefetch) failures are not real errors. */
	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	/*
	 * If this is an I/O error that is going to be retried, then ignore the
	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
	 * hard errors, when in reality they can happen for any number of
	 * innocuous reasons (bus resets, MPxIO link failure, etc).
	 */
	if (zio->io_error == EIO &&
	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
		return;

	/*
	 * Intent log writes won't propagate their error to the root
	 * I/O so don't mark these types of failures as pool-level
	 * errors.
	 */
	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		return;

	mutex_enter(&vd->vdev_stat_lock);
	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
		if (zio->io_error == ECKSUM)
			vs->vs_checksum_errors++;
		else
			vs->vs_read_errors++;
	}
	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
		vs->vs_write_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_load_state == SPA_LOAD_NONE &&
	    type == ZIO_TYPE_WRITE && txg != 0 &&
	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
	    (flags & ZIO_FLAG_SCAN_THREAD) ||
	    spa->spa_claiming)) {
		/*
		 * This is either a normal write (not a repair), or it's
		 * a repair induced by the scrub thread, or it's a repair
		 * made by zil_claim() during spa_load() in the first txg.
		 * In the normal case, we commit the DTL change in the same
		 * txg as the block was born.  In the scrub-induced repair
		 * case, we know that scrubs run in first-pass syncing context,
		 * so we commit the DTL change in spa_syncing_txg(spa).
		 * In the zil_claim() case, we commit in spa_first_txg(spa).
		 *
		 * We currently do not make DTL entries for failed spontaneous
		 * self-healing writes triggered by normal (non-scrubbing)
		 * reads, because we have no transactional context in which to
		 * do so -- and it's not clear that it'd be desirable anyway.
		 */
		if (vd->vdev_ops->vdev_op_leaf) {
			uint64_t commit_txg = txg;
			if (flags & ZIO_FLAG_SCAN_THREAD) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				ASSERT(spa_sync_pass(spa) == 1);
				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
				commit_txg = spa_syncing_txg(spa);
			} else if (spa->spa_claiming) {
				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
				commit_txg = spa_first_txg(spa);
			}
			ASSERT(commit_txg >= spa_syncing_txg(spa));
			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
				return;
			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
		}
		if (vd != rvd)
			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
	}
}

/*
 * Update the in-core space usage stats for
this vdev, its metaslab class, 3177b24ab676SJeff Bonwick * and the root vdev. 3178fa9e4066Sahrens */ 3179fa9e4066Sahrens void 3180b24ab676SJeff Bonwick vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 3181b24ab676SJeff Bonwick int64_t space_delta) 3182fa9e4066Sahrens { 318399653d4eSeschrock int64_t dspace_delta = space_delta; 31848654d025Sperrin spa_t *spa = vd->vdev_spa; 31858654d025Sperrin vdev_t *rvd = spa->spa_root_vdev; 3186b24ab676SJeff Bonwick metaslab_group_t *mg = vd->vdev_mg; 3187b24ab676SJeff Bonwick metaslab_class_t *mc = mg ? mg->mg_class : NULL; 3188fa9e4066Sahrens 31898654d025Sperrin ASSERT(vd == vd->vdev_top); 319099653d4eSeschrock 31918654d025Sperrin /* 31928654d025Sperrin * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 31938654d025Sperrin * factor. We must calculate this here and not at the root vdev 31948654d025Sperrin * because the root vdev's psize-to-asize is simply the max of its 31958654d025Sperrin * childrens', thus not accurate enough for us. 
31968654d025Sperrin */ 31978654d025Sperrin ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 3198e6ca193dSGeorge Wilson ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 31998654d025Sperrin dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 32008654d025Sperrin vd->vdev_deflate_ratio; 32018654d025Sperrin 32028654d025Sperrin mutex_enter(&vd->vdev_stat_lock); 32038654d025Sperrin vd->vdev_stat.vs_alloc += alloc_delta; 3204b24ab676SJeff Bonwick vd->vdev_stat.vs_space += space_delta; 32058654d025Sperrin vd->vdev_stat.vs_dspace += dspace_delta; 32068654d025Sperrin mutex_exit(&vd->vdev_stat_lock); 32078654d025Sperrin 3208b24ab676SJeff Bonwick if (mc == spa_normal_class(spa)) { 3209fa94a07fSbrendan mutex_enter(&rvd->vdev_stat_lock); 3210fa94a07fSbrendan rvd->vdev_stat.vs_alloc += alloc_delta; 3211b24ab676SJeff Bonwick rvd->vdev_stat.vs_space += space_delta; 3212fa94a07fSbrendan rvd->vdev_stat.vs_dspace += dspace_delta; 3213fa94a07fSbrendan mutex_exit(&rvd->vdev_stat_lock); 3214fa94a07fSbrendan } 3215b24ab676SJeff Bonwick 3216b24ab676SJeff Bonwick if (mc != NULL) { 3217b24ab676SJeff Bonwick ASSERT(rvd == vd->vdev_parent); 3218b24ab676SJeff Bonwick ASSERT(vd->vdev_ms_count != 0); 3219b24ab676SJeff Bonwick 3220b24ab676SJeff Bonwick metaslab_class_space_update(mc, 3221b24ab676SJeff Bonwick alloc_delta, defer_delta, space_delta, dspace_delta); 3222b24ab676SJeff Bonwick } 3223fa9e4066Sahrens } 3224fa9e4066Sahrens 3225fa9e4066Sahrens /* 3226fa9e4066Sahrens * Mark a top-level vdev's config as dirty, placing it on the dirty list 3227fa9e4066Sahrens * so that it will be written out next time the vdev configuration is synced. 3228fa9e4066Sahrens * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	ASSERT(spa_writeable(spa));

	/*
	 * If this is an aux vdev (as with l2cache and spare devices), then we
	 * update the vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		if (c == sav->sav_count) {
			/*
			 * We're being removed.  There's nothing more to do.
			 */
			ASSERT(sav->sav_sync == B_TRUE);
			return;
		}

		sav->sav_sync = B_TRUE;

		/*
		 * The aux config may be under either the l2cache or the
		 * spares key; try l2cache first and fall back to spares.
		 */
		if (nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
		}

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);

		return;
	}

	/*
	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
	 * must either hold SCL_CONFIG as writer, or must be the sync thread
	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_config_dirty_node) &&
		    vdev_is_concrete(vd)) {
			list_insert_head(&spa->spa_config_dirty_list, vd);
		}
	}
}

/*
 * Remove a top-level vdev from the config dirty list; the caller must hold
 * SCL_CONFIG as described in vdev_config_dirty().
 */
void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_CONFIG, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
	list_remove(&spa->spa_config_dirty_list, vd);
}

/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty().  We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
void
vdev_state_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_writeable(spa));
	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock.  The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	if (!list_link_active(&vd->vdev_state_dirty_node) &&
	    vdev_is_concrete(vd))
		list_insert_head(&spa->spa_state_dirty_list, vd);
}

/*
 * Remove a top-level vdev from the state dirty list; the caller must hold
 * SCL_STATE as described in vdev_state_dirty().
 */
void
vdev_state_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes or indirect vdevs into the
			 * decision.
			 */
			if (!vdev_is_concrete(child))
				continue;

			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	/* Recurse upward so ancestors reflect the new child state. */
	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state.
If this is during an open, we don't update the parent 3421ea8dc4b6Seschrock * state, because we're in the process of opening children depth-first. 3422ea8dc4b6Seschrock * Otherwise, we propagate the change to the parent. 3423ea8dc4b6Seschrock * 3424ea8dc4b6Seschrock * If this routine places a device in a faulted state, an appropriate ereport is 3425ea8dc4b6Seschrock * generated. 3426fa9e4066Sahrens */ 3427fa9e4066Sahrens void 3428ea8dc4b6Seschrock vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 3429fa9e4066Sahrens { 3430560e6e96Seschrock uint64_t save_state; 3431c5904d13Seschrock spa_t *spa = vd->vdev_spa; 3432ea8dc4b6Seschrock 3433ea8dc4b6Seschrock if (state == vd->vdev_state) { 3434ea8dc4b6Seschrock vd->vdev_stat.vs_aux = aux; 3435fa9e4066Sahrens return; 3436ea8dc4b6Seschrock } 3437ea8dc4b6Seschrock 3438560e6e96Seschrock save_state = vd->vdev_state; 3439fa9e4066Sahrens 3440fa9e4066Sahrens vd->vdev_state = state; 3441fa9e4066Sahrens vd->vdev_stat.vs_aux = aux; 3442fa9e4066Sahrens 34433d7072f8Seschrock /* 34443d7072f8Seschrock * If we are setting the vdev state to anything but an open state, then 344598d1cbfeSGeorge Wilson * always close the underlying device unless the device has requested 344698d1cbfeSGeorge Wilson * a delayed close (i.e. we're about to remove or fault the device). 344798d1cbfeSGeorge Wilson * Otherwise, we keep accessible but invalid devices open forever. 344898d1cbfeSGeorge Wilson * We don't call vdev_close() itself, because that implies some extra 344998d1cbfeSGeorge Wilson * checks (offline, etc) that we don't want here. This is limited to 345098d1cbfeSGeorge Wilson * leaf devices, because otherwise closing the device will affect other 345198d1cbfeSGeorge Wilson * children. 
345298d1cbfeSGeorge Wilson */ 345398d1cbfeSGeorge Wilson if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 345498d1cbfeSGeorge Wilson vd->vdev_ops->vdev_op_leaf) 34553d7072f8Seschrock vd->vdev_ops->vdev_op_close(vd); 34563d7072f8Seschrock 3457069f55e2SEric Schrock /* 3458069f55e2SEric Schrock * If we have brought this vdev back into service, we need 3459069f55e2SEric Schrock * to notify fmd so that it can gracefully repair any outstanding 3460069f55e2SEric Schrock * cases due to a missing device. We do this in all cases, even those 3461069f55e2SEric Schrock * that probably don't correlate to a repaired fault. This is sure to 3462069f55e2SEric Schrock * catch all cases, and we let the zfs-retire agent sort it out. If 3463069f55e2SEric Schrock * this is a transient state it's OK, as the retire agent will 3464069f55e2SEric Schrock * double-check the state of the vdev before repairing it. 3465069f55e2SEric Schrock */ 3466069f55e2SEric Schrock if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && 3467069f55e2SEric Schrock vd->vdev_prevstate != state) 3468069f55e2SEric Schrock zfs_post_state_change(spa, vd); 3469069f55e2SEric Schrock 34703d7072f8Seschrock if (vd->vdev_removed && 34713d7072f8Seschrock state == VDEV_STATE_CANT_OPEN && 34723d7072f8Seschrock (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 34733d7072f8Seschrock /* 34743d7072f8Seschrock * If the previous state is set to VDEV_STATE_REMOVED, then this 34753d7072f8Seschrock * device was previously marked removed and someone attempted to 34763d7072f8Seschrock * reopen it. If this failed due to a nonexistent device, then 34773d7072f8Seschrock * keep the device in the REMOVED state. We also let this be if 34783d7072f8Seschrock * it is one of our special test online cases, which is only 34793d7072f8Seschrock * attempting to online the device and shouldn't generate an FMA 34803d7072f8Seschrock * fault. 
34813d7072f8Seschrock */ 34823d7072f8Seschrock vd->vdev_state = VDEV_STATE_REMOVED; 34833d7072f8Seschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 34843d7072f8Seschrock } else if (state == VDEV_STATE_REMOVED) { 34853d7072f8Seschrock vd->vdev_removed = B_TRUE; 34863d7072f8Seschrock } else if (state == VDEV_STATE_CANT_OPEN) { 3487ea8dc4b6Seschrock /* 3488cb04b873SMark J Musante * If we fail to open a vdev during an import or recovery, we 3489cb04b873SMark J Musante * mark it as "not available", which signifies that it was 3490cb04b873SMark J Musante * never there to begin with. Failure to open such a device 3491cb04b873SMark J Musante * is not considered an error. 3492ea8dc4b6Seschrock */ 3493cb04b873SMark J Musante if ((spa_load_state(spa) == SPA_LOAD_IMPORT || 3494cb04b873SMark J Musante spa_load_state(spa) == SPA_LOAD_RECOVER) && 3495560e6e96Seschrock vd->vdev_ops->vdev_op_leaf) 3496560e6e96Seschrock vd->vdev_not_present = 1; 3497560e6e96Seschrock 3498560e6e96Seschrock /* 3499560e6e96Seschrock * Post the appropriate ereport. If the 'prevstate' field is 3500560e6e96Seschrock * set to something other than VDEV_STATE_UNKNOWN, it indicates 3501560e6e96Seschrock * that this is part of a vdev_reopen(). In this case, we don't 3502560e6e96Seschrock * want to post the ereport if the device was already in the 3503560e6e96Seschrock * CANT_OPEN state beforehand. 35043d7072f8Seschrock * 35053d7072f8Seschrock * If the 'checkremove' flag is set, then this is an attempt to 35063d7072f8Seschrock * online the device in response to an insertion event. If we 35073d7072f8Seschrock * hit this case, then we have detected an insertion event for a 35083d7072f8Seschrock * faulted or offline device that wasn't in the removed state. 35093d7072f8Seschrock * In this scenario, we don't post an ereport because we are 35103d7072f8Seschrock * about to replace the device, or attempt an online with 35113d7072f8Seschrock * vdev_forcefault, which will generate the fault for us. 
3512560e6e96Seschrock */ 35133d7072f8Seschrock if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 35143d7072f8Seschrock !vd->vdev_not_present && !vd->vdev_checkremove && 3515c5904d13Seschrock vd != spa->spa_root_vdev) { 3516ea8dc4b6Seschrock const char *class; 3517ea8dc4b6Seschrock 3518ea8dc4b6Seschrock switch (aux) { 3519ea8dc4b6Seschrock case VDEV_AUX_OPEN_FAILED: 3520ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 3521ea8dc4b6Seschrock break; 3522ea8dc4b6Seschrock case VDEV_AUX_CORRUPT_DATA: 3523ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 3524ea8dc4b6Seschrock break; 3525ea8dc4b6Seschrock case VDEV_AUX_NO_REPLICAS: 3526ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 3527ea8dc4b6Seschrock break; 3528ea8dc4b6Seschrock case VDEV_AUX_BAD_GUID_SUM: 3529ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 3530ea8dc4b6Seschrock break; 3531ea8dc4b6Seschrock case VDEV_AUX_TOO_SMALL: 3532ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 3533ea8dc4b6Seschrock break; 3534ea8dc4b6Seschrock case VDEV_AUX_BAD_LABEL: 3535ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 3536ea8dc4b6Seschrock break; 3537ea8dc4b6Seschrock default: 3538ea8dc4b6Seschrock class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 3539ea8dc4b6Seschrock } 3540ea8dc4b6Seschrock 3541c5904d13Seschrock zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 3542ea8dc4b6Seschrock } 3543ea8dc4b6Seschrock 35443d7072f8Seschrock /* Erase any notion of persistent removed state */ 35453d7072f8Seschrock vd->vdev_removed = B_FALSE; 35463d7072f8Seschrock } else { 35473d7072f8Seschrock vd->vdev_removed = B_FALSE; 35483d7072f8Seschrock } 3549ea8dc4b6Seschrock 35508b33d774STim Haley if (!isopen && vd->vdev_parent) 35518b33d774STim Haley vdev_propagate_state(vd->vdev_parent); 3552fa9e4066Sahrens } 355315e6edf1Sgw 355415e6edf1Sgw /* 355515e6edf1Sgw * Check the vdev configuration to ensure that it's capable of supporting 3556c8811bd3SToomas Soome * a root pool. 
We do not support partial configuration. 3557c8811bd3SToomas Soome * In addition, only a single top-level vdev is allowed. 355815e6edf1Sgw */ 355915e6edf1Sgw boolean_t 356015e6edf1Sgw vdev_is_bootable(vdev_t *vd) 356115e6edf1Sgw { 356215e6edf1Sgw if (!vd->vdev_ops->vdev_op_leaf) { 356315e6edf1Sgw char *vdev_type = vd->vdev_ops->vdev_op_type; 356415e6edf1Sgw 356515e6edf1Sgw if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 356615e6edf1Sgw vd->vdev_children > 1) { 356715e6edf1Sgw return (B_FALSE); 35685cabbc6bSPrashanth Sreenivasa } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || 35695cabbc6bSPrashanth Sreenivasa strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { 357015e6edf1Sgw return (B_FALSE); 357115e6edf1Sgw } 357215e6edf1Sgw } 357315e6edf1Sgw 3574573ca77eSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) { 357515e6edf1Sgw if (!vdev_is_bootable(vd->vdev_child[c])) 357615e6edf1Sgw return (B_FALSE); 357715e6edf1Sgw } 357815e6edf1Sgw return (B_TRUE); 357915e6edf1Sgw } 3580e6ca193dSGeorge Wilson 35815cabbc6bSPrashanth Sreenivasa boolean_t 35825cabbc6bSPrashanth Sreenivasa vdev_is_concrete(vdev_t *vd) 35835cabbc6bSPrashanth Sreenivasa { 35845cabbc6bSPrashanth Sreenivasa vdev_ops_t *ops = vd->vdev_ops; 35855cabbc6bSPrashanth Sreenivasa if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || 35865cabbc6bSPrashanth Sreenivasa ops == &vdev_missing_ops || ops == &vdev_root_ops) { 35875cabbc6bSPrashanth Sreenivasa return (B_FALSE); 35885cabbc6bSPrashanth Sreenivasa } else { 35895cabbc6bSPrashanth Sreenivasa return (B_TRUE); 35905cabbc6bSPrashanth Sreenivasa } 35915cabbc6bSPrashanth Sreenivasa } 35925cabbc6bSPrashanth Sreenivasa 359388ecc943SGeorge Wilson /* 359488ecc943SGeorge Wilson * Load the state from the original vdev tree (ovd) which 359588ecc943SGeorge Wilson * we've retrieved from the MOS config object. 
If the original 35964b964adaSGeorge Wilson * vdev was offline or faulted then we transfer that state to the 35974b964adaSGeorge Wilson * device in the current vdev tree (nvd). 359888ecc943SGeorge Wilson */ 3599e6ca193dSGeorge Wilson void 360088ecc943SGeorge Wilson vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3601e6ca193dSGeorge Wilson { 360288ecc943SGeorge Wilson spa_t *spa = nvd->vdev_spa; 3603e6ca193dSGeorge Wilson 36044b964adaSGeorge Wilson ASSERT(nvd->vdev_top->vdev_islog); 360588ecc943SGeorge Wilson ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 360688ecc943SGeorge Wilson ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3607e6ca193dSGeorge Wilson 360888ecc943SGeorge Wilson for (int c = 0; c < nvd->vdev_children; c++) 360988ecc943SGeorge Wilson vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3610e6ca193dSGeorge Wilson 36114b964adaSGeorge Wilson if (nvd->vdev_ops->vdev_op_leaf) { 3612e6ca193dSGeorge Wilson /* 36134b964adaSGeorge Wilson * Restore the persistent vdev state 3614e6ca193dSGeorge Wilson */ 361588ecc943SGeorge Wilson nvd->vdev_offline = ovd->vdev_offline; 36164b964adaSGeorge Wilson nvd->vdev_faulted = ovd->vdev_faulted; 36174b964adaSGeorge Wilson nvd->vdev_degraded = ovd->vdev_degraded; 36184b964adaSGeorge Wilson nvd->vdev_removed = ovd->vdev_removed; 3619e6ca193dSGeorge Wilson } 3620e6ca193dSGeorge Wilson } 3621573ca77eSGeorge Wilson 36224b964adaSGeorge Wilson /* 36234b964adaSGeorge Wilson * Determine if a log device has valid content. If the vdev was 36244b964adaSGeorge Wilson * removed or faulted in the MOS config then we know that 36254b964adaSGeorge Wilson * the content on the log device has already been written to the pool. 
36264b964adaSGeorge Wilson */ 36274b964adaSGeorge Wilson boolean_t 36284b964adaSGeorge Wilson vdev_log_state_valid(vdev_t *vd) 36294b964adaSGeorge Wilson { 36304b964adaSGeorge Wilson if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 36314b964adaSGeorge Wilson !vd->vdev_removed) 36324b964adaSGeorge Wilson return (B_TRUE); 36334b964adaSGeorge Wilson 36344b964adaSGeorge Wilson for (int c = 0; c < vd->vdev_children; c++) 36354b964adaSGeorge Wilson if (vdev_log_state_valid(vd->vdev_child[c])) 36364b964adaSGeorge Wilson return (B_TRUE); 36374b964adaSGeorge Wilson 36384b964adaSGeorge Wilson return (B_FALSE); 36394b964adaSGeorge Wilson } 36404b964adaSGeorge Wilson 3641573ca77eSGeorge Wilson /* 3642573ca77eSGeorge Wilson * Expand a vdev if possible. 3643573ca77eSGeorge Wilson */ 3644573ca77eSGeorge Wilson void 3645573ca77eSGeorge Wilson vdev_expand(vdev_t *vd, uint64_t txg) 3646573ca77eSGeorge Wilson { 3647573ca77eSGeorge Wilson ASSERT(vd->vdev_top == vd); 3648573ca77eSGeorge Wilson ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3649573ca77eSGeorge Wilson 36505cabbc6bSPrashanth Sreenivasa vdev_set_deflate_ratio(vd); 36515cabbc6bSPrashanth Sreenivasa 36525cabbc6bSPrashanth Sreenivasa if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && 36535cabbc6bSPrashanth Sreenivasa vdev_is_concrete(vd)) { 3654573ca77eSGeorge Wilson VERIFY(vdev_metaslab_init(vd, txg) == 0); 3655573ca77eSGeorge Wilson vdev_config_dirty(vd); 3656573ca77eSGeorge Wilson } 3657573ca77eSGeorge Wilson } 36581195e687SMark J Musante 36591195e687SMark J Musante /* 36601195e687SMark J Musante * Split a vdev. 
36611195e687SMark J Musante */ 36621195e687SMark J Musante void 36631195e687SMark J Musante vdev_split(vdev_t *vd) 36641195e687SMark J Musante { 36651195e687SMark J Musante vdev_t *cvd, *pvd = vd->vdev_parent; 36661195e687SMark J Musante 36671195e687SMark J Musante vdev_remove_child(pvd, vd); 36681195e687SMark J Musante vdev_compact_children(pvd); 36691195e687SMark J Musante 36701195e687SMark J Musante cvd = pvd->vdev_child[0]; 36711195e687SMark J Musante if (pvd->vdev_children == 1) { 36721195e687SMark J Musante vdev_remove_parent(cvd); 36731195e687SMark J Musante cvd->vdev_splitting = B_TRUE; 36741195e687SMark J Musante } 36751195e687SMark J Musante vdev_propagate_state(cvd); 36761195e687SMark J Musante } 3677283b8460SGeorge.Wilson 3678283b8460SGeorge.Wilson void 3679283b8460SGeorge.Wilson vdev_deadman(vdev_t *vd) 3680283b8460SGeorge.Wilson { 3681283b8460SGeorge.Wilson for (int c = 0; c < vd->vdev_children; c++) { 3682283b8460SGeorge.Wilson vdev_t *cvd = vd->vdev_child[c]; 3683283b8460SGeorge.Wilson 3684283b8460SGeorge.Wilson vdev_deadman(cvd); 3685283b8460SGeorge.Wilson } 3686283b8460SGeorge.Wilson 3687283b8460SGeorge.Wilson if (vd->vdev_ops->vdev_op_leaf) { 3688283b8460SGeorge.Wilson vdev_queue_t *vq = &vd->vdev_queue; 3689283b8460SGeorge.Wilson 3690283b8460SGeorge.Wilson mutex_enter(&vq->vq_lock); 369169962b56SMatthew Ahrens if (avl_numnodes(&vq->vq_active_tree) > 0) { 3692283b8460SGeorge.Wilson spa_t *spa = vd->vdev_spa; 3693283b8460SGeorge.Wilson zio_t *fio; 3694283b8460SGeorge.Wilson uint64_t delta; 3695283b8460SGeorge.Wilson 3696283b8460SGeorge.Wilson /* 3697283b8460SGeorge.Wilson * Look at the head of all the pending queues, 3698283b8460SGeorge.Wilson * if any I/O has been outstanding for longer than 3699283b8460SGeorge.Wilson * the spa_deadman_synctime we panic the system. 
3700283b8460SGeorge.Wilson */ 370169962b56SMatthew Ahrens fio = avl_first(&vq->vq_active_tree); 3702c55e05cbSMatthew Ahrens delta = gethrtime() - fio->io_timestamp; 3703c55e05cbSMatthew Ahrens if (delta > spa_deadman_synctime(spa)) { 3704*3ee8c80cSPavel Zakharov vdev_dbgmsg(vd, "SLOW IO: zio timestamp " 3705*3ee8c80cSPavel Zakharov "%lluns, delta %lluns, last io %lluns", 3706*3ee8c80cSPavel Zakharov fio->io_timestamp, (u_longlong_t)delta, 3707283b8460SGeorge.Wilson vq->vq_io_complete_ts); 3708283b8460SGeorge.Wilson fm_panic("I/O to pool '%s' appears to be " 3709283b8460SGeorge.Wilson "hung.", spa_name(spa)); 3710283b8460SGeorge.Wilson } 3711283b8460SGeorge.Wilson } 3712283b8460SGeorge.Wilson mutex_exit(&vq->vq_lock); 3713283b8460SGeorge.Wilson } 3714283b8460SGeorge.Wilson } 3715