1*5cabbc6bSPrashanth Sreenivasa /* 2*5cabbc6bSPrashanth Sreenivasa * CDDL HEADER START 3*5cabbc6bSPrashanth Sreenivasa * 4*5cabbc6bSPrashanth Sreenivasa * The contents of this file are subject to the terms of the 5*5cabbc6bSPrashanth Sreenivasa * Common Development and Distribution License (the "License"). 6*5cabbc6bSPrashanth Sreenivasa * You may not use this file except in compliance with the License. 7*5cabbc6bSPrashanth Sreenivasa * 8*5cabbc6bSPrashanth Sreenivasa * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*5cabbc6bSPrashanth Sreenivasa * or http://www.opensolaris.org/os/licensing. 10*5cabbc6bSPrashanth Sreenivasa * See the License for the specific language governing permissions 11*5cabbc6bSPrashanth Sreenivasa * and limitations under the License. 12*5cabbc6bSPrashanth Sreenivasa * 13*5cabbc6bSPrashanth Sreenivasa * When distributing Covered Code, include this CDDL HEADER in each 14*5cabbc6bSPrashanth Sreenivasa * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*5cabbc6bSPrashanth Sreenivasa * If applicable, add the following below this CDDL HEADER, with the 16*5cabbc6bSPrashanth Sreenivasa * fields enclosed by brackets "[]" replaced with your own identifying 17*5cabbc6bSPrashanth Sreenivasa * information: Portions Copyright [yyyy] [name of copyright owner] 18*5cabbc6bSPrashanth Sreenivasa * 19*5cabbc6bSPrashanth Sreenivasa * CDDL HEADER END 20*5cabbc6bSPrashanth Sreenivasa */ 21*5cabbc6bSPrashanth Sreenivasa 22*5cabbc6bSPrashanth Sreenivasa /* 23*5cabbc6bSPrashanth Sreenivasa * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24*5cabbc6bSPrashanth Sreenivasa * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
25*5cabbc6bSPrashanth Sreenivasa */ 26*5cabbc6bSPrashanth Sreenivasa 27*5cabbc6bSPrashanth Sreenivasa #include <sys/zfs_context.h> 28*5cabbc6bSPrashanth Sreenivasa #include <sys/spa_impl.h> 29*5cabbc6bSPrashanth Sreenivasa #include <sys/dmu.h> 30*5cabbc6bSPrashanth Sreenivasa #include <sys/dmu_tx.h> 31*5cabbc6bSPrashanth Sreenivasa #include <sys/zap.h> 32*5cabbc6bSPrashanth Sreenivasa #include <sys/vdev_impl.h> 33*5cabbc6bSPrashanth Sreenivasa #include <sys/metaslab.h> 34*5cabbc6bSPrashanth Sreenivasa #include <sys/metaslab_impl.h> 35*5cabbc6bSPrashanth Sreenivasa #include <sys/uberblock_impl.h> 36*5cabbc6bSPrashanth Sreenivasa #include <sys/txg.h> 37*5cabbc6bSPrashanth Sreenivasa #include <sys/avl.h> 38*5cabbc6bSPrashanth Sreenivasa #include <sys/bpobj.h> 39*5cabbc6bSPrashanth Sreenivasa #include <sys/dsl_pool.h> 40*5cabbc6bSPrashanth Sreenivasa #include <sys/dsl_synctask.h> 41*5cabbc6bSPrashanth Sreenivasa #include <sys/dsl_dir.h> 42*5cabbc6bSPrashanth Sreenivasa #include <sys/arc.h> 43*5cabbc6bSPrashanth Sreenivasa #include <sys/zfeature.h> 44*5cabbc6bSPrashanth Sreenivasa #include <sys/vdev_indirect_births.h> 45*5cabbc6bSPrashanth Sreenivasa #include <sys/vdev_indirect_mapping.h> 46*5cabbc6bSPrashanth Sreenivasa #include <sys/abd.h> 47*5cabbc6bSPrashanth Sreenivasa 48*5cabbc6bSPrashanth Sreenivasa /* 49*5cabbc6bSPrashanth Sreenivasa * This file contains the necessary logic to remove vdevs from a 50*5cabbc6bSPrashanth Sreenivasa * storage pool. Currently, the only devices that can be removed 51*5cabbc6bSPrashanth Sreenivasa * are log, cache, and spare devices; and top level vdevs from a pool 52*5cabbc6bSPrashanth Sreenivasa * w/o raidz. (Note that members of a mirror can also be removed 53*5cabbc6bSPrashanth Sreenivasa * by the detach operation.) 
54*5cabbc6bSPrashanth Sreenivasa * 55*5cabbc6bSPrashanth Sreenivasa * Log vdevs are removed by evacuating them and then turning the vdev 56*5cabbc6bSPrashanth Sreenivasa * into a hole vdev while holding spa config locks. 57*5cabbc6bSPrashanth Sreenivasa * 58*5cabbc6bSPrashanth Sreenivasa * Top level vdevs are removed and converted into an indirect vdev via 59*5cabbc6bSPrashanth Sreenivasa * a multi-step process: 60*5cabbc6bSPrashanth Sreenivasa * 61*5cabbc6bSPrashanth Sreenivasa * - Disable allocations from this device (spa_vdev_remove_top). 62*5cabbc6bSPrashanth Sreenivasa * 63*5cabbc6bSPrashanth Sreenivasa * - From a new thread (spa_vdev_remove_thread), copy data from 64*5cabbc6bSPrashanth Sreenivasa * the removing vdev to a different vdev. The copy happens in open 65*5cabbc6bSPrashanth Sreenivasa * context (spa_vdev_copy_impl) and issues a sync task 66*5cabbc6bSPrashanth Sreenivasa * (vdev_mapping_sync) so the sync thread can update the partial 67*5cabbc6bSPrashanth Sreenivasa * indirect mappings in core and on disk. 68*5cabbc6bSPrashanth Sreenivasa * 69*5cabbc6bSPrashanth Sreenivasa * - If a free happens during a removal, it is freed from the 70*5cabbc6bSPrashanth Sreenivasa * removing vdev, and if it has already been copied, from the new 71*5cabbc6bSPrashanth Sreenivasa * location as well (free_from_removing_vdev). 72*5cabbc6bSPrashanth Sreenivasa * 73*5cabbc6bSPrashanth Sreenivasa * - After the removal is completed, the copy thread converts the vdev 74*5cabbc6bSPrashanth Sreenivasa * into an indirect vdev (vdev_remove_complete) before instructing 75*5cabbc6bSPrashanth Sreenivasa * the sync thread to destroy the space maps and finish the removal 76*5cabbc6bSPrashanth Sreenivasa * (spa_finish_removal). 
77*5cabbc6bSPrashanth Sreenivasa */ 78*5cabbc6bSPrashanth Sreenivasa 79*5cabbc6bSPrashanth Sreenivasa typedef struct vdev_copy_arg { 80*5cabbc6bSPrashanth Sreenivasa metaslab_t *vca_msp; 81*5cabbc6bSPrashanth Sreenivasa uint64_t vca_outstanding_bytes; 82*5cabbc6bSPrashanth Sreenivasa kcondvar_t vca_cv; 83*5cabbc6bSPrashanth Sreenivasa kmutex_t vca_lock; 84*5cabbc6bSPrashanth Sreenivasa } vdev_copy_arg_t; 85*5cabbc6bSPrashanth Sreenivasa 86*5cabbc6bSPrashanth Sreenivasa typedef struct vdev_copy_seg_arg { 87*5cabbc6bSPrashanth Sreenivasa vdev_copy_arg_t *vcsa_copy_arg; 88*5cabbc6bSPrashanth Sreenivasa uint64_t vcsa_txg; 89*5cabbc6bSPrashanth Sreenivasa dva_t *vcsa_dest_dva; 90*5cabbc6bSPrashanth Sreenivasa blkptr_t *vcsa_dest_bp; 91*5cabbc6bSPrashanth Sreenivasa } vdev_copy_seg_arg_t; 92*5cabbc6bSPrashanth Sreenivasa 93*5cabbc6bSPrashanth Sreenivasa /* 94*5cabbc6bSPrashanth Sreenivasa * The maximum amount of allowed data we're allowed to copy from a device 95*5cabbc6bSPrashanth Sreenivasa * at a time when removing it. 96*5cabbc6bSPrashanth Sreenivasa */ 97*5cabbc6bSPrashanth Sreenivasa int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; 98*5cabbc6bSPrashanth Sreenivasa 99*5cabbc6bSPrashanth Sreenivasa /* 100*5cabbc6bSPrashanth Sreenivasa * The largest contiguous segment that we will attempt to allocate when 101*5cabbc6bSPrashanth Sreenivasa * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If 102*5cabbc6bSPrashanth Sreenivasa * there is a performance problem with attempting to allocate large blocks, 103*5cabbc6bSPrashanth Sreenivasa * consider decreasing this. 104*5cabbc6bSPrashanth Sreenivasa * 105*5cabbc6bSPrashanth Sreenivasa * Note: we will issue I/Os of up to this size. The mpt driver does not 106*5cabbc6bSPrashanth Sreenivasa * respond well to I/Os larger than 1MB, so we set this to 1MB. 
(When 107*5cabbc6bSPrashanth Sreenivasa * mpt processes an I/O larger than 1MB, it needs to do an allocation of 108*5cabbc6bSPrashanth Sreenivasa * 2 physically contiguous pages; if this allocation fails, mpt will drop 109*5cabbc6bSPrashanth Sreenivasa * the I/O and hang the device.) 110*5cabbc6bSPrashanth Sreenivasa */ 111*5cabbc6bSPrashanth Sreenivasa int zfs_remove_max_segment = 1024 * 1024; 112*5cabbc6bSPrashanth Sreenivasa 113*5cabbc6bSPrashanth Sreenivasa #define VDEV_REMOVAL_ZAP_OBJS "lzap" 114*5cabbc6bSPrashanth Sreenivasa 115*5cabbc6bSPrashanth Sreenivasa static void spa_vdev_remove_thread(void *arg); 116*5cabbc6bSPrashanth Sreenivasa 117*5cabbc6bSPrashanth Sreenivasa static void 118*5cabbc6bSPrashanth Sreenivasa spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) 119*5cabbc6bSPrashanth Sreenivasa { 120*5cabbc6bSPrashanth Sreenivasa VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, 121*5cabbc6bSPrashanth Sreenivasa DMU_POOL_DIRECTORY_OBJECT, 122*5cabbc6bSPrashanth Sreenivasa DMU_POOL_REMOVING, sizeof (uint64_t), 123*5cabbc6bSPrashanth Sreenivasa sizeof (spa->spa_removing_phys) / sizeof (uint64_t), 124*5cabbc6bSPrashanth Sreenivasa &spa->spa_removing_phys, tx)); 125*5cabbc6bSPrashanth Sreenivasa } 126*5cabbc6bSPrashanth Sreenivasa 127*5cabbc6bSPrashanth Sreenivasa static nvlist_t * 128*5cabbc6bSPrashanth Sreenivasa spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 129*5cabbc6bSPrashanth Sreenivasa { 130*5cabbc6bSPrashanth Sreenivasa for (int i = 0; i < count; i++) { 131*5cabbc6bSPrashanth Sreenivasa uint64_t guid = 132*5cabbc6bSPrashanth Sreenivasa fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); 133*5cabbc6bSPrashanth Sreenivasa 134*5cabbc6bSPrashanth Sreenivasa if (guid == target_guid) 135*5cabbc6bSPrashanth Sreenivasa return (nvpp[i]); 136*5cabbc6bSPrashanth Sreenivasa } 137*5cabbc6bSPrashanth Sreenivasa 138*5cabbc6bSPrashanth Sreenivasa return (NULL); 139*5cabbc6bSPrashanth Sreenivasa } 140*5cabbc6bSPrashanth Sreenivasa 
141*5cabbc6bSPrashanth Sreenivasa static void 142*5cabbc6bSPrashanth Sreenivasa spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 143*5cabbc6bSPrashanth Sreenivasa nvlist_t *dev_to_remove) 144*5cabbc6bSPrashanth Sreenivasa { 145*5cabbc6bSPrashanth Sreenivasa nvlist_t **newdev = NULL; 146*5cabbc6bSPrashanth Sreenivasa 147*5cabbc6bSPrashanth Sreenivasa if (count > 1) 148*5cabbc6bSPrashanth Sreenivasa newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 149*5cabbc6bSPrashanth Sreenivasa 150*5cabbc6bSPrashanth Sreenivasa for (int i = 0, j = 0; i < count; i++) { 151*5cabbc6bSPrashanth Sreenivasa if (dev[i] == dev_to_remove) 152*5cabbc6bSPrashanth Sreenivasa continue; 153*5cabbc6bSPrashanth Sreenivasa VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 154*5cabbc6bSPrashanth Sreenivasa } 155*5cabbc6bSPrashanth Sreenivasa 156*5cabbc6bSPrashanth Sreenivasa VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 157*5cabbc6bSPrashanth Sreenivasa VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 158*5cabbc6bSPrashanth Sreenivasa 159*5cabbc6bSPrashanth Sreenivasa for (int i = 0; i < count - 1; i++) 160*5cabbc6bSPrashanth Sreenivasa nvlist_free(newdev[i]); 161*5cabbc6bSPrashanth Sreenivasa 162*5cabbc6bSPrashanth Sreenivasa if (count > 1) 163*5cabbc6bSPrashanth Sreenivasa kmem_free(newdev, (count - 1) * sizeof (void *)); 164*5cabbc6bSPrashanth Sreenivasa } 165*5cabbc6bSPrashanth Sreenivasa 166*5cabbc6bSPrashanth Sreenivasa static spa_vdev_removal_t * 167*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_create(vdev_t *vd) 168*5cabbc6bSPrashanth Sreenivasa { 169*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); 170*5cabbc6bSPrashanth Sreenivasa mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); 171*5cabbc6bSPrashanth Sreenivasa cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); 172*5cabbc6bSPrashanth Sreenivasa svr->svr_allocd_segs = range_tree_create(NULL, NULL); 
173*5cabbc6bSPrashanth Sreenivasa svr->svr_vdev = vd; 174*5cabbc6bSPrashanth Sreenivasa 175*5cabbc6bSPrashanth Sreenivasa for (int i = 0; i < TXG_SIZE; i++) { 176*5cabbc6bSPrashanth Sreenivasa svr->svr_frees[i] = range_tree_create(NULL, NULL); 177*5cabbc6bSPrashanth Sreenivasa list_create(&svr->svr_new_segments[i], 178*5cabbc6bSPrashanth Sreenivasa sizeof (vdev_indirect_mapping_entry_t), 179*5cabbc6bSPrashanth Sreenivasa offsetof(vdev_indirect_mapping_entry_t, vime_node)); 180*5cabbc6bSPrashanth Sreenivasa } 181*5cabbc6bSPrashanth Sreenivasa 182*5cabbc6bSPrashanth Sreenivasa return (svr); 183*5cabbc6bSPrashanth Sreenivasa } 184*5cabbc6bSPrashanth Sreenivasa 185*5cabbc6bSPrashanth Sreenivasa void 186*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_destroy(spa_vdev_removal_t *svr) 187*5cabbc6bSPrashanth Sreenivasa { 188*5cabbc6bSPrashanth Sreenivasa for (int i = 0; i < TXG_SIZE; i++) { 189*5cabbc6bSPrashanth Sreenivasa ASSERT0(svr->svr_bytes_done[i]); 190*5cabbc6bSPrashanth Sreenivasa ASSERT0(svr->svr_max_offset_to_sync[i]); 191*5cabbc6bSPrashanth Sreenivasa range_tree_destroy(svr->svr_frees[i]); 192*5cabbc6bSPrashanth Sreenivasa list_destroy(&svr->svr_new_segments[i]); 193*5cabbc6bSPrashanth Sreenivasa } 194*5cabbc6bSPrashanth Sreenivasa 195*5cabbc6bSPrashanth Sreenivasa range_tree_destroy(svr->svr_allocd_segs); 196*5cabbc6bSPrashanth Sreenivasa mutex_destroy(&svr->svr_lock); 197*5cabbc6bSPrashanth Sreenivasa cv_destroy(&svr->svr_cv); 198*5cabbc6bSPrashanth Sreenivasa kmem_free(svr, sizeof (*svr)); 199*5cabbc6bSPrashanth Sreenivasa } 200*5cabbc6bSPrashanth Sreenivasa 201*5cabbc6bSPrashanth Sreenivasa /* 202*5cabbc6bSPrashanth Sreenivasa * This is called as a synctask in the txg in which we will mark this vdev 203*5cabbc6bSPrashanth Sreenivasa * as removing (in the config stored in the MOS). 
204*5cabbc6bSPrashanth Sreenivasa * 205*5cabbc6bSPrashanth Sreenivasa * It begins the evacuation of a toplevel vdev by: 206*5cabbc6bSPrashanth Sreenivasa * - initializing the spa_removing_phys which tracks this removal 207*5cabbc6bSPrashanth Sreenivasa * - computing the amount of space to remove for accounting purposes 208*5cabbc6bSPrashanth Sreenivasa * - dirtying all dbufs in the spa_config_object 209*5cabbc6bSPrashanth Sreenivasa * - creating the spa_vdev_removal 210*5cabbc6bSPrashanth Sreenivasa * - starting the spa_vdev_remove_thread 211*5cabbc6bSPrashanth Sreenivasa */ 212*5cabbc6bSPrashanth Sreenivasa static void 213*5cabbc6bSPrashanth Sreenivasa vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) 214*5cabbc6bSPrashanth Sreenivasa { 215*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = arg; 216*5cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 217*5cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 218*5cabbc6bSPrashanth Sreenivasa objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; 219*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = NULL; 220*5cabbc6bSPrashanth Sreenivasa uint64_t txg = dmu_tx_get_txg(tx); 221*5cabbc6bSPrashanth Sreenivasa 222*5cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); 223*5cabbc6bSPrashanth Sreenivasa svr = spa_vdev_removal_create(vd); 224*5cabbc6bSPrashanth Sreenivasa 225*5cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_removing); 226*5cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); 227*5cabbc6bSPrashanth Sreenivasa 228*5cabbc6bSPrashanth Sreenivasa spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); 229*5cabbc6bSPrashanth Sreenivasa if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 230*5cabbc6bSPrashanth Sreenivasa /* 231*5cabbc6bSPrashanth Sreenivasa * By activating the OBSOLETE_COUNTS feature, we prevent 232*5cabbc6bSPrashanth Sreenivasa * the pool from being downgraded and ensure that the 233*5cabbc6bSPrashanth Sreenivasa 
* refcounts are precise. 234*5cabbc6bSPrashanth Sreenivasa */ 235*5cabbc6bSPrashanth Sreenivasa spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 236*5cabbc6bSPrashanth Sreenivasa uint64_t one = 1; 237*5cabbc6bSPrashanth Sreenivasa VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, 238*5cabbc6bSPrashanth Sreenivasa VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, 239*5cabbc6bSPrashanth Sreenivasa &one, tx)); 240*5cabbc6bSPrashanth Sreenivasa ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0); 241*5cabbc6bSPrashanth Sreenivasa } 242*5cabbc6bSPrashanth Sreenivasa 243*5cabbc6bSPrashanth Sreenivasa vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); 244*5cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_mapping = 245*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_open(mos, vic->vic_mapping_object); 246*5cabbc6bSPrashanth Sreenivasa vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); 247*5cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_births = 248*5cabbc6bSPrashanth Sreenivasa vdev_indirect_births_open(mos, vic->vic_births_object); 249*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; 250*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_start_time = gethrestime_sec(); 251*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_end_time = 0; 252*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_state = DSS_SCANNING; 253*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_to_copy = 0; 254*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_copied = 0; 255*5cabbc6bSPrashanth Sreenivasa 256*5cabbc6bSPrashanth Sreenivasa /* 257*5cabbc6bSPrashanth Sreenivasa * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because 258*5cabbc6bSPrashanth Sreenivasa * there may be space in the defer tree, which is free, but still 259*5cabbc6bSPrashanth Sreenivasa * counted in vs_alloc. 
260*5cabbc6bSPrashanth Sreenivasa */ 261*5cabbc6bSPrashanth Sreenivasa for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { 262*5cabbc6bSPrashanth Sreenivasa metaslab_t *ms = vd->vdev_ms[i]; 263*5cabbc6bSPrashanth Sreenivasa if (ms->ms_sm == NULL) 264*5cabbc6bSPrashanth Sreenivasa continue; 265*5cabbc6bSPrashanth Sreenivasa 266*5cabbc6bSPrashanth Sreenivasa /* 267*5cabbc6bSPrashanth Sreenivasa * Sync tasks happen before metaslab_sync(), therefore 268*5cabbc6bSPrashanth Sreenivasa * smp_alloc and sm_alloc must be the same. 269*5cabbc6bSPrashanth Sreenivasa */ 270*5cabbc6bSPrashanth Sreenivasa ASSERT3U(space_map_allocated(ms->ms_sm), ==, 271*5cabbc6bSPrashanth Sreenivasa ms->ms_sm->sm_phys->smp_alloc); 272*5cabbc6bSPrashanth Sreenivasa 273*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_to_copy += 274*5cabbc6bSPrashanth Sreenivasa space_map_allocated(ms->ms_sm); 275*5cabbc6bSPrashanth Sreenivasa 276*5cabbc6bSPrashanth Sreenivasa /* 277*5cabbc6bSPrashanth Sreenivasa * Space which we are freeing this txg does not need to 278*5cabbc6bSPrashanth Sreenivasa * be copied. 279*5cabbc6bSPrashanth Sreenivasa */ 280*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_to_copy -= 281*5cabbc6bSPrashanth Sreenivasa range_tree_space(ms->ms_freeingtree); 282*5cabbc6bSPrashanth Sreenivasa 283*5cabbc6bSPrashanth Sreenivasa ASSERT0(range_tree_space(ms->ms_freedtree)); 284*5cabbc6bSPrashanth Sreenivasa for (int t = 0; t < TXG_SIZE; t++) 285*5cabbc6bSPrashanth Sreenivasa ASSERT0(range_tree_space(ms->ms_alloctree[t])); 286*5cabbc6bSPrashanth Sreenivasa } 287*5cabbc6bSPrashanth Sreenivasa 288*5cabbc6bSPrashanth Sreenivasa /* 289*5cabbc6bSPrashanth Sreenivasa * Sync tasks are called before metaslab_sync(), so there should 290*5cabbc6bSPrashanth Sreenivasa * be no already-synced metaslabs in the TXG_CLEAN list. 
291*5cabbc6bSPrashanth Sreenivasa */ 292*5cabbc6bSPrashanth Sreenivasa ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); 293*5cabbc6bSPrashanth Sreenivasa 294*5cabbc6bSPrashanth Sreenivasa spa_sync_removing_state(spa, tx); 295*5cabbc6bSPrashanth Sreenivasa 296*5cabbc6bSPrashanth Sreenivasa /* 297*5cabbc6bSPrashanth Sreenivasa * All blocks that we need to read the most recent mapping must be 298*5cabbc6bSPrashanth Sreenivasa * stored on concrete vdevs. Therefore, we must dirty anything that 299*5cabbc6bSPrashanth Sreenivasa * is read before spa_remove_init(). Specifically, the 300*5cabbc6bSPrashanth Sreenivasa * spa_config_object. (Note that although we already modified the 301*5cabbc6bSPrashanth Sreenivasa * spa_config_object in spa_sync_removing_state, that may not have 302*5cabbc6bSPrashanth Sreenivasa * modified all blocks of the object.) 303*5cabbc6bSPrashanth Sreenivasa */ 304*5cabbc6bSPrashanth Sreenivasa dmu_object_info_t doi; 305*5cabbc6bSPrashanth Sreenivasa VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); 306*5cabbc6bSPrashanth Sreenivasa for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { 307*5cabbc6bSPrashanth Sreenivasa dmu_buf_t *dbuf; 308*5cabbc6bSPrashanth Sreenivasa VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, 309*5cabbc6bSPrashanth Sreenivasa offset, FTAG, &dbuf, 0)); 310*5cabbc6bSPrashanth Sreenivasa dmu_buf_will_dirty(dbuf, tx); 311*5cabbc6bSPrashanth Sreenivasa offset += dbuf->db_size; 312*5cabbc6bSPrashanth Sreenivasa dmu_buf_rele(dbuf, FTAG); 313*5cabbc6bSPrashanth Sreenivasa } 314*5cabbc6bSPrashanth Sreenivasa 315*5cabbc6bSPrashanth Sreenivasa /* 316*5cabbc6bSPrashanth Sreenivasa * Now that we've allocated the im_object, dirty the vdev to ensure 317*5cabbc6bSPrashanth Sreenivasa * that the object gets written to the config on disk. 
318*5cabbc6bSPrashanth Sreenivasa */ 319*5cabbc6bSPrashanth Sreenivasa vdev_config_dirty(vd); 320*5cabbc6bSPrashanth Sreenivasa 321*5cabbc6bSPrashanth Sreenivasa zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu " 322*5cabbc6bSPrashanth Sreenivasa "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), 323*5cabbc6bSPrashanth Sreenivasa vic->vic_mapping_object); 324*5cabbc6bSPrashanth Sreenivasa 325*5cabbc6bSPrashanth Sreenivasa spa_history_log_internal(spa, "vdev remove started", tx, 326*5cabbc6bSPrashanth Sreenivasa "%s vdev %llu %s", spa_name(spa), vd->vdev_id, 327*5cabbc6bSPrashanth Sreenivasa (vd->vdev_path != NULL) ? vd->vdev_path : "-"); 328*5cabbc6bSPrashanth Sreenivasa /* 329*5cabbc6bSPrashanth Sreenivasa * Setting spa_vdev_removal causes subsequent frees to call 330*5cabbc6bSPrashanth Sreenivasa * free_from_removing_vdev(). Note that we don't need any locking 331*5cabbc6bSPrashanth Sreenivasa * because we are the sync thread, and metaslab_free_impl() is only 332*5cabbc6bSPrashanth Sreenivasa * called from syncing context (potentially from a zio taskq thread, 333*5cabbc6bSPrashanth Sreenivasa * but in any case only when there are outstanding free i/os, which 334*5cabbc6bSPrashanth Sreenivasa * there are not). 335*5cabbc6bSPrashanth Sreenivasa */ 336*5cabbc6bSPrashanth Sreenivasa ASSERT3P(spa->spa_vdev_removal, ==, NULL); 337*5cabbc6bSPrashanth Sreenivasa spa->spa_vdev_removal = svr; 338*5cabbc6bSPrashanth Sreenivasa svr->svr_thread = thread_create(NULL, 0, 339*5cabbc6bSPrashanth Sreenivasa spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri); 340*5cabbc6bSPrashanth Sreenivasa } 341*5cabbc6bSPrashanth Sreenivasa 342*5cabbc6bSPrashanth Sreenivasa /* 343*5cabbc6bSPrashanth Sreenivasa * When we are opening a pool, we must read the mapping for each 344*5cabbc6bSPrashanth Sreenivasa * indirect vdev in order from most recently removed to least 345*5cabbc6bSPrashanth Sreenivasa * recently removed. 
We do this because the blocks for the mapping 346*5cabbc6bSPrashanth Sreenivasa * of older indirect vdevs may be stored on more recently removed vdevs. 347*5cabbc6bSPrashanth Sreenivasa * In order to read each indirect mapping object, we must have 348*5cabbc6bSPrashanth Sreenivasa * initialized all more recently removed vdevs. 349*5cabbc6bSPrashanth Sreenivasa */ 350*5cabbc6bSPrashanth Sreenivasa int 351*5cabbc6bSPrashanth Sreenivasa spa_remove_init(spa_t *spa) 352*5cabbc6bSPrashanth Sreenivasa { 353*5cabbc6bSPrashanth Sreenivasa int error; 354*5cabbc6bSPrashanth Sreenivasa 355*5cabbc6bSPrashanth Sreenivasa error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, 356*5cabbc6bSPrashanth Sreenivasa DMU_POOL_DIRECTORY_OBJECT, 357*5cabbc6bSPrashanth Sreenivasa DMU_POOL_REMOVING, sizeof (uint64_t), 358*5cabbc6bSPrashanth Sreenivasa sizeof (spa->spa_removing_phys) / sizeof (uint64_t), 359*5cabbc6bSPrashanth Sreenivasa &spa->spa_removing_phys); 360*5cabbc6bSPrashanth Sreenivasa 361*5cabbc6bSPrashanth Sreenivasa if (error == ENOENT) { 362*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_state = DSS_NONE; 363*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_removing_vdev = -1; 364*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 365*5cabbc6bSPrashanth Sreenivasa return (0); 366*5cabbc6bSPrashanth Sreenivasa } else if (error != 0) { 367*5cabbc6bSPrashanth Sreenivasa return (error); 368*5cabbc6bSPrashanth Sreenivasa } 369*5cabbc6bSPrashanth Sreenivasa 370*5cabbc6bSPrashanth Sreenivasa if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { 371*5cabbc6bSPrashanth Sreenivasa /* 372*5cabbc6bSPrashanth Sreenivasa * We are currently removing a vdev. Create and 373*5cabbc6bSPrashanth Sreenivasa * initialize a spa_vdev_removal_t from the bonus 374*5cabbc6bSPrashanth Sreenivasa * buffer of the removing vdevs vdev_im_object, and 375*5cabbc6bSPrashanth Sreenivasa * initialize its partial mapping. 
376*5cabbc6bSPrashanth Sreenivasa */ 377*5cabbc6bSPrashanth Sreenivasa spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 378*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = vdev_lookup_top(spa, 379*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_removing_vdev); 380*5cabbc6bSPrashanth Sreenivasa spa_config_exit(spa, SCL_STATE, FTAG); 381*5cabbc6bSPrashanth Sreenivasa 382*5cabbc6bSPrashanth Sreenivasa if (vd == NULL) 383*5cabbc6bSPrashanth Sreenivasa return (EINVAL); 384*5cabbc6bSPrashanth Sreenivasa 385*5cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 386*5cabbc6bSPrashanth Sreenivasa 387*5cabbc6bSPrashanth Sreenivasa ASSERT(vdev_is_concrete(vd)); 388*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); 389*5cabbc6bSPrashanth Sreenivasa ASSERT(svr->svr_vdev->vdev_removing); 390*5cabbc6bSPrashanth Sreenivasa 391*5cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_mapping = vdev_indirect_mapping_open( 392*5cabbc6bSPrashanth Sreenivasa spa->spa_meta_objset, vic->vic_mapping_object); 393*5cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_births = vdev_indirect_births_open( 394*5cabbc6bSPrashanth Sreenivasa spa->spa_meta_objset, vic->vic_births_object); 395*5cabbc6bSPrashanth Sreenivasa 396*5cabbc6bSPrashanth Sreenivasa spa->spa_vdev_removal = svr; 397*5cabbc6bSPrashanth Sreenivasa } 398*5cabbc6bSPrashanth Sreenivasa 399*5cabbc6bSPrashanth Sreenivasa spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 400*5cabbc6bSPrashanth Sreenivasa uint64_t indirect_vdev_id = 401*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_prev_indirect_vdev; 402*5cabbc6bSPrashanth Sreenivasa while (indirect_vdev_id != UINT64_MAX) { 403*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); 404*5cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 405*5cabbc6bSPrashanth Sreenivasa 406*5cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 
407*5cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_mapping = vdev_indirect_mapping_open( 408*5cabbc6bSPrashanth Sreenivasa spa->spa_meta_objset, vic->vic_mapping_object); 409*5cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_births = vdev_indirect_births_open( 410*5cabbc6bSPrashanth Sreenivasa spa->spa_meta_objset, vic->vic_births_object); 411*5cabbc6bSPrashanth Sreenivasa 412*5cabbc6bSPrashanth Sreenivasa indirect_vdev_id = vic->vic_prev_indirect_vdev; 413*5cabbc6bSPrashanth Sreenivasa } 414*5cabbc6bSPrashanth Sreenivasa spa_config_exit(spa, SCL_STATE, FTAG); 415*5cabbc6bSPrashanth Sreenivasa 416*5cabbc6bSPrashanth Sreenivasa /* 417*5cabbc6bSPrashanth Sreenivasa * Now that we've loaded all the indirect mappings, we can allow 418*5cabbc6bSPrashanth Sreenivasa * reads from other blocks (e.g. via predictive prefetch). 419*5cabbc6bSPrashanth Sreenivasa */ 420*5cabbc6bSPrashanth Sreenivasa spa->spa_indirect_vdevs_loaded = B_TRUE; 421*5cabbc6bSPrashanth Sreenivasa return (0); 422*5cabbc6bSPrashanth Sreenivasa } 423*5cabbc6bSPrashanth Sreenivasa 424*5cabbc6bSPrashanth Sreenivasa void 425*5cabbc6bSPrashanth Sreenivasa spa_restart_removal(spa_t *spa) 426*5cabbc6bSPrashanth Sreenivasa { 427*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = spa->spa_vdev_removal; 428*5cabbc6bSPrashanth Sreenivasa 429*5cabbc6bSPrashanth Sreenivasa if (svr == NULL) 430*5cabbc6bSPrashanth Sreenivasa return; 431*5cabbc6bSPrashanth Sreenivasa 432*5cabbc6bSPrashanth Sreenivasa /* 433*5cabbc6bSPrashanth Sreenivasa * In general when this function is called there is no 434*5cabbc6bSPrashanth Sreenivasa * removal thread running. The only scenario where this 435*5cabbc6bSPrashanth Sreenivasa * is not true is during spa_import() where this function 436*5cabbc6bSPrashanth Sreenivasa * is called twice [once from spa_import_impl() and 437*5cabbc6bSPrashanth Sreenivasa * spa_async_resume()]. 
Thus, in the scenario where we 438*5cabbc6bSPrashanth Sreenivasa * import a pool that has an ongoing removal we don't 439*5cabbc6bSPrashanth Sreenivasa * want to spawn a second thread. 440*5cabbc6bSPrashanth Sreenivasa */ 441*5cabbc6bSPrashanth Sreenivasa if (svr->svr_thread != NULL) 442*5cabbc6bSPrashanth Sreenivasa return; 443*5cabbc6bSPrashanth Sreenivasa 444*5cabbc6bSPrashanth Sreenivasa if (!spa_writeable(spa)) 445*5cabbc6bSPrashanth Sreenivasa return; 446*5cabbc6bSPrashanth Sreenivasa 447*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = svr->svr_vdev; 448*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 449*5cabbc6bSPrashanth Sreenivasa 450*5cabbc6bSPrashanth Sreenivasa ASSERT3P(vd, !=, NULL); 451*5cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_removing); 452*5cabbc6bSPrashanth Sreenivasa 453*5cabbc6bSPrashanth Sreenivasa zfs_dbgmsg("restarting removal of %llu at count=%llu", 454*5cabbc6bSPrashanth Sreenivasa vd->vdev_id, vdev_indirect_mapping_num_entries(vim)); 455*5cabbc6bSPrashanth Sreenivasa svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd, 456*5cabbc6bSPrashanth Sreenivasa 0, &p0, TS_RUN, minclsyspri); 457*5cabbc6bSPrashanth Sreenivasa } 458*5cabbc6bSPrashanth Sreenivasa 459*5cabbc6bSPrashanth Sreenivasa /* 460*5cabbc6bSPrashanth Sreenivasa * Process freeing from a device which is in the middle of being removed. 461*5cabbc6bSPrashanth Sreenivasa * We must handle this carefully so that we attempt to copy freed data, 462*5cabbc6bSPrashanth Sreenivasa * and we correctly free already-copied data. 
463*5cabbc6bSPrashanth Sreenivasa */ 464*5cabbc6bSPrashanth Sreenivasa void 465*5cabbc6bSPrashanth Sreenivasa free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size, 466*5cabbc6bSPrashanth Sreenivasa uint64_t txg) 467*5cabbc6bSPrashanth Sreenivasa { 468*5cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 469*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = spa->spa_vdev_removal; 470*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 471*5cabbc6bSPrashanth Sreenivasa uint64_t max_offset_yet = 0; 472*5cabbc6bSPrashanth Sreenivasa 473*5cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); 474*5cabbc6bSPrashanth Sreenivasa ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, 475*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_object(vim)); 476*5cabbc6bSPrashanth Sreenivasa ASSERT3P(vd, ==, svr->svr_vdev); 477*5cabbc6bSPrashanth Sreenivasa ASSERT3U(spa_syncing_txg(spa), ==, txg); 478*5cabbc6bSPrashanth Sreenivasa 479*5cabbc6bSPrashanth Sreenivasa mutex_enter(&svr->svr_lock); 480*5cabbc6bSPrashanth Sreenivasa 481*5cabbc6bSPrashanth Sreenivasa /* 482*5cabbc6bSPrashanth Sreenivasa * Remove the segment from the removing vdev's spacemap. This 483*5cabbc6bSPrashanth Sreenivasa * ensures that we will not attempt to copy this space (if the 484*5cabbc6bSPrashanth Sreenivasa * removal thread has not yet visited it), and also ensures 485*5cabbc6bSPrashanth Sreenivasa * that we know what is actually allocated on the new vdevs 486*5cabbc6bSPrashanth Sreenivasa * (needed if we cancel the removal). 
487*5cabbc6bSPrashanth Sreenivasa * 488*5cabbc6bSPrashanth Sreenivasa * Note: we must do the metaslab_free_concrete() with the svr_lock 489*5cabbc6bSPrashanth Sreenivasa * held, so that the remove_thread can not load this metaslab and then 490*5cabbc6bSPrashanth Sreenivasa * visit this offset between the time that we metaslab_free_concrete() 491*5cabbc6bSPrashanth Sreenivasa * and when we check to see if it has been visited. 492*5cabbc6bSPrashanth Sreenivasa */ 493*5cabbc6bSPrashanth Sreenivasa metaslab_free_concrete(vd, offset, size, txg); 494*5cabbc6bSPrashanth Sreenivasa 495*5cabbc6bSPrashanth Sreenivasa uint64_t synced_size = 0; 496*5cabbc6bSPrashanth Sreenivasa uint64_t synced_offset = 0; 497*5cabbc6bSPrashanth Sreenivasa uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); 498*5cabbc6bSPrashanth Sreenivasa if (offset < max_offset_synced) { 499*5cabbc6bSPrashanth Sreenivasa /* 500*5cabbc6bSPrashanth Sreenivasa * The mapping for this offset is already on disk. 501*5cabbc6bSPrashanth Sreenivasa * Free from the new location. 502*5cabbc6bSPrashanth Sreenivasa * 503*5cabbc6bSPrashanth Sreenivasa * Note that we use svr_max_synced_offset because it is 504*5cabbc6bSPrashanth Sreenivasa * updated atomically with respect to the in-core mapping. 505*5cabbc6bSPrashanth Sreenivasa * By contrast, vim_max_offset is not. 506*5cabbc6bSPrashanth Sreenivasa * 507*5cabbc6bSPrashanth Sreenivasa * This block may be split between a synced entry and an 508*5cabbc6bSPrashanth Sreenivasa * in-flight or unvisited entry. Only process the synced 509*5cabbc6bSPrashanth Sreenivasa * portion of it here. 
510*5cabbc6bSPrashanth Sreenivasa */ 511*5cabbc6bSPrashanth Sreenivasa synced_size = MIN(size, max_offset_synced - offset); 512*5cabbc6bSPrashanth Sreenivasa synced_offset = offset; 513*5cabbc6bSPrashanth Sreenivasa 514*5cabbc6bSPrashanth Sreenivasa ASSERT3U(max_offset_yet, <=, max_offset_synced); 515*5cabbc6bSPrashanth Sreenivasa max_offset_yet = max_offset_synced; 516*5cabbc6bSPrashanth Sreenivasa 517*5cabbc6bSPrashanth Sreenivasa DTRACE_PROBE3(remove__free__synced, 518*5cabbc6bSPrashanth Sreenivasa spa_t *, spa, 519*5cabbc6bSPrashanth Sreenivasa uint64_t, offset, 520*5cabbc6bSPrashanth Sreenivasa uint64_t, synced_size); 521*5cabbc6bSPrashanth Sreenivasa 522*5cabbc6bSPrashanth Sreenivasa size -= synced_size; 523*5cabbc6bSPrashanth Sreenivasa offset += synced_size; 524*5cabbc6bSPrashanth Sreenivasa } 525*5cabbc6bSPrashanth Sreenivasa 526*5cabbc6bSPrashanth Sreenivasa /* 527*5cabbc6bSPrashanth Sreenivasa * Look at all in-flight txgs starting from the currently syncing one 528*5cabbc6bSPrashanth Sreenivasa * and see if a section of this free is being copied. By starting from 529*5cabbc6bSPrashanth Sreenivasa * this txg and iterating forward, we might find that this region 530*5cabbc6bSPrashanth Sreenivasa * was copied in two different txgs and handle it appropriately. 531*5cabbc6bSPrashanth Sreenivasa */ 532*5cabbc6bSPrashanth Sreenivasa for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { 533*5cabbc6bSPrashanth Sreenivasa int txgoff = (txg + i) & TXG_MASK; 534*5cabbc6bSPrashanth Sreenivasa if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { 535*5cabbc6bSPrashanth Sreenivasa /* 536*5cabbc6bSPrashanth Sreenivasa * The mapping for this offset is in flight, and 537*5cabbc6bSPrashanth Sreenivasa * will be synced in txg+i. 
538*5cabbc6bSPrashanth Sreenivasa */ 539*5cabbc6bSPrashanth Sreenivasa uint64_t inflight_size = MIN(size, 540*5cabbc6bSPrashanth Sreenivasa svr->svr_max_offset_to_sync[txgoff] - offset); 541*5cabbc6bSPrashanth Sreenivasa 542*5cabbc6bSPrashanth Sreenivasa DTRACE_PROBE4(remove__free__inflight, 543*5cabbc6bSPrashanth Sreenivasa spa_t *, spa, 544*5cabbc6bSPrashanth Sreenivasa uint64_t, offset, 545*5cabbc6bSPrashanth Sreenivasa uint64_t, inflight_size, 546*5cabbc6bSPrashanth Sreenivasa uint64_t, txg + i); 547*5cabbc6bSPrashanth Sreenivasa 548*5cabbc6bSPrashanth Sreenivasa /* 549*5cabbc6bSPrashanth Sreenivasa * We copy data in order of increasing offset. 550*5cabbc6bSPrashanth Sreenivasa * Therefore the max_offset_to_sync[] must increase 551*5cabbc6bSPrashanth Sreenivasa * (or be zero, indicating that nothing is being 552*5cabbc6bSPrashanth Sreenivasa * copied in that txg). 553*5cabbc6bSPrashanth Sreenivasa */ 554*5cabbc6bSPrashanth Sreenivasa if (svr->svr_max_offset_to_sync[txgoff] != 0) { 555*5cabbc6bSPrashanth Sreenivasa ASSERT3U(svr->svr_max_offset_to_sync[txgoff], 556*5cabbc6bSPrashanth Sreenivasa >=, max_offset_yet); 557*5cabbc6bSPrashanth Sreenivasa max_offset_yet = 558*5cabbc6bSPrashanth Sreenivasa svr->svr_max_offset_to_sync[txgoff]; 559*5cabbc6bSPrashanth Sreenivasa } 560*5cabbc6bSPrashanth Sreenivasa 561*5cabbc6bSPrashanth Sreenivasa /* 562*5cabbc6bSPrashanth Sreenivasa * We've already committed to copying this segment: 563*5cabbc6bSPrashanth Sreenivasa * we have allocated space elsewhere in the pool for 564*5cabbc6bSPrashanth Sreenivasa * it and have an IO outstanding to copy the data. We 565*5cabbc6bSPrashanth Sreenivasa * cannot free the space before the copy has 566*5cabbc6bSPrashanth Sreenivasa * completed, or else the copy IO might overwrite any 567*5cabbc6bSPrashanth Sreenivasa * new data. 
To free that space, we record the 568*5cabbc6bSPrashanth Sreenivasa * segment in the appropriate svr_frees tree and free 569*5cabbc6bSPrashanth Sreenivasa * the mapped space later, in the txg where we have 570*5cabbc6bSPrashanth Sreenivasa * completed the copy and synced the mapping (see 571*5cabbc6bSPrashanth Sreenivasa * vdev_mapping_sync). 572*5cabbc6bSPrashanth Sreenivasa */ 573*5cabbc6bSPrashanth Sreenivasa range_tree_add(svr->svr_frees[txgoff], 574*5cabbc6bSPrashanth Sreenivasa offset, inflight_size); 575*5cabbc6bSPrashanth Sreenivasa size -= inflight_size; 576*5cabbc6bSPrashanth Sreenivasa offset += inflight_size; 577*5cabbc6bSPrashanth Sreenivasa 578*5cabbc6bSPrashanth Sreenivasa /* 579*5cabbc6bSPrashanth Sreenivasa * This space is already accounted for as being 580*5cabbc6bSPrashanth Sreenivasa * done, because it is being copied in txg+i. 581*5cabbc6bSPrashanth Sreenivasa * However, if i!=0, then it is being copied in 582*5cabbc6bSPrashanth Sreenivasa * a future txg. If we crash after this txg 583*5cabbc6bSPrashanth Sreenivasa * syncs but before txg+i syncs, then the space 584*5cabbc6bSPrashanth Sreenivasa * will be free. Therefore we must account 585*5cabbc6bSPrashanth Sreenivasa * for the space being done in *this* txg 586*5cabbc6bSPrashanth Sreenivasa * (when it is freed) rather than the future txg 587*5cabbc6bSPrashanth Sreenivasa * (when it will be copied). 
588*5cabbc6bSPrashanth Sreenivasa */ 589*5cabbc6bSPrashanth Sreenivasa ASSERT3U(svr->svr_bytes_done[txgoff], >=, 590*5cabbc6bSPrashanth Sreenivasa inflight_size); 591*5cabbc6bSPrashanth Sreenivasa svr->svr_bytes_done[txgoff] -= inflight_size; 592*5cabbc6bSPrashanth Sreenivasa svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; 593*5cabbc6bSPrashanth Sreenivasa } 594*5cabbc6bSPrashanth Sreenivasa } 595*5cabbc6bSPrashanth Sreenivasa ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); 596*5cabbc6bSPrashanth Sreenivasa 597*5cabbc6bSPrashanth Sreenivasa if (size > 0) { 598*5cabbc6bSPrashanth Sreenivasa /* 599*5cabbc6bSPrashanth Sreenivasa * The copy thread has not yet visited this offset. Ensure 600*5cabbc6bSPrashanth Sreenivasa * that it doesn't. 601*5cabbc6bSPrashanth Sreenivasa */ 602*5cabbc6bSPrashanth Sreenivasa 603*5cabbc6bSPrashanth Sreenivasa DTRACE_PROBE3(remove__free__unvisited, 604*5cabbc6bSPrashanth Sreenivasa spa_t *, spa, 605*5cabbc6bSPrashanth Sreenivasa uint64_t, offset, 606*5cabbc6bSPrashanth Sreenivasa uint64_t, size); 607*5cabbc6bSPrashanth Sreenivasa 608*5cabbc6bSPrashanth Sreenivasa if (svr->svr_allocd_segs != NULL) 609*5cabbc6bSPrashanth Sreenivasa range_tree_clear(svr->svr_allocd_segs, offset, size); 610*5cabbc6bSPrashanth Sreenivasa 611*5cabbc6bSPrashanth Sreenivasa /* 612*5cabbc6bSPrashanth Sreenivasa * Since we now do not need to copy this data, for 613*5cabbc6bSPrashanth Sreenivasa * accounting purposes we have done our job and can count 614*5cabbc6bSPrashanth Sreenivasa * it as completed. 615*5cabbc6bSPrashanth Sreenivasa */ 616*5cabbc6bSPrashanth Sreenivasa svr->svr_bytes_done[txg & TXG_MASK] += size; 617*5cabbc6bSPrashanth Sreenivasa } 618*5cabbc6bSPrashanth Sreenivasa mutex_exit(&svr->svr_lock); 619*5cabbc6bSPrashanth Sreenivasa 620*5cabbc6bSPrashanth Sreenivasa /* 621*5cabbc6bSPrashanth Sreenivasa * Now that we have dropped svr_lock, process the synced portion 622*5cabbc6bSPrashanth Sreenivasa * of this free. 
623*5cabbc6bSPrashanth Sreenivasa */ 624*5cabbc6bSPrashanth Sreenivasa if (synced_size > 0) { 625*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mark_obsolete(vd, synced_offset, synced_size, 626*5cabbc6bSPrashanth Sreenivasa txg); 627*5cabbc6bSPrashanth Sreenivasa /* 628*5cabbc6bSPrashanth Sreenivasa * Note: this can only be called from syncing context, 629*5cabbc6bSPrashanth Sreenivasa * and the vdev_indirect_mapping is only changed from the 630*5cabbc6bSPrashanth Sreenivasa * sync thread, so we don't need svr_lock while doing 631*5cabbc6bSPrashanth Sreenivasa * metaslab_free_impl_cb. 632*5cabbc6bSPrashanth Sreenivasa */ 633*5cabbc6bSPrashanth Sreenivasa vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, 634*5cabbc6bSPrashanth Sreenivasa metaslab_free_impl_cb, &txg); 635*5cabbc6bSPrashanth Sreenivasa } 636*5cabbc6bSPrashanth Sreenivasa } 637*5cabbc6bSPrashanth Sreenivasa 638*5cabbc6bSPrashanth Sreenivasa /* 639*5cabbc6bSPrashanth Sreenivasa * Stop an active removal and update the spa_removing phys. 640*5cabbc6bSPrashanth Sreenivasa */ 641*5cabbc6bSPrashanth Sreenivasa static void 642*5cabbc6bSPrashanth Sreenivasa spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) 643*5cabbc6bSPrashanth Sreenivasa { 644*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = spa->spa_vdev_removal; 645*5cabbc6bSPrashanth Sreenivasa ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); 646*5cabbc6bSPrashanth Sreenivasa 647*5cabbc6bSPrashanth Sreenivasa /* Ensure the removal thread has completed before we free the svr. 
*/ 648*5cabbc6bSPrashanth Sreenivasa spa_vdev_remove_suspend(spa); 649*5cabbc6bSPrashanth Sreenivasa 650*5cabbc6bSPrashanth Sreenivasa ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); 651*5cabbc6bSPrashanth Sreenivasa 652*5cabbc6bSPrashanth Sreenivasa if (state == DSS_FINISHED) { 653*5cabbc6bSPrashanth Sreenivasa spa_removing_phys_t *srp = &spa->spa_removing_phys; 654*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = svr->svr_vdev; 655*5cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 656*5cabbc6bSPrashanth Sreenivasa 657*5cabbc6bSPrashanth Sreenivasa if (srp->sr_prev_indirect_vdev != UINT64_MAX) { 658*5cabbc6bSPrashanth Sreenivasa vdev_t *pvd = vdev_lookup_top(spa, 659*5cabbc6bSPrashanth Sreenivasa srp->sr_prev_indirect_vdev); 660*5cabbc6bSPrashanth Sreenivasa ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); 661*5cabbc6bSPrashanth Sreenivasa } 662*5cabbc6bSPrashanth Sreenivasa 663*5cabbc6bSPrashanth Sreenivasa vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; 664*5cabbc6bSPrashanth Sreenivasa srp->sr_prev_indirect_vdev = vd->vdev_id; 665*5cabbc6bSPrashanth Sreenivasa } 666*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_state = state; 667*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_end_time = gethrestime_sec(); 668*5cabbc6bSPrashanth Sreenivasa 669*5cabbc6bSPrashanth Sreenivasa spa->spa_vdev_removal = NULL; 670*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_destroy(svr); 671*5cabbc6bSPrashanth Sreenivasa 672*5cabbc6bSPrashanth Sreenivasa spa_sync_removing_state(spa, tx); 673*5cabbc6bSPrashanth Sreenivasa 674*5cabbc6bSPrashanth Sreenivasa vdev_config_dirty(spa->spa_root_vdev); 675*5cabbc6bSPrashanth Sreenivasa } 676*5cabbc6bSPrashanth Sreenivasa 677*5cabbc6bSPrashanth Sreenivasa static void 678*5cabbc6bSPrashanth Sreenivasa free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) 679*5cabbc6bSPrashanth Sreenivasa { 680*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = arg; 681*5cabbc6bSPrashanth 
Sreenivasa vdev_indirect_mark_obsolete(vd, offset, size, 682*5cabbc6bSPrashanth Sreenivasa vd->vdev_spa->spa_syncing_txg); 683*5cabbc6bSPrashanth Sreenivasa vdev_indirect_ops.vdev_op_remap(vd, offset, size, 684*5cabbc6bSPrashanth Sreenivasa metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg); 685*5cabbc6bSPrashanth Sreenivasa } 686*5cabbc6bSPrashanth Sreenivasa 687*5cabbc6bSPrashanth Sreenivasa /* 688*5cabbc6bSPrashanth Sreenivasa * On behalf of the removal thread, syncs an incremental bit more of 689*5cabbc6bSPrashanth Sreenivasa * the indirect mapping to disk and updates the in-memory mapping. 690*5cabbc6bSPrashanth Sreenivasa * Called as a sync task in every txg that the removal thread makes progress. 691*5cabbc6bSPrashanth Sreenivasa */ 692*5cabbc6bSPrashanth Sreenivasa static void 693*5cabbc6bSPrashanth Sreenivasa vdev_mapping_sync(void *arg, dmu_tx_t *tx) 694*5cabbc6bSPrashanth Sreenivasa { 695*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = arg; 696*5cabbc6bSPrashanth Sreenivasa spa_t *spa = dmu_tx_pool(tx)->dp_spa; 697*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = svr->svr_vdev; 698*5cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 699*5cabbc6bSPrashanth Sreenivasa uint64_t txg = dmu_tx_get_txg(tx); 700*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 701*5cabbc6bSPrashanth Sreenivasa 702*5cabbc6bSPrashanth Sreenivasa ASSERT(vic->vic_mapping_object != 0); 703*5cabbc6bSPrashanth Sreenivasa ASSERT3U(txg, ==, spa_syncing_txg(spa)); 704*5cabbc6bSPrashanth Sreenivasa 705*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_add_entries(vim, 706*5cabbc6bSPrashanth Sreenivasa &svr->svr_new_segments[txg & TXG_MASK], tx); 707*5cabbc6bSPrashanth Sreenivasa vdev_indirect_births_add_entry(vd->vdev_indirect_births, 708*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); 709*5cabbc6bSPrashanth Sreenivasa 710*5cabbc6bSPrashanth Sreenivasa /* 
711*5cabbc6bSPrashanth Sreenivasa * Free the copied data for anything that was freed while the 712*5cabbc6bSPrashanth Sreenivasa * mapping entries were in flight. 713*5cabbc6bSPrashanth Sreenivasa */ 714*5cabbc6bSPrashanth Sreenivasa mutex_enter(&svr->svr_lock); 715*5cabbc6bSPrashanth Sreenivasa range_tree_vacate(svr->svr_frees[txg & TXG_MASK], 716*5cabbc6bSPrashanth Sreenivasa free_mapped_segment_cb, vd); 717*5cabbc6bSPrashanth Sreenivasa ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, 718*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_max_offset(vim)); 719*5cabbc6bSPrashanth Sreenivasa svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; 720*5cabbc6bSPrashanth Sreenivasa mutex_exit(&svr->svr_lock); 721*5cabbc6bSPrashanth Sreenivasa 722*5cabbc6bSPrashanth Sreenivasa spa_sync_removing_state(spa, tx); 723*5cabbc6bSPrashanth Sreenivasa } 724*5cabbc6bSPrashanth Sreenivasa 725*5cabbc6bSPrashanth Sreenivasa static void 726*5cabbc6bSPrashanth Sreenivasa spa_vdev_copy_segment_write_done(zio_t *zio) 727*5cabbc6bSPrashanth Sreenivasa { 728*5cabbc6bSPrashanth Sreenivasa vdev_copy_seg_arg_t *vcsa = zio->io_private; 729*5cabbc6bSPrashanth Sreenivasa vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg; 730*5cabbc6bSPrashanth Sreenivasa spa_config_exit(zio->io_spa, SCL_STATE, FTAG); 731*5cabbc6bSPrashanth Sreenivasa abd_free(zio->io_abd); 732*5cabbc6bSPrashanth Sreenivasa 733*5cabbc6bSPrashanth Sreenivasa mutex_enter(&vca->vca_lock); 734*5cabbc6bSPrashanth Sreenivasa vca->vca_outstanding_bytes -= zio->io_size; 735*5cabbc6bSPrashanth Sreenivasa cv_signal(&vca->vca_cv); 736*5cabbc6bSPrashanth Sreenivasa mutex_exit(&vca->vca_lock); 737*5cabbc6bSPrashanth Sreenivasa 738*5cabbc6bSPrashanth Sreenivasa ASSERT0(zio->io_error); 739*5cabbc6bSPrashanth Sreenivasa kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t)); 740*5cabbc6bSPrashanth Sreenivasa kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t)); 741*5cabbc6bSPrashanth Sreenivasa } 742*5cabbc6bSPrashanth Sreenivasa 743*5cabbc6bSPrashanth 
Sreenivasa static void 744*5cabbc6bSPrashanth Sreenivasa spa_vdev_copy_segment_read_done(zio_t *zio) 745*5cabbc6bSPrashanth Sreenivasa { 746*5cabbc6bSPrashanth Sreenivasa vdev_copy_seg_arg_t *vcsa = zio->io_private; 747*5cabbc6bSPrashanth Sreenivasa dva_t *dest_dva = vcsa->vcsa_dest_dva; 748*5cabbc6bSPrashanth Sreenivasa uint64_t txg = vcsa->vcsa_txg; 749*5cabbc6bSPrashanth Sreenivasa spa_t *spa = zio->io_spa; 750*5cabbc6bSPrashanth Sreenivasa vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)); 751*5cabbc6bSPrashanth Sreenivasa blkptr_t *bp = NULL; 752*5cabbc6bSPrashanth Sreenivasa dva_t *dva = NULL; 753*5cabbc6bSPrashanth Sreenivasa uint64_t size = zio->io_size; 754*5cabbc6bSPrashanth Sreenivasa 755*5cabbc6bSPrashanth Sreenivasa ASSERT3P(dest_vd, !=, NULL); 756*5cabbc6bSPrashanth Sreenivasa ASSERT0(zio->io_error); 757*5cabbc6bSPrashanth Sreenivasa 758*5cabbc6bSPrashanth Sreenivasa vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); 759*5cabbc6bSPrashanth Sreenivasa bp = vcsa->vcsa_dest_bp; 760*5cabbc6bSPrashanth Sreenivasa dva = bp->blk_dva; 761*5cabbc6bSPrashanth Sreenivasa 762*5cabbc6bSPrashanth Sreenivasa BP_ZERO(bp); 763*5cabbc6bSPrashanth Sreenivasa 764*5cabbc6bSPrashanth Sreenivasa /* initialize with dest_dva */ 765*5cabbc6bSPrashanth Sreenivasa bcopy(dest_dva, dva, sizeof (dva_t)); 766*5cabbc6bSPrashanth Sreenivasa BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); 767*5cabbc6bSPrashanth Sreenivasa 768*5cabbc6bSPrashanth Sreenivasa BP_SET_LSIZE(bp, size); 769*5cabbc6bSPrashanth Sreenivasa BP_SET_PSIZE(bp, size); 770*5cabbc6bSPrashanth Sreenivasa BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 771*5cabbc6bSPrashanth Sreenivasa BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); 772*5cabbc6bSPrashanth Sreenivasa BP_SET_TYPE(bp, DMU_OT_NONE); 773*5cabbc6bSPrashanth Sreenivasa BP_SET_LEVEL(bp, 0); 774*5cabbc6bSPrashanth Sreenivasa BP_SET_DEDUP(bp, 0); 775*5cabbc6bSPrashanth Sreenivasa BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 776*5cabbc6bSPrashanth Sreenivasa 
777*5cabbc6bSPrashanth Sreenivasa zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa, 778*5cabbc6bSPrashanth Sreenivasa txg, bp, zio->io_abd, size, 779*5cabbc6bSPrashanth Sreenivasa spa_vdev_copy_segment_write_done, vcsa, 780*5cabbc6bSPrashanth Sreenivasa ZIO_PRIORITY_REMOVAL, 0, NULL)); 781*5cabbc6bSPrashanth Sreenivasa } 782*5cabbc6bSPrashanth Sreenivasa 783*5cabbc6bSPrashanth Sreenivasa static int 784*5cabbc6bSPrashanth Sreenivasa spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, 785*5cabbc6bSPrashanth Sreenivasa vdev_copy_arg_t *vca, zio_alloc_list_t *zal) 786*5cabbc6bSPrashanth Sreenivasa { 787*5cabbc6bSPrashanth Sreenivasa metaslab_group_t *mg = vd->vdev_mg; 788*5cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 789*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = spa->spa_vdev_removal; 790*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_entry_t *entry; 791*5cabbc6bSPrashanth Sreenivasa vdev_copy_seg_arg_t *private; 792*5cabbc6bSPrashanth Sreenivasa dva_t dst = { 0 }; 793*5cabbc6bSPrashanth Sreenivasa blkptr_t blk, *bp = &blk; 794*5cabbc6bSPrashanth Sreenivasa dva_t *dva = bp->blk_dva; 795*5cabbc6bSPrashanth Sreenivasa 796*5cabbc6bSPrashanth Sreenivasa ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 797*5cabbc6bSPrashanth Sreenivasa 798*5cabbc6bSPrashanth Sreenivasa int error = metaslab_alloc_dva(spa, mg->mg_class, size, 799*5cabbc6bSPrashanth Sreenivasa &dst, 0, NULL, txg, 0, zal); 800*5cabbc6bSPrashanth Sreenivasa if (error != 0) 801*5cabbc6bSPrashanth Sreenivasa return (error); 802*5cabbc6bSPrashanth Sreenivasa 803*5cabbc6bSPrashanth Sreenivasa /* 804*5cabbc6bSPrashanth Sreenivasa * We can't have any padding of the allocated size, otherwise we will 805*5cabbc6bSPrashanth Sreenivasa * misunderstand what's allocated, and the size of the mapping. 
806*5cabbc6bSPrashanth Sreenivasa * The caller ensures this will be true by passing in a size that is 807*5cabbc6bSPrashanth Sreenivasa * aligned to the worst (highest) ashift in the pool. 808*5cabbc6bSPrashanth Sreenivasa */ 809*5cabbc6bSPrashanth Sreenivasa ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); 810*5cabbc6bSPrashanth Sreenivasa 811*5cabbc6bSPrashanth Sreenivasa mutex_enter(&vca->vca_lock); 812*5cabbc6bSPrashanth Sreenivasa vca->vca_outstanding_bytes += size; 813*5cabbc6bSPrashanth Sreenivasa mutex_exit(&vca->vca_lock); 814*5cabbc6bSPrashanth Sreenivasa 815*5cabbc6bSPrashanth Sreenivasa entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); 816*5cabbc6bSPrashanth Sreenivasa DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); 817*5cabbc6bSPrashanth Sreenivasa entry->vime_mapping.vimep_dst = dst; 818*5cabbc6bSPrashanth Sreenivasa 819*5cabbc6bSPrashanth Sreenivasa private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP); 820*5cabbc6bSPrashanth Sreenivasa private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; 821*5cabbc6bSPrashanth Sreenivasa private->vcsa_txg = txg; 822*5cabbc6bSPrashanth Sreenivasa private->vcsa_copy_arg = vca; 823*5cabbc6bSPrashanth Sreenivasa 824*5cabbc6bSPrashanth Sreenivasa /* 825*5cabbc6bSPrashanth Sreenivasa * This lock is eventually released by the donefunc for the 826*5cabbc6bSPrashanth Sreenivasa * zio_write_phys that finishes copying the data. 827*5cabbc6bSPrashanth Sreenivasa */ 828*5cabbc6bSPrashanth Sreenivasa spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 829*5cabbc6bSPrashanth Sreenivasa 830*5cabbc6bSPrashanth Sreenivasa /* 831*5cabbc6bSPrashanth Sreenivasa * Do logical I/O, letting the redundancy vdevs (like mirror) 832*5cabbc6bSPrashanth Sreenivasa * handle their own I/O instead of duplicating that code here. 
833*5cabbc6bSPrashanth Sreenivasa */ 834*5cabbc6bSPrashanth Sreenivasa BP_ZERO(bp); 835*5cabbc6bSPrashanth Sreenivasa 836*5cabbc6bSPrashanth Sreenivasa DVA_SET_VDEV(&dva[0], vd->vdev_id); 837*5cabbc6bSPrashanth Sreenivasa DVA_SET_OFFSET(&dva[0], start); 838*5cabbc6bSPrashanth Sreenivasa DVA_SET_GANG(&dva[0], 0); 839*5cabbc6bSPrashanth Sreenivasa DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size)); 840*5cabbc6bSPrashanth Sreenivasa 841*5cabbc6bSPrashanth Sreenivasa BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); 842*5cabbc6bSPrashanth Sreenivasa 843*5cabbc6bSPrashanth Sreenivasa BP_SET_LSIZE(bp, size); 844*5cabbc6bSPrashanth Sreenivasa BP_SET_PSIZE(bp, size); 845*5cabbc6bSPrashanth Sreenivasa BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 846*5cabbc6bSPrashanth Sreenivasa BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); 847*5cabbc6bSPrashanth Sreenivasa BP_SET_TYPE(bp, DMU_OT_NONE); 848*5cabbc6bSPrashanth Sreenivasa BP_SET_LEVEL(bp, 0); 849*5cabbc6bSPrashanth Sreenivasa BP_SET_DEDUP(bp, 0); 850*5cabbc6bSPrashanth Sreenivasa BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 851*5cabbc6bSPrashanth Sreenivasa 852*5cabbc6bSPrashanth Sreenivasa zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, 853*5cabbc6bSPrashanth Sreenivasa bp, abd_alloc_for_io(size, B_FALSE), size, 854*5cabbc6bSPrashanth Sreenivasa spa_vdev_copy_segment_read_done, private, 855*5cabbc6bSPrashanth Sreenivasa ZIO_PRIORITY_REMOVAL, 0, NULL)); 856*5cabbc6bSPrashanth Sreenivasa 857*5cabbc6bSPrashanth Sreenivasa list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); 858*5cabbc6bSPrashanth Sreenivasa ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); 859*5cabbc6bSPrashanth Sreenivasa vdev_dirty(vd, 0, NULL, txg); 860*5cabbc6bSPrashanth Sreenivasa 861*5cabbc6bSPrashanth Sreenivasa return (0); 862*5cabbc6bSPrashanth Sreenivasa } 863*5cabbc6bSPrashanth Sreenivasa 864*5cabbc6bSPrashanth Sreenivasa /* 865*5cabbc6bSPrashanth Sreenivasa * Complete the removal of a toplevel vdev. 
This is called as a 866*5cabbc6bSPrashanth Sreenivasa * synctask in the same txg that we will sync out the new config (to the 867*5cabbc6bSPrashanth Sreenivasa * MOS object) which indicates that this vdev is indirect. 868*5cabbc6bSPrashanth Sreenivasa */ 869*5cabbc6bSPrashanth Sreenivasa static void 870*5cabbc6bSPrashanth Sreenivasa vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) 871*5cabbc6bSPrashanth Sreenivasa { 872*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = arg; 873*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = svr->svr_vdev; 874*5cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 875*5cabbc6bSPrashanth Sreenivasa 876*5cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 877*5cabbc6bSPrashanth Sreenivasa 878*5cabbc6bSPrashanth Sreenivasa for (int i = 0; i < TXG_SIZE; i++) { 879*5cabbc6bSPrashanth Sreenivasa ASSERT0(svr->svr_bytes_done[i]); 880*5cabbc6bSPrashanth Sreenivasa } 881*5cabbc6bSPrashanth Sreenivasa 882*5cabbc6bSPrashanth Sreenivasa ASSERT3U(spa->spa_removing_phys.sr_copied, ==, 883*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_to_copy); 884*5cabbc6bSPrashanth Sreenivasa 885*5cabbc6bSPrashanth Sreenivasa vdev_destroy_spacemaps(vd, tx); 886*5cabbc6bSPrashanth Sreenivasa 887*5cabbc6bSPrashanth Sreenivasa /* destroy leaf zaps, if any */ 888*5cabbc6bSPrashanth Sreenivasa ASSERT3P(svr->svr_zaplist, !=, NULL); 889*5cabbc6bSPrashanth Sreenivasa for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); 890*5cabbc6bSPrashanth Sreenivasa pair != NULL; 891*5cabbc6bSPrashanth Sreenivasa pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { 892*5cabbc6bSPrashanth Sreenivasa vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); 893*5cabbc6bSPrashanth Sreenivasa } 894*5cabbc6bSPrashanth Sreenivasa fnvlist_free(svr->svr_zaplist); 895*5cabbc6bSPrashanth Sreenivasa 896*5cabbc6bSPrashanth Sreenivasa spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); 897*5cabbc6bSPrashanth Sreenivasa /* 
vd->vdev_path is not available here */ 898*5cabbc6bSPrashanth Sreenivasa spa_history_log_internal(spa, "vdev remove completed", tx, 899*5cabbc6bSPrashanth Sreenivasa "%s vdev %llu", spa_name(spa), vd->vdev_id); 900*5cabbc6bSPrashanth Sreenivasa } 901*5cabbc6bSPrashanth Sreenivasa 902*5cabbc6bSPrashanth Sreenivasa static void 903*5cabbc6bSPrashanth Sreenivasa vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd) 904*5cabbc6bSPrashanth Sreenivasa { 905*5cabbc6bSPrashanth Sreenivasa ivd->vdev_indirect_config = vd->vdev_indirect_config; 906*5cabbc6bSPrashanth Sreenivasa 907*5cabbc6bSPrashanth Sreenivasa ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL); 908*5cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_indirect_mapping != NULL); 909*5cabbc6bSPrashanth Sreenivasa ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping; 910*5cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_mapping = NULL; 911*5cabbc6bSPrashanth Sreenivasa 912*5cabbc6bSPrashanth Sreenivasa ASSERT3P(ivd->vdev_indirect_births, ==, NULL); 913*5cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_indirect_births != NULL); 914*5cabbc6bSPrashanth Sreenivasa ivd->vdev_indirect_births = vd->vdev_indirect_births; 915*5cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_births = NULL; 916*5cabbc6bSPrashanth Sreenivasa 917*5cabbc6bSPrashanth Sreenivasa ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 918*5cabbc6bSPrashanth Sreenivasa ASSERT0(range_tree_space(ivd->vdev_obsolete_segments)); 919*5cabbc6bSPrashanth Sreenivasa 920*5cabbc6bSPrashanth Sreenivasa if (vd->vdev_obsolete_sm != NULL) { 921*5cabbc6bSPrashanth Sreenivasa ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize); 922*5cabbc6bSPrashanth Sreenivasa 923*5cabbc6bSPrashanth Sreenivasa /* 924*5cabbc6bSPrashanth Sreenivasa * We cannot use space_map_{open,close} because we hold all 925*5cabbc6bSPrashanth Sreenivasa * the config locks as writer. 
926*5cabbc6bSPrashanth Sreenivasa */ 927*5cabbc6bSPrashanth Sreenivasa ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL); 928*5cabbc6bSPrashanth Sreenivasa ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm; 929*5cabbc6bSPrashanth Sreenivasa vd->vdev_obsolete_sm = NULL; 930*5cabbc6bSPrashanth Sreenivasa } 931*5cabbc6bSPrashanth Sreenivasa } 932*5cabbc6bSPrashanth Sreenivasa 933*5cabbc6bSPrashanth Sreenivasa static void 934*5cabbc6bSPrashanth Sreenivasa vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) 935*5cabbc6bSPrashanth Sreenivasa { 936*5cabbc6bSPrashanth Sreenivasa ASSERT3P(zlist, !=, NULL); 937*5cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); 938*5cabbc6bSPrashanth Sreenivasa 939*5cabbc6bSPrashanth Sreenivasa if (vd->vdev_leaf_zap != 0) { 940*5cabbc6bSPrashanth Sreenivasa char zkey[32]; 941*5cabbc6bSPrashanth Sreenivasa (void) snprintf(zkey, sizeof (zkey), "%s-%"PRIu64, 942*5cabbc6bSPrashanth Sreenivasa VDEV_REMOVAL_ZAP_OBJS, vd->vdev_leaf_zap); 943*5cabbc6bSPrashanth Sreenivasa fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); 944*5cabbc6bSPrashanth Sreenivasa } 945*5cabbc6bSPrashanth Sreenivasa 946*5cabbc6bSPrashanth Sreenivasa for (uint64_t id = 0; id < vd->vdev_children; id++) { 947*5cabbc6bSPrashanth Sreenivasa vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); 948*5cabbc6bSPrashanth Sreenivasa } 949*5cabbc6bSPrashanth Sreenivasa } 950*5cabbc6bSPrashanth Sreenivasa 951*5cabbc6bSPrashanth Sreenivasa static void 952*5cabbc6bSPrashanth Sreenivasa vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) 953*5cabbc6bSPrashanth Sreenivasa { 954*5cabbc6bSPrashanth Sreenivasa vdev_t *ivd; 955*5cabbc6bSPrashanth Sreenivasa dmu_tx_t *tx; 956*5cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 957*5cabbc6bSPrashanth Sreenivasa spa_vdev_removal_t *svr = spa->spa_vdev_removal; 958*5cabbc6bSPrashanth Sreenivasa 959*5cabbc6bSPrashanth Sreenivasa /* 960*5cabbc6bSPrashanth Sreenivasa * First, build a list of leaf zaps to be destroyed. 
961*5cabbc6bSPrashanth Sreenivasa * This is passed to the sync context thread, 962*5cabbc6bSPrashanth Sreenivasa * which does the actual unlinking. 963*5cabbc6bSPrashanth Sreenivasa */ 964*5cabbc6bSPrashanth Sreenivasa svr->svr_zaplist = fnvlist_alloc(); 965*5cabbc6bSPrashanth Sreenivasa vdev_remove_enlist_zaps(vd, svr->svr_zaplist); 966*5cabbc6bSPrashanth Sreenivasa 967*5cabbc6bSPrashanth Sreenivasa ivd = vdev_add_parent(vd, &vdev_indirect_ops); 968*5cabbc6bSPrashanth Sreenivasa 969*5cabbc6bSPrashanth Sreenivasa vd->vdev_leaf_zap = 0; 970*5cabbc6bSPrashanth Sreenivasa 971*5cabbc6bSPrashanth Sreenivasa vdev_remove_child(ivd, vd); 972*5cabbc6bSPrashanth Sreenivasa vdev_compact_children(ivd); 973*5cabbc6bSPrashanth Sreenivasa 974*5cabbc6bSPrashanth Sreenivasa vdev_indirect_state_transfer(ivd, vd); 975*5cabbc6bSPrashanth Sreenivasa 976*5cabbc6bSPrashanth Sreenivasa svr->svr_vdev = ivd; 977*5cabbc6bSPrashanth Sreenivasa 978*5cabbc6bSPrashanth Sreenivasa ASSERT(!ivd->vdev_removing); 979*5cabbc6bSPrashanth Sreenivasa ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 980*5cabbc6bSPrashanth Sreenivasa 981*5cabbc6bSPrashanth Sreenivasa tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 982*5cabbc6bSPrashanth Sreenivasa dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, 983*5cabbc6bSPrashanth Sreenivasa 0, ZFS_SPACE_CHECK_NONE, tx); 984*5cabbc6bSPrashanth Sreenivasa dmu_tx_commit(tx); 985*5cabbc6bSPrashanth Sreenivasa 986*5cabbc6bSPrashanth Sreenivasa /* 987*5cabbc6bSPrashanth Sreenivasa * Indicate that this thread has exited. 988*5cabbc6bSPrashanth Sreenivasa * After this, we can not use svr. 
/*
 * Complete the removal of a toplevel vdev. This is called in open
 * context by the removal thread after we have copied all vdev's data.
 */
static void
vdev_remove_complete(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t txg;

	/*
	 * Wait for any deferred frees to be synced before we call
	 * vdev_metaslab_fini()
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);

	/* Take the config lock as writer for the rest of the teardown. */
	txg = spa_vdev_enter(spa);
	zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
	    vd->vdev_id, txg);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
		vd->vdev_mg = NULL;
	}
	/* With the metaslabs gone, the vdev must account for no space. */
	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);

	vdev_remove_replace_with_indirect(vd, txg);

	/*
	 * We now release the locks, allowing spa_sync to run and finish the
	 * removal via vdev_remove_complete_sync in syncing context.
	 */
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	/*
	 * Top ZAP should have been transferred to the indirect vdev in
	 * vdev_remove_replace_with_indirect.
	 */
	ASSERT0(vd->vdev_top_zap);

	/*
	 * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
	 */
	ASSERT0(vd->vdev_leaf_zap);

	/* Re-enter to erase the label and dirty the config under the lock. */
	txg = spa_vdev_enter(spa);
	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
	/*
	 * Request to update the config and the config cachefile.
	 */
	vdev_config_dirty(spa->spa_root_vdev);
	/* Passing vd here causes spa_vdev_exit() to free the vdev. */
	(void) spa_vdev_exit(spa, vd, txg, 0);
}
/*
 * Evacuates a segment of size at most max_alloc from the vdev
 * via repeated calls to spa_vdev_copy_segment. If an allocation
 * fails, the pool is probably too fragmented to handle such a
 * large size, so decrease max_alloc so that the caller will not try
 * this size again this txg.
 *
 * Takes the first segment from svr_allocd_segs (under svr_lock),
 * registers per-txg mapping-sync bookkeeping, then performs the
 * copies outside the lock.
 */
static void
spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
    uint64_t *max_alloc, dmu_tx_t *tx)
{
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	mutex_enter(&svr->svr_lock);

	/* Nothing left to copy in this metaslab's worth of segments. */
	range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
	if (rs == NULL) {
		mutex_exit(&svr->svr_lock);
		return;
	}
	uint64_t offset = rs->rs_start;
	uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);

	/* Claim this span so concurrent frees know it is being copied. */
	range_tree_remove(svr->svr_allocd_segs, offset, length);

	/*
	 * A zero max_offset_to_sync means this is the first copy of this
	 * txg, so register vdev_mapping_sync to run in syncing context.
	 */
	if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
		dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
		    svr, 0, ZFS_SPACE_CHECK_NONE, tx);
	}

	svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;

	/*
	 * Note: this is the amount of *allocated* space
	 * that we are taking care of each txg.
	 */
	svr->svr_bytes_done[txg & TXG_MASK] += length;

	mutex_exit(&svr->svr_lock);

	zio_alloc_list_t zal;
	metaslab_trace_init(&zal);
	uint64_t thismax = *max_alloc;
	while (length > 0) {
		uint64_t mylen = MIN(length, thismax);

		int error = spa_vdev_copy_segment(svr->svr_vdev,
		    offset, mylen, txg, vca, &zal);

		if (error == ENOSPC) {
			/*
			 * Cut our segment in half, and don't try this
			 * segment size again this txg.  Note that the
			 * allocation size must be aligned to the highest
			 * ashift in the pool, so that the allocation will
			 * not be padded out to a multiple of the ashift,
			 * which could cause us to think that this mapping
			 * is larger than we intended.
			 */
			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
			thismax = P2ROUNDUP(mylen / 2,
			    1 << spa->spa_max_ashift);
			ASSERT3U(thismax, <, mylen);
			/*
			 * The minimum-size allocation can not fail.
			 */
			ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
			*max_alloc = mylen - (1 << spa->spa_max_ashift);
		} else {
			ASSERT0(error);
			length -= mylen;
			offset += mylen;

			/*
			 * We've performed an allocation, so reset the
			 * alloc trace list.
			 */
			metaslab_trace_fini(&zal);
			metaslab_trace_init(&zal);
		}
	}
	metaslab_trace_fini(&zal);
}
/*
 * The removal thread operates in open context. It iterates over all
 * allocated space in the vdev, by loading each metaslab's spacemap.
 * For each contiguous segment of allocated space (capping the segment
 * size at SPA_MAXBLOCKSIZE), we:
 * - Allocate space for it on another vdev.
 * - Create a new mapping from the old location to the new location
 *   (as a record in svr_new_segments).
 * - Initiate a logical read zio to get the data off the removing disk.
 * - In the read zio's done callback, initiate a logical write zio to
 *   write it to the new vdev.
 * Note that all of this will take effect when a particular TXG syncs.
 * The sync thread ensures that all the phys reads and writes for the syncing
 * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
 * (see vdev_mapping_sync()).
 */
static void
spa_vdev_remove_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_copy_arg_t vca;
	uint64_t max_alloc = zfs_remove_max_segment;
	uint64_t last_txg = 0;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);

	/* The vdev must be a concrete top-level with removal in progress. */
	ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
	ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_removing);
	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT3P(svr->svr_vdev, ==, vd);
	ASSERT(vim != NULL);

	mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
	vca.vca_outstanding_bytes = 0;

	mutex_enter(&svr->svr_lock);

	/*
	 * Start from vim_max_offset so we pick up where we left off
	 * if we are restarting the removal after opening the pool.
	 */
	uint64_t msi;
	for (msi = start_offset >> vd->vdev_ms_shift;
	    msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
		metaslab_t *msp = vd->vdev_ms[msi];
		ASSERT3U(msi, <=, vd->vdev_ms_count);

		ASSERT0(range_tree_space(svr->svr_allocd_segs));

		/* Lock order: ms_sync_lock, then ms_lock (svr_lock held). */
		mutex_enter(&msp->ms_sync_lock);
		mutex_enter(&msp->ms_lock);

		/*
		 * Assert nothing in flight -- ms_*tree is empty.
		 */
		for (int i = 0; i < TXG_SIZE; i++) {
			ASSERT0(range_tree_space(msp->ms_alloctree[i]));
		}

		/*
		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
		 * read the allocated segments from the space map object
		 * into svr_allocd_segs. Since we do this while holding
		 * svr_lock and ms_sync_lock, concurrent frees (which
		 * would have modified the space map) will wait for us
		 * to finish loading the spacemap, and then take the
		 * appropriate action (see free_from_removing_vdev()).
		 */
		if (msp->ms_sm != NULL) {
			space_map_t *sm = NULL;

			/*
			 * We have to open a new space map here, because
			 * ms_sm's sm_length and sm_alloc may not reflect
			 * what's in the object contents, if we are in between
			 * metaslab_sync() and metaslab_sync_done().
			 */
			VERIFY0(space_map_open(&sm,
			    spa->spa_dsl_pool->dp_meta_objset,
			    msp->ms_sm->sm_object, msp->ms_sm->sm_start,
			    msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
			space_map_update(sm);
			VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
			    SM_ALLOC));
			space_map_close(sm);

			/* Segments being freed this txg must not be copied. */
			range_tree_walk(msp->ms_freeingtree,
			    range_tree_remove, svr->svr_allocd_segs);

			/*
			 * When we are resuming from a paused removal (i.e.
			 * when importing a pool with a removal in progress),
			 * discard any state that we have already processed.
			 */
			range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
		}
		mutex_exit(&msp->ms_lock);
		mutex_exit(&msp->ms_sync_lock);

		vca.vca_msp = msp;
		zfs_dbgmsg("copying %llu segments for metaslab %llu",
		    avl_numnodes(&svr->svr_allocd_segs->rt_root),
		    msp->ms_id);

		while (!svr->svr_thread_exit &&
		    range_tree_space(svr->svr_allocd_segs) != 0) {

			mutex_exit(&svr->svr_lock);

			/*
			 * Throttle: don't let too many copy bytes be in
			 * flight at once; the zio done callbacks signal
			 * vca_cv as bytes complete.
			 */
			mutex_enter(&vca.vca_lock);
			while (vca.vca_outstanding_bytes >
			    zfs_remove_max_copy_bytes) {
				cv_wait(&vca.vca_cv, &vca.vca_lock);
			}
			mutex_exit(&vca.vca_lock);

			/* A MOS-directory tx; the copies hang off its txg. */
			dmu_tx_t *tx =
			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);

			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
			uint64_t txg = dmu_tx_get_txg(tx);

			/*
			 * Reset the allocation cap each new txg; ENOSPC
			 * backoff in spa_vdev_copy_impl() only applies
			 * within a single txg.
			 */
			if (txg != last_txg)
				max_alloc = zfs_remove_max_segment;
			last_txg = txg;

			spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);

			dmu_tx_commit(tx);
			mutex_enter(&svr->svr_lock);
		}
	}

	mutex_exit(&svr->svr_lock);
	/*
	 * Wait for all copies to finish before cleaning up the vca.
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);
	ASSERT0(vca.vca_outstanding_bytes);

	mutex_destroy(&vca.vca_lock);
	cv_destroy(&vca.vca_cv);

	if (svr->svr_thread_exit) {
		/* Canceled/suspended: discard state and wake any waiters. */
		mutex_enter(&svr->svr_lock);
		range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
		svr->svr_thread = NULL;
		cv_broadcast(&svr->svr_cv);
		mutex_exit(&svr->svr_lock);
	} else {
		ASSERT0(range_tree_space(svr->svr_allocd_segs));
		vdev_remove_complete(vd);
	}
}
/*
 * Suspend the removal thread: request exit via svr_thread_exit and
 * block until the thread observes the flag and clears svr_thread
 * (see the exit path of spa_vdev_remove_thread()). Safe to call when
 * no removal is in progress.
 */
void
spa_vdev_remove_suspend(spa_t *spa)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;

	/* No removal in progress -- nothing to suspend. */
	if (svr == NULL)
		return;

	mutex_enter(&svr->svr_lock);
	svr->svr_thread_exit = B_TRUE;
	while (svr->svr_thread != NULL)
		cv_wait(&svr->svr_cv, &svr->svr_lock);
	/* Re-arm the flag so a later restart of the thread runs normally. */
	svr->svr_thread_exit = B_FALSE;
	mutex_exit(&svr->svr_lock);
}

/*
 * dsl_sync_task check function for cancelling a removal: fails with
 * ENOTACTIVE if no device removal is in progress.
 */
/* ARGSUSED */
static int
spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (spa->spa_vdev_removal == NULL)
		return (ENOTACTIVE);
	return (0);
}
/*
 * Cancel a removal by freeing all entries from the partial mapping
 * and marking the vdev as no longer being removing.
 *
 * Runs in syncing context; the removal thread must already have been
 * stopped (see spa_vdev_remove_cancel()).
 */
/* ARGSUSED */
static void
spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_t *vd = svr->svr_vdev;
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	objset_t *mos = spa->spa_meta_objset;

	ASSERT3P(svr->svr_thread, ==, NULL);

	/* Undo the feature-flag accounting done when removal started. */
	spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
	if (vdev_obsolete_counts_are_precise(vd)) {
		spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
	}

	/* Free and detach the obsolete space map, if one was created. */
	if (vdev_obsolete_sm_object(vd) != 0) {
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT3U(vdev_obsolete_sm_object(vd), ==,
		    space_map_object(vd->vdev_obsolete_sm));

		space_map_free(vd->vdev_obsolete_sm, tx);
		VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
		space_map_close(vd->vdev_obsolete_sm);
		vd->vdev_obsolete_sm = NULL;
		spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	}
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&svr->svr_new_segments[i]));
		ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
		    vdev_indirect_mapping_max_offset(vim));
	}

	/*
	 * Walk the metaslabs that were (at least partially) copied and
	 * free every mapped segment back onto the original vdev.
	 */
	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
		metaslab_t *msp = vd->vdev_ms[msi];

		/* Metaslabs past the mapped offset were never copied. */
		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
			break;

		ASSERT0(range_tree_space(svr->svr_allocd_segs));

		mutex_enter(&msp->ms_lock);

		/*
		 * Assert nothing in flight -- ms_*tree is empty.
		 */
		for (int i = 0; i < TXG_SIZE; i++)
			ASSERT0(range_tree_space(msp->ms_alloctree[i]));
		for (int i = 0; i < TXG_DEFER_SIZE; i++)
			ASSERT0(range_tree_space(msp->ms_defertree[i]));
		ASSERT0(range_tree_space(msp->ms_freedtree));

		if (msp->ms_sm != NULL) {
			/*
			 * Assert that the in-core spacemap has the same
			 * length as the on-disk one, so we can use the
			 * existing in-core spacemap to load it from disk.
			 */
			ASSERT3U(msp->ms_sm->sm_alloc, ==,
			    msp->ms_sm->sm_phys->smp_alloc);
			ASSERT3U(msp->ms_sm->sm_length, ==,
			    msp->ms_sm->sm_phys->smp_objsize);

			mutex_enter(&svr->svr_lock);
			VERIFY0(space_map_load(msp->ms_sm,
			    svr->svr_allocd_segs, SM_ALLOC));
			range_tree_walk(msp->ms_freeingtree,
			    range_tree_remove, svr->svr_allocd_segs);

			/*
			 * Clear everything past what has been synced,
			 * because we have not allocated mappings for it yet.
			 */
			uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
			range_tree_clear(svr->svr_allocd_segs, syncd,
			    msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);

			mutex_exit(&svr->svr_lock);
		}
		mutex_exit(&msp->ms_lock);

		mutex_enter(&svr->svr_lock);
		range_tree_vacate(svr->svr_allocd_segs,
		    free_mapped_segment_cb, vd);
		mutex_exit(&svr->svr_lock);
	}

	/*
	 * Note: this must happen after we invoke free_mapped_segment_cb,
	 * because it adds to the obsolete_segments.
	 */
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);

	/* Tear down and free the on-disk mapping and births objects. */
	ASSERT3U(vic->vic_mapping_object, ==,
	    vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = 0;

	ASSERT3U(vic->vic_births_object, ==,
	    vdev_indirect_births_object(vd->vdev_indirect_births));
	vdev_indirect_births_close(vd->vdev_indirect_births);
	vd->vdev_indirect_births = NULL;
	vdev_indirect_births_free(mos, vic->vic_births_object, tx);
	vic->vic_births_object = 0;

	/*
	 * We may have processed some frees from the removing vdev in this
	 * txg, thus increasing svr_bytes_done; discard that here to
	 * satisfy the assertions in spa_vdev_removal_destroy().
	 * Note that future txg's can not have any bytes_done, because
	 * future TXG's are only modified from open context, and we have
	 * already shut down the copying thread.
	 */
	svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
	spa_finish_removal(spa, DSS_CANCELED, tx);

	vd->vdev_removing = B_FALSE;
	vdev_config_dirty(vd);

	zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
	    vd->vdev_id, dmu_tx_get_txg(tx));
	spa_history_log_internal(spa, "vdev remove canceled", tx,
	    "%s vdev %llu %s", spa_name(spa),
	    vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-");
}
/*
 * Cancel an in-progress device removal: stop the copy thread, then run
 * spa_vdev_remove_cancel_sync as a sync task to unwind the partial
 * mapping, and finally reactivate the vdev's metaslab group so it can
 * be allocated from again. Returns ENOTACTIVE if no removal is active.
 */
int
spa_vdev_remove_cancel(spa_t *spa)
{
	/* Stop the removal thread before attempting the sync task. */
	spa_vdev_remove_suspend(spa);

	if (spa->spa_vdev_removal == NULL)
		return (ENOTACTIVE);

	/*
	 * Capture the vdev id now; spa_vdev_removal is destroyed by the
	 * sync task, so look the vdev up again afterwards by id.
	 */
	uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;

	int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
	    spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);

	if (error == 0) {
		spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
		vdev_t *vd = vdev_lookup_top(spa, vdid);
		metaslab_group_activate(vd->vdev_mg);
		spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
	}

	return (error);
}

/*
 * Called every sync pass of every txg if there's a svr.
 *
 * Folds the open-context progress counter for this txg into the
 * persistent removal state and syncs it to disk.
 */
void
svr_sync(spa_t *spa, dmu_tx_t *tx)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * This check is necessary so that we do not dirty the
	 * DIRECTORY_OBJECT via spa_sync_removing_state() when there
	 * is nothing to do.  Dirtying it every time would prevent us
	 * from syncing-to-convergence.
	 */
	if (svr->svr_bytes_done[txgoff] == 0)
		return;

	/*
	 * Update progress accounting.
	 */
	spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
	svr->svr_bytes_done[txgoff] = 0;

	spa_sync_removing_state(spa, tx);
}
1507*5cabbc6bSPrashanth Sreenivasa */ 1508*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; 1509*5cabbc6bSPrashanth Sreenivasa svr->svr_bytes_done[txgoff] = 0; 1510*5cabbc6bSPrashanth Sreenivasa 1511*5cabbc6bSPrashanth Sreenivasa spa_sync_removing_state(spa, tx); 1512*5cabbc6bSPrashanth Sreenivasa } 1513*5cabbc6bSPrashanth Sreenivasa 1514*5cabbc6bSPrashanth Sreenivasa static void 1515*5cabbc6bSPrashanth Sreenivasa vdev_remove_make_hole_and_free(vdev_t *vd) 1516*5cabbc6bSPrashanth Sreenivasa { 1517*5cabbc6bSPrashanth Sreenivasa uint64_t id = vd->vdev_id; 1518*5cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 1519*5cabbc6bSPrashanth Sreenivasa vdev_t *rvd = spa->spa_root_vdev; 1520*5cabbc6bSPrashanth Sreenivasa boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 1521*5cabbc6bSPrashanth Sreenivasa 1522*5cabbc6bSPrashanth Sreenivasa ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1523*5cabbc6bSPrashanth Sreenivasa ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1524*5cabbc6bSPrashanth Sreenivasa 1525*5cabbc6bSPrashanth Sreenivasa vdev_free(vd); 1526*5cabbc6bSPrashanth Sreenivasa 1527*5cabbc6bSPrashanth Sreenivasa if (last_vdev) { 1528*5cabbc6bSPrashanth Sreenivasa vdev_compact_children(rvd); 1529*5cabbc6bSPrashanth Sreenivasa } else { 1530*5cabbc6bSPrashanth Sreenivasa vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 1531*5cabbc6bSPrashanth Sreenivasa vdev_add_child(rvd, vd); 1532*5cabbc6bSPrashanth Sreenivasa } 1533*5cabbc6bSPrashanth Sreenivasa vdev_config_dirty(rvd); 1534*5cabbc6bSPrashanth Sreenivasa 1535*5cabbc6bSPrashanth Sreenivasa /* 1536*5cabbc6bSPrashanth Sreenivasa * Reassess the health of our root vdev. 1537*5cabbc6bSPrashanth Sreenivasa */ 1538*5cabbc6bSPrashanth Sreenivasa vdev_reopen(rvd); 1539*5cabbc6bSPrashanth Sreenivasa } 1540*5cabbc6bSPrashanth Sreenivasa 1541*5cabbc6bSPrashanth Sreenivasa /* 1542*5cabbc6bSPrashanth Sreenivasa * Remove a log device. 
 * The config lock is held for the specified TXG.  On success the log
 * device is evacuated (via spa_reset_logs()) and torn out of the vdev
 * namespace; on failure its metaslab group is re-activated and an
 * errno is returned.
 */
static int
spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
{
	metaslab_group_t *mg = vd->vdev_mg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	ASSERT(vd->vdev_islog);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Stop allocating from this vdev.
	 */
	metaslab_group_passivate(mg);

	/*
	 * Wait for the youngest allocations and frees to sync,
	 * and then wait for the deferral of those frees to finish.
	 */
	spa_vdev_config_exit(spa, NULL,
	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

	/*
	 * Evacuate the device.  We don't hold the config lock as writer
	 * since we need to do I/O but we do keep the
	 * spa_namespace_lock held.  Once this completes the device
	 * should no longer have any blocks allocated on it.
	 */
	if (vd->vdev_islog) {
		/*
		 * NOTE(review): vdev_islog was already asserted above, so
		 * this outer check is redundant; only reset the logs when
		 * the device still has blocks allocated.
		 */
		if (vd->vdev_stat.vs_alloc != 0)
			error = spa_reset_logs(spa);
	}

	/* Re-take the config lock dropped above. */
	*txg = spa_vdev_config_enter(spa);

	if (error != 0) {
		/* Evacuation failed: resume allocating from this vdev. */
		metaslab_group_activate(mg);
		return (error);
	}
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * The evacuation succeeded.  Remove any remaining MOS metadata
	 * associated with this vdev, and wait for these changes to sync.
	 */
	vd->vdev_removing = B_TRUE;

	vdev_dirty_leaves(vd, VDD_DTL, *txg);
	vdev_config_dirty(vd);

	spa_history_log_internal(spa, "vdev remove", NULL,
	    "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");

	/* Make sure these changes are sync'ed */
	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);

	*txg = spa_vdev_config_enter(spa);

	sysevent_t *ev = spa_event_create(spa, vd, NULL,
	    ESC_ZFS_VDEV_REMOVE_DEV);
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/* The top ZAP should have been destroyed by vdev_remove_empty. */
	ASSERT0(vd->vdev_top_zap);
	/* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
	ASSERT0(vd->vdev_leaf_zap);

	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/* Unhook the vdev from the dirty lists before it is freed. */
	if (list_link_active(&vd->vdev_state_dirty_node))
		vdev_state_clean(vd);
	if (list_link_active(&vd->vdev_config_dirty_node))
		vdev_config_clean(vd);

	/*
	 * Clean up the vdev namespace.
	 */
	vdev_remove_make_hole_and_free(vd);

	if (ev != NULL)
		spa_event_post(ev);

	return (0);
}

/*
 * Validate that the given top-level vdev may be removed from the pool.
 * Returns 0 if removal may proceed, otherwise an errno describing why
 * it may not.
 */
static int
spa_vdev_remove_top_check(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	if (vd != vd->vdev_top)
		return (SET_ERROR(ENOTSUP));

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
		return (SET_ERROR(ENOTSUP));

	/*
	 * There has to be enough free space to remove the
	 * device and leave double the "slop" space (i.e. we
	 * must leave at least 3% of the pool free, in addition to
	 * the normal slop space).
	 */
	if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
	    NULL, 0, B_TRUE) <
	    vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
		return (SET_ERROR(ENOSPC));
	}

	/*
	 * There can not be a removal in progress.
	 */
	if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
		return (SET_ERROR(EBUSY));

	/*
	 * The device must have all its data.
	 */
	if (!vdev_dtl_empty(vd, DTL_MISSING) ||
	    !vdev_dtl_empty(vd, DTL_OUTAGE))
		return (SET_ERROR(EBUSY));

	/*
	 * The device must be healthy.
	 */
	if (!vdev_readable(vd))
		return (SET_ERROR(EIO));

	/*
	 * All vdevs in normal class must have the same ashift.
	 */
	if (spa->spa_max_ashift != spa->spa_min_ashift) {
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Walk all top-level vdevs: no concrete vdev may be raidz, and
	 * any mirror must be composed of leaf vdevs only.
	 */
	vdev_t *rvd = spa->spa_root_vdev;
	int num_indirect = 0;	/* NOTE(review): counted but never used here */
	for (uint64_t id = 0; id < rvd->vdev_children; id++) {
		vdev_t *cvd = rvd->vdev_child[id];
		/* Log vdevs are exempt from the uniform-ashift invariant. */
		if (cvd->vdev_ashift != 0 && !cvd->vdev_islog)
			ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
		if (cvd->vdev_ops == &vdev_indirect_ops)
			num_indirect++;
		if (!vdev_is_concrete(cvd))
			continue;
		if (cvd->vdev_ops == &vdev_raidz_ops)
			return (SET_ERROR(EINVAL));
		/*
		 * Need the mirror to be mirror of leaf vdevs only
		 */
		if (cvd->vdev_ops == &vdev_mirror_ops) {
			for (uint64_t cid = 0;
			    cid < cvd->vdev_children; cid++) {
				vdev_t *tmp = cvd->vdev_child[cid];
				if (!tmp->vdev_ops->vdev_op_leaf)
					return (SET_ERROR(EINVAL));
			}
		}
	}

	return (0);
}

/*
 * Initiate removal of a top-level vdev, reducing the total space in the pool.
 * The config lock is held for the specified TXG.
 * Once initiated,
 * evacuation of all allocated space (copying it to other vdevs) happens
 * in the background (see spa_vdev_remove_thread()), and can be canceled
 * (see spa_vdev_remove_cancel()).  If successful, the vdev will
 * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
 */
static int
spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
{
	spa_t *spa = vd->vdev_spa;
	int error;

	/*
	 * Check for errors up-front, so that we don't waste time
	 * passivating the metaslab group and clearing the ZIL if there
	 * are errors.
	 */
	error = spa_vdev_remove_top_check(vd);
	if (error != 0)
		return (error);

	/*
	 * Stop allocating from this vdev.  Note that we must check
	 * that this is not the only device in the pool before
	 * passivating, otherwise we will not be able to make
	 * progress because we can't allocate from any vdevs.
	 * The above check for sufficient free space serves this
	 * purpose.
	 */
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_group_passivate(mg);

	/*
	 * Wait for the youngest allocations and frees to sync,
	 * and then wait for the deferral of those frees to finish.
	 */
	spa_vdev_config_exit(spa, NULL,
	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

	/*
	 * We must ensure that no "stubby" log blocks are allocated
	 * on the device to be removed.  These blocks could be
	 * written at any time, including while we are in the middle
	 * of copying them.
	 */
	error = spa_reset_logs(spa);

	/* Re-take the config lock dropped above. */
	*txg = spa_vdev_config_enter(spa);

	/*
	 * Things might have changed while the config lock was dropped
	 * (e.g. space usage).  Check for errors again.
	 */
	if (error == 0)
		error = spa_vdev_remove_top_check(vd);

	if (error != 0) {
		/* Can't proceed: let allocations resume on this vdev. */
		metaslab_group_activate(mg);
		return (error);
	}

	vd->vdev_removing = B_TRUE;

	vdev_dirty_leaves(vd, VDD_DTL, *txg);
	vdev_config_dirty(vd);

	/*
	 * Dispatch vdev_remove_initiate_sync as a sync task in this txg;
	 * the copy itself proceeds in the background afterwards.
	 */
	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
	dsl_sync_task_nowait(spa->spa_dsl_pool,
	    vdev_remove_initiate_sync,
	    vd, 0, ZFS_SPACE_CHECK_NONE, tx);
	dmu_tx_commit(tx);

	return (0);
}

/*
 * Remove a device from the pool.
 *
 * Removing a device from the vdev namespace requires several steps
 * and can take a significant amount of time.  As a result we use
 * the spa_vdev_config_[enter/exit] functions which allow us to
 * grab and release the spa_config_lock while still holding the namespace
 * lock.  During each step the configuration is synced out.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, **l2cache, *nv;
	uint64_t txg = 0;
	uint_t nspares, nl2cache;
	int error = 0;
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
	sysevent_t *ev = NULL;

	ASSERT(spa_writeable(spa));

	if (!locked)
		txg = spa_vdev_enter(spa);

	/*
	 * NOTE(review): B_FALSE here appears to exclude aux devices from
	 * the lookup -- the spare/l2cache paths below re-do the lookup
	 * with B_TRUE; confirm against spa_lookup_by_guid().
	 */
	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.
		 */
		if (vd == NULL || unspare) {
			char *nvstr = fnvlist_lookup_string(nv,
			    ZPOOL_CONFIG_PATH);
			spa_history_log_internal(spa, "vdev remove", NULL,
			    "%s vdev (%s) %s", spa_name(spa),
			    VDEV_TYPE_SPARE, nvstr);
			if (vd == NULL)
				vd = spa_lookup_by_guid(spa, guid, B_TRUE);
			ev = spa_event_create(spa, vd, NULL,
			    ESC_ZFS_VDEV_REMOVE_AUX);
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			/* Spare is in use by this pool; refuse to remove. */
			error = SET_ERROR(EBUSY);
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
		spa_history_log_internal(spa, "vdev remove", NULL,
		    "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
		/*
		 * Cache devices can always be removed.
		 */
		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL && vd->vdev_islog) {
		/* Log devices use the multi-step config-lock dance. */
		ASSERT(!locked);
		error = spa_vdev_remove_log(vd, &txg);
	} else if (vd != NULL) {
		/* Ordinary top-level vdev: initiate background removal. */
		ASSERT(!locked);
		error = spa_vdev_remove_top(vd, &txg);
	} else {
		/*
		 * There is no vdev of any kind with the specified guid.
1865*5cabbc6bSPrashanth Sreenivasa */ 1866*5cabbc6bSPrashanth Sreenivasa error = SET_ERROR(ENOENT); 1867*5cabbc6bSPrashanth Sreenivasa } 1868*5cabbc6bSPrashanth Sreenivasa 1869*5cabbc6bSPrashanth Sreenivasa if (!locked) 1870*5cabbc6bSPrashanth Sreenivasa error = spa_vdev_exit(spa, NULL, txg, error); 1871*5cabbc6bSPrashanth Sreenivasa 1872*5cabbc6bSPrashanth Sreenivasa if (ev != NULL) { 1873*5cabbc6bSPrashanth Sreenivasa if (error != 0) { 1874*5cabbc6bSPrashanth Sreenivasa spa_event_discard(ev); 1875*5cabbc6bSPrashanth Sreenivasa } else { 1876*5cabbc6bSPrashanth Sreenivasa spa_event_post(ev); 1877*5cabbc6bSPrashanth Sreenivasa } 1878*5cabbc6bSPrashanth Sreenivasa } 1879*5cabbc6bSPrashanth Sreenivasa 1880*5cabbc6bSPrashanth Sreenivasa return (error); 1881*5cabbc6bSPrashanth Sreenivasa } 1882*5cabbc6bSPrashanth Sreenivasa 1883*5cabbc6bSPrashanth Sreenivasa int 1884*5cabbc6bSPrashanth Sreenivasa spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) 1885*5cabbc6bSPrashanth Sreenivasa { 1886*5cabbc6bSPrashanth Sreenivasa prs->prs_state = spa->spa_removing_phys.sr_state; 1887*5cabbc6bSPrashanth Sreenivasa 1888*5cabbc6bSPrashanth Sreenivasa if (prs->prs_state == DSS_NONE) 1889*5cabbc6bSPrashanth Sreenivasa return (SET_ERROR(ENOENT)); 1890*5cabbc6bSPrashanth Sreenivasa 1891*5cabbc6bSPrashanth Sreenivasa prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; 1892*5cabbc6bSPrashanth Sreenivasa prs->prs_start_time = spa->spa_removing_phys.sr_start_time; 1893*5cabbc6bSPrashanth Sreenivasa prs->prs_end_time = spa->spa_removing_phys.sr_end_time; 1894*5cabbc6bSPrashanth Sreenivasa prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; 1895*5cabbc6bSPrashanth Sreenivasa prs->prs_copied = spa->spa_removing_phys.sr_copied; 1896*5cabbc6bSPrashanth Sreenivasa 1897*5cabbc6bSPrashanth Sreenivasa if (spa->spa_vdev_removal != NULL) { 1898*5cabbc6bSPrashanth Sreenivasa for (int i = 0; i < TXG_SIZE; i++) { 1899*5cabbc6bSPrashanth Sreenivasa prs->prs_copied += 
1900*5cabbc6bSPrashanth Sreenivasa spa->spa_vdev_removal->svr_bytes_done[i]; 1901*5cabbc6bSPrashanth Sreenivasa } 1902*5cabbc6bSPrashanth Sreenivasa } 1903*5cabbc6bSPrashanth Sreenivasa 1904*5cabbc6bSPrashanth Sreenivasa prs->prs_mapping_memory = 0; 1905*5cabbc6bSPrashanth Sreenivasa uint64_t indirect_vdev_id = 1906*5cabbc6bSPrashanth Sreenivasa spa->spa_removing_phys.sr_prev_indirect_vdev; 1907*5cabbc6bSPrashanth Sreenivasa while (indirect_vdev_id != -1) { 1908*5cabbc6bSPrashanth Sreenivasa vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; 1909*5cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 1910*5cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1911*5cabbc6bSPrashanth Sreenivasa 1912*5cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 1913*5cabbc6bSPrashanth Sreenivasa prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); 1914*5cabbc6bSPrashanth Sreenivasa indirect_vdev_id = vic->vic_prev_indirect_vdev; 1915*5cabbc6bSPrashanth Sreenivasa } 1916*5cabbc6bSPrashanth Sreenivasa 1917*5cabbc6bSPrashanth Sreenivasa return (0); 1918*5cabbc6bSPrashanth Sreenivasa } 1919