15cabbc6Prashanth Sreenivasa/*
25cabbc6Prashanth Sreenivasa * CDDL HEADER START
35cabbc6Prashanth Sreenivasa *
45cabbc6Prashanth Sreenivasa * The contents of this file are subject to the terms of the
55cabbc6Prashanth Sreenivasa * Common Development and Distribution License (the "License").
65cabbc6Prashanth Sreenivasa * You may not use this file except in compliance with the License.
75cabbc6Prashanth Sreenivasa *
85cabbc6Prashanth Sreenivasa * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
95cabbc6Prashanth Sreenivasa * or http://www.opensolaris.org/os/licensing.
105cabbc6Prashanth Sreenivasa * See the License for the specific language governing permissions
115cabbc6Prashanth Sreenivasa * and limitations under the License.
125cabbc6Prashanth Sreenivasa *
135cabbc6Prashanth Sreenivasa * When distributing Covered Code, include this CDDL HEADER in each
145cabbc6Prashanth Sreenivasa * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
155cabbc6Prashanth Sreenivasa * If applicable, add the following below this CDDL HEADER, with the
165cabbc6Prashanth Sreenivasa * fields enclosed by brackets "[]" replaced with your own identifying
175cabbc6Prashanth Sreenivasa * information: Portions Copyright [yyyy] [name of copyright owner]
185cabbc6Prashanth Sreenivasa *
195cabbc6Prashanth Sreenivasa * CDDL HEADER END
205cabbc6Prashanth Sreenivasa */
215cabbc6Prashanth Sreenivasa
225cabbc6Prashanth Sreenivasa/*
235cabbc6Prashanth Sreenivasa * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
245cabbc6Prashanth Sreenivasa * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
255cabbc6Prashanth Sreenivasa */
265cabbc6Prashanth Sreenivasa
275cabbc6Prashanth Sreenivasa#include <sys/zfs_context.h>
285cabbc6Prashanth Sreenivasa#include <sys/spa_impl.h>
295cabbc6Prashanth Sreenivasa#include <sys/dmu.h>
305cabbc6Prashanth Sreenivasa#include <sys/dmu_tx.h>
315cabbc6Prashanth Sreenivasa#include <sys/zap.h>
325cabbc6Prashanth Sreenivasa#include <sys/vdev_impl.h>
335cabbc6Prashanth Sreenivasa#include <sys/metaslab.h>
345cabbc6Prashanth Sreenivasa#include <sys/metaslab_impl.h>
355cabbc6Prashanth Sreenivasa#include <sys/uberblock_impl.h>
365cabbc6Prashanth Sreenivasa#include <sys/txg.h>
375cabbc6Prashanth Sreenivasa#include <sys/avl.h>
385cabbc6Prashanth Sreenivasa#include <sys/bpobj.h>
395cabbc6Prashanth Sreenivasa#include <sys/dsl_pool.h>
405cabbc6Prashanth Sreenivasa#include <sys/dsl_synctask.h>
415cabbc6Prashanth Sreenivasa#include <sys/dsl_dir.h>
425cabbc6Prashanth Sreenivasa#include <sys/arc.h>
435cabbc6Prashanth Sreenivasa#include <sys/zfeature.h>
445cabbc6Prashanth Sreenivasa#include <sys/vdev_indirect_births.h>
455cabbc6Prashanth Sreenivasa#include <sys/vdev_indirect_mapping.h>
465cabbc6Prashanth Sreenivasa#include <sys/abd.h>
47094e47eGeorge Wilson#include <sys/vdev_initialize.h>
48084fd14Brian Behlendorf#include <sys/vdev_trim.h>
495cabbc6Prashanth Sreenivasa
505cabbc6Prashanth Sreenivasa/*
515cabbc6Prashanth Sreenivasa * This file contains the necessary logic to remove vdevs from a
525cabbc6Prashanth Sreenivasa * storage pool.  Currently, the only devices that can be removed
535cabbc6Prashanth Sreenivasa * are log, cache, and spare devices; and top level vdevs from a pool
545cabbc6Prashanth Sreenivasa * w/o raidz.  (Note that members of a mirror can also be removed
555cabbc6Prashanth Sreenivasa * by the detach operation.)
565cabbc6Prashanth Sreenivasa *
575cabbc6Prashanth Sreenivasa * Log vdevs are removed by evacuating them and then turning the vdev
585cabbc6Prashanth Sreenivasa * into a hole vdev while holding spa config locks.
595cabbc6Prashanth Sreenivasa *
605cabbc6Prashanth Sreenivasa * Top level vdevs are removed and converted into an indirect vdev via
615cabbc6Prashanth Sreenivasa * a multi-step process:
625cabbc6Prashanth Sreenivasa *
635cabbc6Prashanth Sreenivasa *  - Disable allocations from this device (spa_vdev_remove_top).
645cabbc6Prashanth Sreenivasa *
655cabbc6Prashanth Sreenivasa *  - From a new thread (spa_vdev_remove_thread), copy data from
665cabbc6Prashanth Sreenivasa *    the removing vdev to a different vdev.  The copy happens in open
675cabbc6Prashanth Sreenivasa *    context (spa_vdev_copy_impl) and issues a sync task
685cabbc6Prashanth Sreenivasa *    (vdev_mapping_sync) so the sync thread can update the partial
695cabbc6Prashanth Sreenivasa *    indirect mappings in core and on disk.
705cabbc6Prashanth Sreenivasa *
715cabbc6Prashanth Sreenivasa *  - If a free happens during a removal, it is freed from the
725cabbc6Prashanth Sreenivasa *    removing vdev, and if it has already been copied, from the new
735cabbc6Prashanth Sreenivasa *    location as well (free_from_removing_vdev).
745cabbc6Prashanth Sreenivasa *
755cabbc6Prashanth Sreenivasa *  - After the removal is completed, the copy thread converts the vdev
765cabbc6Prashanth Sreenivasa *    into an indirect vdev (vdev_remove_complete) before instructing
775cabbc6Prashanth Sreenivasa *    the sync thread to destroy the space maps and finish the removal
785cabbc6Prashanth Sreenivasa *    (spa_finish_removal).
795cabbc6Prashanth Sreenivasa */
805cabbc6Prashanth Sreenivasa
/*
 * Per-metaslab copy bookkeeping shared by the i/os issued while copying
 * data off the removing vdev (see the overview comment above; the
 * consumers are later in this file).  vca_lock presumably protects
 * vca_outstanding_bytes, with vca_cv signaled as copy i/os complete --
 * NOTE(review): confirm at the use sites, which are outside this chunk.
 */
typedef struct vdev_copy_arg {
	metaslab_t	*vca_msp;
	uint64_t	vca_outstanding_bytes;
	kcondvar_t	vca_cv;
	kmutex_t	vca_lock;
} vdev_copy_arg_t;
875cabbc6Prashanth Sreenivasa
/*
 * The maximum amount of memory we can use for outstanding i/o while
 * doing a device removal.  This determines how much i/o we can have
 * in flight concurrently.
 */
int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;

/*
 * The largest contiguous segment that we will attempt to allocate when
 * removing a device.  This can be no larger than SPA_MAXBLOCKSIZE.  If
 * there is a performance problem with attempting to allocate large blocks,
 * consider decreasing this.
 *
 * Note: we will issue I/Os of up to this size.  The mpt driver does not
 * respond well to I/Os larger than 1MB, so we set this to 1MB.  (When
 * mpt processes an I/O larger than 1MB, it needs to do an allocation of
 * 2 physically contiguous pages; if this allocation fails, mpt will drop
 * the I/O and hang the device.)
 */
int zfs_remove_max_segment = 1024 * 1024;

/*
 * Allow a remap segment to span free chunks of at most this size. The main
 * impact of a larger span is that we will read and write larger, more
 * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
 * for iops.  The value here was chosen to align with
 * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
 * reads (but there's no reason it has to be the same).
 *
 * Additionally, a higher span will have the following relatively minor
 * effects:
 *  - the mapping will be smaller, since one entry can cover more allocated
 *    segments
 *  - more of the fragmentation in the removing device will be preserved
 *  - we'll do larger allocations, which may fail and fall back on smaller
 *    allocations
 */
int vdev_removal_max_span = 32 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a removal.
 */
int zfs_removal_suspend_progress = 0;

/*
 * ZAP attribute name used by the removal code.  NOTE(review): the use
 * sites are later in this file; confirm semantics there.
 */
#define	VDEV_REMOVAL_ZAP_OBJS	"lzap"

/*
 * Open-context worker that copies data off the removing vdev (see the
 * overview comment at the top of this file).
 */
static void spa_vdev_remove_thread(void *arg);
1365cabbc6Prashanth Sreenivasa
/*
 * Persist the in-core removal status (spa->spa_removing_phys) to the
 * DMU_POOL_REMOVING entry of the MOS pool directory in this txg.  The
 * struct is stored as an array of uint64_t's.
 */
static void
spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
{
	VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_REMOVING, sizeof (uint64_t),
	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
	    &spa->spa_removing_phys, tx));
}
1465cabbc6Prashanth Sreenivasa
1475cabbc6Prashanth Sreenivasastatic nvlist_t *
1485cabbc6Prashanth Sreenivasaspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
1495cabbc6Prashanth Sreenivasa{
1505cabbc6Prashanth Sreenivasa	for (int i = 0; i < count; i++) {
1515cabbc6Prashanth Sreenivasa		uint64_t guid =
1525cabbc6Prashanth Sreenivasa		    fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID);
1535cabbc6Prashanth Sreenivasa
1545cabbc6Prashanth Sreenivasa		if (guid == target_guid)
1555cabbc6Prashanth Sreenivasa			return (nvpp[i]);
1565cabbc6Prashanth Sreenivasa	}
1575cabbc6Prashanth Sreenivasa
1585cabbc6Prashanth Sreenivasa	return (NULL);
1595cabbc6Prashanth Sreenivasa}
1605cabbc6Prashanth Sreenivasa
/*
 * Replace the nvlist array stored under "name" (e.g. the spares or
 * l2cache config) in "config" with a copy that omits "dev_to_remove".
 * The surviving count - 1 entries are duplicated into a temporary array,
 * installed in the config, and the duplicates are then freed.
 *
 * NOTE(review): when count == 1, newdev stays NULL and an empty array
 * (NULL, 0) is installed -- presumably nvlist_add_nvlist_array accepts
 * that combination; confirm against libnvpair.
 */
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
    nvlist_t *dev_to_remove)
{
	nvlist_t **newdev = NULL;

	if (count > 1)
		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);

	/* Duplicate every entry except the one being removed. */
	for (int i = 0, j = 0; i < count; i++) {
		if (dev[i] == dev_to_remove)
			continue;
		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
	}

	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);

	/* The add above copies the entries, so release our duplicates. */
	for (int i = 0; i < count - 1; i++)
		nvlist_free(newdev[i]);

	if (count > 1)
		kmem_free(newdev, (count - 1) * sizeof (void *));
}
1855cabbc6Prashanth Sreenivasa
/*
 * Allocate and initialize the in-core state describing an active removal
 * of top-level vdev "vd": the lock/cv pair, the range tree of allocated
 * segments (svr_allocd_segs), and per-txg free range trees and lists of
 * new indirect-mapping entries.  Only the vdev id is recorded, not the
 * vdev_t pointer itself.
 */
static spa_vdev_removal_t *
spa_vdev_removal_create(vdev_t *vd)
{
	spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
	mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
	svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	svr->svr_vdev_id = vd->vdev_id;

	for (int i = 0; i < TXG_SIZE; i++) {
		svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL,
		    0, 0);
		list_create(&svr->svr_new_segments[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	return (svr);
}
2055cabbc6Prashanth Sreenivasa
/*
 * Tear down a spa_vdev_removal_t created by spa_vdev_removal_create().
 * All per-txg bookkeeping must already be drained; the asserts check
 * that no bytes-done or max-offset-to-sync state remains for any txg.
 */
void
spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
{
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT0(svr->svr_bytes_done[i]);
		ASSERT0(svr->svr_max_offset_to_sync[i]);
		range_tree_destroy(svr->svr_frees[i]);
		list_destroy(&svr->svr_new_segments[i]);
	}

	range_tree_destroy(svr->svr_allocd_segs);
	mutex_destroy(&svr->svr_lock);
	cv_destroy(&svr->svr_cv);
	kmem_free(svr, sizeof (*svr));
}
2215cabbc6Prashanth Sreenivasa
2225cabbc6Prashanth Sreenivasa/*
2235cabbc6Prashanth Sreenivasa * This is called as a synctask in the txg in which we will mark this vdev
2245cabbc6Prashanth Sreenivasa * as removing (in the config stored in the MOS).
2255cabbc6Prashanth Sreenivasa *
2265cabbc6Prashanth Sreenivasa * It begins the evacuation of a toplevel vdev by:
2275cabbc6Prashanth Sreenivasa * - initializing the spa_removing_phys which tracks this removal
2285cabbc6Prashanth Sreenivasa * - computing the amount of space to remove for accounting purposes
2295cabbc6Prashanth Sreenivasa * - dirtying all dbufs in the spa_config_object
2305cabbc6Prashanth Sreenivasa * - creating the spa_vdev_removal
2315cabbc6Prashanth Sreenivasa * - starting the spa_vdev_remove_thread
2325cabbc6Prashanth Sreenivasa */
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
	/* arg is the top-level vdev id smuggled through the void pointer. */
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
	spa_vdev_removal_t *svr = NULL;
	uint64_t txg = dmu_tx_get_txg(tx);

	ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
	svr = spa_vdev_removal_create(vd);

	ASSERT(vd->vdev_removing);
	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		/*
		 * By activating the OBSOLETE_COUNTS feature, we prevent
		 * the pool from being downgraded and ensure that the
		 * refcounts are precise.
		 */
		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		uint64_t one = 1;
		VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
		    &one, tx));
		ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
	}

	/*
	 * Allocate and open the on-disk objects that will hold the
	 * indirect mapping and its birth times, then initialize the
	 * persistent removal status for this removal.
	 */
	vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
	vd->vdev_indirect_mapping =
	    vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
	vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
	vd->vdev_indirect_births =
	    vdev_indirect_births_open(mos, vic->vic_births_object);
	spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
	spa->spa_removing_phys.sr_start_time = gethrestime_sec();
	spa->spa_removing_phys.sr_end_time = 0;
	spa->spa_removing_phys.sr_state = DSS_SCANNING;
	spa->spa_removing_phys.sr_to_copy = 0;
	spa->spa_removing_phys.sr_copied = 0;

	/*
	 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
	 * there may be space in the defer tree, which is free, but still
	 * counted in vs_alloc.
	 */
	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
		metaslab_t *ms = vd->vdev_ms[i];
		/* Metaslabs with no space map have never had allocations. */
		if (ms->ms_sm == NULL)
			continue;

		spa->spa_removing_phys.sr_to_copy +=
		    metaslab_allocated_space(ms);

		/*
		 * Space which we are freeing this txg does not need to
		 * be copied.
		 */
		spa->spa_removing_phys.sr_to_copy -=
		    range_tree_space(ms->ms_freeing);

		ASSERT0(range_tree_space(ms->ms_freed));
		for (int t = 0; t < TXG_SIZE; t++)
			ASSERT0(range_tree_space(ms->ms_allocating[t]));
	}

	/*
	 * Sync tasks are called before metaslab_sync(), so there should
	 * be no already-synced metaslabs in the TXG_CLEAN list.
	 */
	ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);

	spa_sync_removing_state(spa, tx);

	/*
	 * All blocks that we need to read the most recent mapping must be
	 * stored on concrete vdevs.  Therefore, we must dirty anything that
	 * is read before spa_remove_init().  Specifically, the
	 * spa_config_object.  (Note that although we already modified the
	 * spa_config_object in spa_sync_removing_state, that may not have
	 * modified all blocks of the object.)
	 */
	dmu_object_info_t doi;
	VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
	for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
		dmu_buf_t *dbuf;
		VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
		    offset, FTAG, &dbuf, 0));
		dmu_buf_will_dirty(dbuf, tx);
		offset += dbuf->db_size;
		dmu_buf_rele(dbuf, FTAG);
	}

	/*
	 * Now that we've allocated the im_object, dirty the vdev to ensure
	 * that the object gets written to the config on disk.
	 */
	vdev_config_dirty(vd);

	zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu "
	    "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx),
	    vic->vic_mapping_object);

	spa_history_log_internal(spa, "vdev remove started", tx,
	    "%s vdev %llu %s", spa_name(spa), vd->vdev_id,
	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
	/*
	 * Setting spa_vdev_removal causes subsequent frees to call
	 * free_from_removing_vdev().  Note that we don't need any locking
	 * because we are the sync thread, and metaslab_free_impl() is only
	 * called from syncing context (potentially from a zio taskq thread,
	 * but in any case only when there are outstanding free i/os, which
	 * there are not).
	 */
	ASSERT3P(spa->spa_vdev_removal, ==, NULL);
	spa->spa_vdev_removal = svr;
	svr->svr_thread = thread_create(NULL, 0,
	    spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}
3565cabbc6Prashanth Sreenivasa
3575cabbc6Prashanth Sreenivasa/*
3585cabbc6Prashanth Sreenivasa * When we are opening a pool, we must read the mapping for each
3595cabbc6Prashanth Sreenivasa * indirect vdev in order from most recently removed to least
3605cabbc6Prashanth Sreenivasa * recently removed.  We do this because the blocks for the mapping
3615cabbc6Prashanth Sreenivasa * of older indirect vdevs may be stored on more recently removed vdevs.
3625cabbc6Prashanth Sreenivasa * In order to read each indirect mapping object, we must have
3635cabbc6Prashanth Sreenivasa * initialized all more recently removed vdevs.
3645cabbc6Prashanth Sreenivasa */
int
spa_remove_init(spa_t *spa)
{
	int error;

	/* Read the persistent removal status, if any, from the MOS. */
	error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_REMOVING, sizeof (uint64_t),
	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
	    &spa->spa_removing_phys);

	if (error == ENOENT) {
		/*
		 * No removal has ever been performed on this pool; there
		 * are no indirect vdevs to load, so mark them loaded now.
		 */
		spa->spa_removing_phys.sr_state = DSS_NONE;
		spa->spa_removing_phys.sr_removing_vdev = -1;
		spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
		spa->spa_indirect_vdevs_loaded = B_TRUE;
		return (0);
	} else if (error != 0) {
		return (error);
	}

	if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
		/*
		 * We are currently removing a vdev.  Create and
		 * initialize a spa_vdev_removal_t from the bonus
		 * buffer of the removing vdevs vdev_im_object, and
		 * initialize its partial mapping.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		vdev_t *vd = vdev_lookup_top(spa,
		    spa->spa_removing_phys.sr_removing_vdev);

		if (vd == NULL) {
			spa_config_exit(spa, SCL_STATE, FTAG);
			return (EINVAL);
		}

		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		ASSERT(vdev_is_concrete(vd));
		spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
		ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
		ASSERT(vd->vdev_removing);

		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
		    spa->spa_meta_objset, vic->vic_mapping_object);
		vd->vdev_indirect_births = vdev_indirect_births_open(
		    spa->spa_meta_objset, vic->vic_births_object);
		spa_config_exit(spa, SCL_STATE, FTAG);

		spa->spa_vdev_removal = svr;
	}

	/*
	 * Walk the chain of previously-removed indirect vdevs from most
	 * recently removed to least recently removed (see the comment
	 * above this function for why order matters), opening each one's
	 * mapping and births objects.  UINT64_MAX terminates the chain.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	uint64_t indirect_vdev_id =
	    spa->spa_removing_phys.sr_prev_indirect_vdev;
	while (indirect_vdev_id != UINT64_MAX) {
		vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
		    spa->spa_meta_objset, vic->vic_mapping_object);
		vd->vdev_indirect_births = vdev_indirect_births_open(
		    spa->spa_meta_objset, vic->vic_births_object);

		indirect_vdev_id = vic->vic_prev_indirect_vdev;
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	/*
	 * Now that we've loaded all the indirect mappings, we can allow
	 * reads from other blocks (e.g. via predictive prefetch).
	 */
	spa->spa_indirect_vdevs_loaded = B_TRUE;
	return (0);
}
4425cabbc6Prashanth Sreenivasa
4435cabbc6Prashanth Sreenivasavoid
4445cabbc6Prashanth Sreenivasaspa_restart_removal(spa_t *spa)
4455cabbc6Prashanth Sreenivasa{
4465cabbc6Prashanth Sreenivasa	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
4475cabbc6Prashanth Sreenivasa
4485cabbc6Prashanth Sreenivasa	if (svr == NULL)
4495cabbc6Prashanth Sreenivasa		return;
4505cabbc6Prashanth Sreenivasa
4515cabbc6Prashanth Sreenivasa	/*
4525cabbc6Prashanth Sreenivasa	 * In general when this function is called there is no
4535cabbc6Prashanth Sreenivasa	 * removal thread running. The only scenario where this
4545cabbc6Prashanth Sreenivasa	 * is not true is during spa_import() where this function
4555cabbc6Prashanth Sreenivasa	 * is called twice [once from spa_import_impl() and
4565cabbc6Prashanth Sreenivasa	 * spa_async_resume()]. Thus, in the scenario where we
4575cabbc6Prashanth Sreenivasa	 * import a pool that has an ongoing removal we don't
4585cabbc6Prashanth Sreenivasa	 * want to spawn a second thread.
4595cabbc6Prashanth Sreenivasa	 */
4605cabbc6Prashanth Sreenivasa	if (svr->svr_thread != NULL)
4615cabbc6Prashanth Sreenivasa		return;
4625cabbc6Prashanth Sreenivasa
4635cabbc6Prashanth Sreenivasa	if (!spa_writeable(spa))
4645cabbc6Prashanth Sreenivasa		return;
4655cabbc6Prashanth Sreenivasa
4663a4b1beMatthew Ahrens	zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
4673a4b1beMatthew Ahrens	svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
4685cabbc6Prashanth Sreenivasa	    0, &p0, TS_RUN, minclsyspri);
4695cabbc6Prashanth Sreenivasa}
4705cabbc6Prashanth Sreenivasa
4715cabbc6Prashanth Sreenivasa/*
4725cabbc6Prashanth Sreenivasa * Process freeing from a device which is in the middle of being removed.
4735cabbc6Prashanth Sreenivasa * We must handle this carefully so that we attempt to copy freed data,
4745cabbc6Prashanth Sreenivasa * and we correctly free already-copied data.
4755cabbc6Prashanth Sreenivasa */
4765cabbc6Prashanth Sreenivasavoid
4778671400Serapheim Dimitropoulosfree_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
4785cabbc6Prashanth Sreenivasa{
4795cabbc6Prashanth Sreenivasa	spa_t *spa = vd->vdev_spa;
4805cabbc6Prashanth Sreenivasa	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
4815cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
4828671400Serapheim Dimitropoulos	uint64_t txg = spa_syncing_txg(spa);
4835cabbc6Prashanth Sreenivasa	uint64_t max_offset_yet = 0;
4845cabbc6Prashanth Sreenivasa
4855cabbc6Prashanth Sreenivasa	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
4865cabbc6Prashanth Sreenivasa	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
4875cabbc6Prashanth Sreenivasa	    vdev_indirect_mapping_object(vim));
4883a4b1beMatthew Ahrens	ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
4895cabbc6Prashanth Sreenivasa
4905cabbc6Prashanth Sreenivasa	mutex_enter(&svr->svr_lock);
4915cabbc6Prashanth Sreenivasa
4925cabbc6Prashanth Sreenivasa	/*
4935cabbc6Prashanth Sreenivasa	 * Remove the segment from the removing vdev's spacemap.  This
4945cabbc6Prashanth Sreenivasa	 * ensures that we will not attempt to copy this space (if the
4955cabbc6Prashanth Sreenivasa	 * removal thread has not yet visited it), and also ensures
4965cabbc6Prashanth Sreenivasa	 * that we know what is actually allocated on the new vdevs
4975cabbc6Prashanth Sreenivasa	 * (needed if we cancel the removal).
4985cabbc6Prashanth Sreenivasa	 *
4995cabbc6Prashanth Sreenivasa	 * Note: we must do the metaslab_free_concrete() with the svr_lock
5005cabbc6Prashanth Sreenivasa	 * held, so that the remove_thread can not load this metaslab and then
5015cabbc6Prashanth Sreenivasa	 * visit this offset between the time that we metaslab_free_concrete()
5025cabbc6Prashanth Sreenivasa	 * and when we check to see if it has been visited.
5038671400Serapheim Dimitropoulos	 *
5048671400Serapheim Dimitropoulos	 * Note: The checkpoint flag is set to false as having/taking
5058671400Serapheim Dimitropoulos	 * a checkpoint and removing a device can't happen at the same
5068671400Serapheim Dimitropoulos	 * time.
5075cabbc6Prashanth Sreenivasa	 */
5088671400Serapheim Dimitropoulos	ASSERT(!spa_has_checkpoint(spa));
5098671400Serapheim Dimitropoulos	metaslab_free_concrete(vd, offset, size, B_FALSE);
5105cabbc6Prashanth Sreenivasa
5115cabbc6Prashanth Sreenivasa	uint64_t synced_size = 0;
5125cabbc6Prashanth Sreenivasa	uint64_t synced_offset = 0;
5135cabbc6Prashanth Sreenivasa	uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
5145cabbc6Prashanth Sreenivasa	if (offset < max_offset_synced) {
5155cabbc6Prashanth Sreenivasa		/*
5165cabbc6Prashanth Sreenivasa		 * The mapping for this offset is already on disk.
5175cabbc6Prashanth Sreenivasa		 * Free from the new location.
5185cabbc6Prashanth Sreenivasa		 *
5195cabbc6Prashanth Sreenivasa		 * Note that we use svr_max_synced_offset because it is
5205cabbc6Prashanth Sreenivasa		 * updated atomically with respect to the in-core mapping.
5215cabbc6Prashanth Sreenivasa		 * By contrast, vim_max_offset is not.
5225cabbc6Prashanth Sreenivasa		 *
5235cabbc6Prashanth Sreenivasa		 * This block may be split between a synced entry and an
5245cabbc6Prashanth Sreenivasa		 * in-flight or unvisited entry.  Only process the synced
5255cabbc6Prashanth Sreenivasa		 * portion of it here.
5265cabbc6Prashanth Sreenivasa		 */
5275cabbc6Prashanth Sreenivasa		synced_size = MIN(size, max_offset_synced - offset);
5285cabbc6Prashanth Sreenivasa		synced_offset = offset;
5295cabbc6Prashanth Sreenivasa
5305cabbc6Prashanth Sreenivasa		ASSERT3U(max_offset_yet, <=, max_offset_synced);
5315cabbc6Prashanth Sreenivasa		max_offset_yet = max_offset_synced;
5325cabbc6Prashanth Sreenivasa
5335cabbc6Prashanth Sreenivasa		DTRACE_PROBE3(remove__free__synced,
5345cabbc6Prashanth Sreenivasa		    spa_t *, spa,
5355cabbc6Prashanth Sreenivasa		    uint64_t, offset,
5365cabbc6Prashanth Sreenivasa		    uint64_t, synced_size);
5375cabbc6Prashanth Sreenivasa
5385cabbc6Prashanth Sreenivasa		size -= synced_size;
5395cabbc6Prashanth Sreenivasa		offset += synced_size;
5405cabbc6Prashanth Sreenivasa	}
5415cabbc6Prashanth Sreenivasa
5425cabbc6Prashanth Sreenivasa	/*
5435cabbc6Prashanth Sreenivasa	 * Look at all in-flight txgs starting from the currently syncing one
5445cabbc6Prashanth Sreenivasa	 * and see if a section of this free is being copied. By starting from
5455cabbc6Prashanth Sreenivasa	 * this txg and iterating forward, we might find that this region
5465cabbc6Prashanth Sreenivasa	 * was copied in two different txgs and handle it appropriately.
5475cabbc6Prashanth Sreenivasa	 */
5485cabbc6Prashanth Sreenivasa	for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
5495cabbc6Prashanth Sreenivasa		int txgoff = (txg + i) & TXG_MASK;
5505cabbc6Prashanth Sreenivasa		if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
5515cabbc6Prashanth Sreenivasa			/*
5525cabbc6Prashanth Sreenivasa			 * The mapping for this offset is in flight, and
5535cabbc6Prashanth Sreenivasa			 * will be synced in txg+i.
5545cabbc6Prashanth Sreenivasa			 */
5555cabbc6Prashanth Sreenivasa			uint64_t inflight_size = MIN(size,
5565cabbc6Prashanth Sreenivasa			    svr->svr_max_offset_to_sync[txgoff] - offset);
5575cabbc6Prashanth Sreenivasa
5585cabbc6Prashanth Sreenivasa			DTRACE_PROBE4(remove__free__inflight,
5595cabbc6Prashanth Sreenivasa			    spa_t *, spa,
5605cabbc6Prashanth Sreenivasa			    uint64_t, offset,
5615cabbc6Prashanth Sreenivasa			    uint64_t, inflight_size,
5625cabbc6Prashanth Sreenivasa			    uint64_t, txg + i);
5635cabbc6Prashanth Sreenivasa
5645cabbc6Prashanth Sreenivasa			/*
5655cabbc6Prashanth Sreenivasa			 * We copy data in order of increasing offset.
5665cabbc6Prashanth Sreenivasa			 * Therefore the max_offset_to_sync[] must increase
5675cabbc6Prashanth Sreenivasa			 * (or be zero, indicating that nothing is being
5685cabbc6Prashanth Sreenivasa			 * copied in that txg).
5695cabbc6Prashanth Sreenivasa			 */
5705cabbc6Prashanth Sreenivasa			if (svr->svr_max_offset_to_sync[txgoff] != 0) {
5715cabbc6Prashanth Sreenivasa				ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
5725cabbc6Prashanth Sreenivasa				    >=, max_offset_yet);
5735cabbc6Prashanth Sreenivasa				max_offset_yet =
5745cabbc6Prashanth Sreenivasa				    svr->svr_max_offset_to_sync[txgoff];
5755cabbc6Prashanth Sreenivasa			}
5765cabbc6Prashanth Sreenivasa
5775cabbc6Prashanth Sreenivasa			/*
5785cabbc6Prashanth Sreenivasa			 * We've already committed to copying this segment:
5795cabbc6Prashanth Sreenivasa			 * we have allocated space elsewhere in the pool for
5805cabbc6Prashanth Sreenivasa			 * it and have an IO outstanding to copy the data. We
5815cabbc6Prashanth Sreenivasa			 * cannot free the space before the copy has
5825cabbc6Prashanth Sreenivasa			 * completed, or else the copy IO might overwrite any
5835cabbc6Prashanth Sreenivasa			 * new data. To free that space, we record the
5845cabbc6Prashanth Sreenivasa			 * segment in the appropriate svr_frees tree and free
5855cabbc6Prashanth Sreenivasa			 * the mapped space later, in the txg where we have
5865cabbc6Prashanth Sreenivasa			 * completed the copy and synced the mapping (see
5875cabbc6Prashanth Sreenivasa			 * vdev_mapping_sync).
5885cabbc6Prashanth Sreenivasa			 */
5895cabbc6Prashanth Sreenivasa			range_tree_add(svr->svr_frees[txgoff],
5905cabbc6Prashanth Sreenivasa			    offset, inflight_size);
5915cabbc6Prashanth Sreenivasa			size -= inflight_size;
5925cabbc6Prashanth Sreenivasa			offset += inflight_size;
5935cabbc6Prashanth Sreenivasa
5945cabbc6Prashanth Sreenivasa			/*
5955cabbc6Prashanth Sreenivasa			 * This space is already accounted for as being
5965cabbc6Prashanth Sreenivasa			 * done, because it is being copied in txg+i.
5975cabbc6Prashanth Sreenivasa			 * However, if i!=0, then it is being copied in
5985cabbc6Prashanth Sreenivasa			 * a future txg.  If we crash after this txg
5995cabbc6Prashanth Sreenivasa			 * syncs but before txg+i syncs, then the space
6005cabbc6Prashanth Sreenivasa			 * will be free.  Therefore we must account
6015cabbc6Prashanth Sreenivasa			 * for the space being done in *this* txg
6025cabbc6Prashanth Sreenivasa			 * (when it is freed) rather than the future txg
6035cabbc6Prashanth Sreenivasa			 * (when it will be copied).
6045cabbc6Prashanth Sreenivasa			 */
6055cabbc6Prashanth Sreenivasa			ASSERT3U(svr->svr_bytes_done[txgoff], >=,
6065cabbc6Prashanth Sreenivasa			    inflight_size);
6075cabbc6Prashanth Sreenivasa			svr->svr_bytes_done[txgoff] -= inflight_size;
6085cabbc6Prashanth Sreenivasa			svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
6095cabbc6Prashanth Sreenivasa		}
6105cabbc6Prashanth Sreenivasa	}
6115cabbc6Prashanth Sreenivasa	ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
6125cabbc6Prashanth Sreenivasa
6135cabbc6Prashanth Sreenivasa	if (size > 0) {
6145cabbc6Prashanth Sreenivasa		/*
6155cabbc6Prashanth Sreenivasa		 * The copy thread has not yet visited this offset.  Ensure
6165cabbc6Prashanth Sreenivasa		 * that it doesn't.
6175cabbc6Prashanth Sreenivasa		 */
6185cabbc6Prashanth Sreenivasa
6195cabbc6Prashanth Sreenivasa		DTRACE_PROBE3(remove__free__unvisited,
6205cabbc6Prashanth Sreenivasa		    spa_t *, spa,
6215cabbc6Prashanth Sreenivasa		    uint64_t, offset,
6225cabbc6Prashanth Sreenivasa		    uint64_t, size);
6235cabbc6Prashanth Sreenivasa
6245cabbc6Prashanth Sreenivasa		if (svr->svr_allocd_segs != NULL)
6255cabbc6Prashanth Sreenivasa			range_tree_clear(svr->svr_allocd_segs, offset, size);
6265cabbc6Prashanth Sreenivasa
6275cabbc6Prashanth Sreenivasa		/*
6285cabbc6Prashanth Sreenivasa		 * Since we now do not need to copy this data, for
6295cabbc6Prashanth Sreenivasa		 * accounting purposes we have done our job and can count
6305cabbc6Prashanth Sreenivasa		 * it as completed.
6315cabbc6Prashanth Sreenivasa		 */
6325cabbc6Prashanth Sreenivasa		svr->svr_bytes_done[txg & TXG_MASK] += size;
6335cabbc6Prashanth Sreenivasa	}
6345cabbc6Prashanth Sreenivasa	mutex_exit(&svr->svr_lock);
6355cabbc6Prashanth Sreenivasa
6365cabbc6Prashanth Sreenivasa	/*
6375cabbc6Prashanth Sreenivasa	 * Now that we have dropped svr_lock, process the synced portion
6385cabbc6Prashanth Sreenivasa	 * of this free.
6395cabbc6Prashanth Sreenivasa	 */
6405cabbc6Prashanth Sreenivasa	if (synced_size > 0) {
6418671400Serapheim Dimitropoulos		vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
6428671400Serapheim Dimitropoulos
6435cabbc6Prashanth Sreenivasa		/*
6445cabbc6Prashanth Sreenivasa		 * Note: this can only be called from syncing context,
6455cabbc6Prashanth Sreenivasa		 * and the vdev_indirect_mapping is only changed from the
6465cabbc6Prashanth Sreenivasa		 * sync thread, so we don't need svr_lock while doing
6475cabbc6Prashanth Sreenivasa		 * metaslab_free_impl_cb.
6485cabbc6Prashanth Sreenivasa		 */
6498671400Serapheim Dimitropoulos		boolean_t checkpoint = B_FALSE;
6505cabbc6Prashanth Sreenivasa		vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
6518671400Serapheim Dimitropoulos		    metaslab_free_impl_cb, &checkpoint);
6525cabbc6Prashanth Sreenivasa	}
6535cabbc6Prashanth Sreenivasa}
6545cabbc6Prashanth Sreenivasa
6555cabbc6Prashanth Sreenivasa/*
6565cabbc6Prashanth Sreenivasa * Stop an active removal and update the spa_removing phys.
6575cabbc6Prashanth Sreenivasa */
6585cabbc6Prashanth Sreenivasastatic void
6595cabbc6Prashanth Sreenivasaspa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
6605cabbc6Prashanth Sreenivasa{
6615cabbc6Prashanth Sreenivasa	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
6625cabbc6Prashanth Sreenivasa	ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
6635cabbc6Prashanth Sreenivasa
6645cabbc6Prashanth Sreenivasa	/* Ensure the removal thread has completed before we free the svr. */
6655cabbc6Prashanth Sreenivasa	spa_vdev_remove_suspend(spa);
6665cabbc6Prashanth Sreenivasa
6675cabbc6Prashanth Sreenivasa	ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
6685cabbc6Prashanth Sreenivasa
6695cabbc6Prashanth Sreenivasa	if (state == DSS_FINISHED) {
6705cabbc6Prashanth Sreenivasa		spa_removing_phys_t *srp = &spa->spa_removing_phys;
6713a4b1beMatthew Ahrens		vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);