/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed.  Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA.  Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location.  As a consequence, not all DVAs can be
 * translated to an equivalent new DVA.  Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location.  This function is used in multiple ways:
 *
 *  - i/os to this vdev use the callback to determine where the
 *    data is now located, and issue child i/os for each segment's new
 *    location.
 *
 *  - frees and claims to this vdev use the callback to free or claim
 *    each mapped segment.  (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */
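
/*
 * Illustrative sketch (not part of the build): a remap callback simply
 * receives each contiguous segment of the new location.  Assuming the
 * callback signature used by vdev_op_remap(), a hypothetical caller
 * that counts how many segments a DVA maps to might look like:
 *
 *	static void
 *	count_segments_cb(uint64_t split_offset, vdev_t *vd,
 *	    uint64_t offset, uint64_t size, void *arg)
 *	{
 *		uint64_t *nsegs = arg;
 *		(*nsegs)++;
 *	}
 *
 *	uint64_t nsegs = 0;
 *	vd->vdev_ops->vdev_op_remap(vd, offset, asize,
 *	    count_segments_cb, &nsegs);
 *
 * A DVA whose mapping was split would yield nsegs > 1.
 */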

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
 *    in the pool that need to be marked obsolete.  When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *   references; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */
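
/*
 * Hedged sketch of the "freeing a block" case above (not part of the
 * build; the real call chain runs through zio_free() and friends):
 * when a freed DVA lands on an indirect vdev, the free reduces to
 * marking the range obsolete rather than returning it to a metaslab:
 *
 *	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 *	if (vd->vdev_ops == &vdev_indirect_ops) {
 *		vdev_indirect_mark_obsolete(vd,
 *		    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva));
 *	}
 */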

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts.  Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */
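
/*
 * Rough shape of the restart path (illustrative only; the real logic
 * is in spa_condense_indirect_thread() below):
 *
 *	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
 *	vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
 *	    counts, prev_obsolete_sm);
 *	start_index = vdev_indirect_mapping_num_entries(new_mapping);
 *	spa_condense_indirect_generate_new_mapping(vd, counts,
 *	    start_index, zthr);
 */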

boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete.  With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses (since 0.75^16 is approximately 0.01).  Higher values
 * will condense less often (causing less i/o); lower values will
 * reduce the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically).  This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory.  The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly).  If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;

/*
 * If an indirect split block contains more than this many possible unique
 * combinations when being reconstructed, consider it too computationally
 * expensive to check them all. Instead, try at most 100 randomly-selected
 * combinations each time the block is accessed.  This allows all segment
 * copies to participate fairly in the reconstruction when all combinations
 * cannot be checked and prevents repeated use of one bad copy.
 */
int zfs_reconstruct_indirect_combinations_max = 256;
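
/*
 * Worked example (numbers for illustration only): a block split into
 * 10 segments, each with 2 unique copies, has 2^10 = 1024 possible
 * combinations.  Since 1024 > 256, we would not check them all;
 * instead each access tries only a bounded number of randomly-selected
 * combinations (100, per the note above).
 */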

/*
 * Enable to simulate damaged segments and validate reconstruction.
 * Used by ztest.
 */
unsigned long zfs_reconstruct_indirect_damage_fraction = 0;

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	abd_t *ic_data;
	vdev_t *ic_vdev;

	/*
	 * ic_duplicate is NULL when the ic_data contents are unique; when
	 * it is determined to be a duplicate, it references the primary
	 * child.
	 */
	struct indirect_child *ic_duplicate;
	list_node_t ic_node; /* node on is_unique_child */
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size;
	int is_children; /* number of entries in is_child[] */
	int is_unique_children; /* number of entries in is_unique_child */
	list_t is_unique_child;

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	indirect_child_t *is_good_child;

	indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;

/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;
	uint64_t iv_unique_combinations;
	uint64_t iv_attempts;
	uint64_t iv_attempts_max;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;
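
/*
 * Hedged sketch (not part of the build): iv_unique_combinations is the
 * product, across splits, of each split's unique-child count, computed
 * along these lines:
 *
 *	uint64_t combos = 1;
 *	for (indirect_split_t *is = list_head(&iv->iv_splits);
 *	    is != NULL; is = list_next(&iv->iv_splits, is))
 *		combos *= is->is_unique_children;
 *	iv->iv_unique_combinations = combos;
 */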

static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;

	indirect_split_t *is;
	while ((is = list_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			if (ic->ic_data != NULL)
				abd_free(ic->ic_data);
		}
		list_remove(&iv->iv_splits, is);

		indirect_child_t *ic;
		while ((ic = list_head(&is->is_unique_child)) != NULL)
			list_remove(&is->is_unique_child, ic);

		list_destroy(&is->is_unique_child);

		kmem_free(is,
		    offsetof(indirect_split_t, is_child[is->is_children]));
	}
	kmem_free(iv, sizeof (*iv));
}

static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
	vdev_indirect_map_free,
	zio_vsd_default_cksum_report
};
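
/*
 * Hedged sketch of how the vsd hooks up to a zio (not part of the
 * build; the real wiring lives in vdev_indirect_io_start()): the i/o
 * allocates an indirect_vsd_t and registers the ops above so the
 * splits are freed when the zio completes:
 *
 *	indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
 *	list_create(&iv->iv_splits, sizeof (indirect_split_t),
 *	    offsetof(indirect_split_t, is_node));
 *	zio->io_vsd = iv;
 *	zio->io_vsd_ops = &vdev_indirect_vsd_ops;
 */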

/*
 * Mark the given offset and size as being obsolete.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
	}
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size);
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}
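
/*
 * Worked example for the first check above (numbers for illustration
 * only): with 4GB mapped and 1.2GB obsolete, 1.2/4 = 30% >=
 * zfs_indirect_condense_obsolete_pct (25%), so as long as the in-core
 * mapping exceeds zfs_condense_min_mapping_bytes (128KB) we would
 * condense.
 */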

/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping.  The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
}

/* ARGSUSED */
static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;

	return (spa->spa_condensing_indirect != NULL);
}
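
/*
 * Hedged sketch (the actual registration lives outside this file, in
 * pool-load code): the check/thread pair is wired up as a zthr along
 * the lines of:
 *
 *	spa->spa_condense_zthr = zthr_create(
 *	    spa_condense_indirect_thread_check,
 *	    spa_condense_indirect_thread, spa);
 *
 * so the condense work resumes automatically whenever the check
 * function returns B_TRUE.
 */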

/* ARGSUSED */
static void
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);