/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed.  Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from the old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA.  Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location.  As a consequence, not all DVAs can be
 * translated to an equivalent new DVA.  Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location.  This function is used in multiple ways:
 *
 *  - i/os to this vdev use the callback to determine where the
 *    data is now located, and issue child i/os for each segment's new
 *    location.
 *
 *  - frees and claims to this vdev use the callback to free or claim
 *    each mapped segment.  (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
 *    in the pool that need to be marked obsolete.  When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *   references; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts.  Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */

1755cabbc6Prashanth Sreenivasaboolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
1765cabbc6Prashanth Sreenivasa
/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete.  With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses.  Higher values will condense less often (causing less
 * i/o); lower values will reduce the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically).  This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 *
 * The product is computed in 64-bit arithmetic so that a larger default
 * cannot silently overflow "int".
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024ULL * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory.  The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly).  If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;

/*
 * If an indirect split block contains more than this many possible unique
 * combinations when being reconstructed, consider it too computationally
 * expensive to check them all. Instead, try at most 100 randomly-selected
 * combinations each time the block is accessed.  This allows all segment
 * copies to participate fairly in the reconstruction when all combinations
 * cannot be checked and prevents repeated use of one bad copy.
 *
 * NOTE(review): the "at most 100" figure above does not match the default
 * of 256 below; confirm against the reconstruction code which limit is
 * actually applied to the number of random attempts.
 */
int zfs_reconstruct_indirect_combinations_max = 256;

/*
 * Enable to simulate damaged segments and validate reconstruction.
 * Used by ztest.
 */
unsigned long zfs_reconstruct_indirect_damage_fraction = 0;

2263a4b1beMatthew Ahrens/*
2273a4b1beMatthew Ahrens * The indirect_child_t represents the vdev that we will read from, when we
2283a4b1beMatthew Ahrens * need to read all copies of the data (e.g. for scrub or reconstruction).
2293a4b1beMatthew Ahrens * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
2303a4b1beMatthew Ahrens * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
2313a4b1beMatthew Ahrens * ic_vdev is a child of the mirror.
2323a4b1beMatthew Ahrens */
2333a4b1beMatthew Ahrenstypedef struct indirect_child {
2343a4b1beMatthew Ahrens	abd_t *ic_data;
2353a4b1beMatthew Ahrens	vdev_t *ic_vdev;
236a21fe34Brian Behlendorf
237a21fe34Brian Behlendorf	/*
238a21fe34Brian Behlendorf	 * ic_duplicate is NULL when the ic_data contents are unique, when it
239a21fe34Brian Behlendorf	 * is determined to be a duplicate it references the primary child.
240a21fe34Brian Behlendorf	 */
241a21fe34Brian Behlendorf	struct indirect_child *ic_duplicate;
242a21fe34Brian Behlendorf	list_node_t ic_node; /* node on is_unique_child */
2433a4b1beMatthew Ahrens} indirect_child_t;
2443a4b1beMatthew Ahrens
2453a4b1beMatthew Ahrens/*
2463a4b1beMatthew Ahrens * The indirect_split_t represents one mapped segment of an i/o to the
2473a4b1beMatthew Ahrens * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
2483a4b1beMatthew Ahrens * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
2493a4b1beMatthew Ahrens * For split blocks, there will be several of these.
2503a4b1beMatthew Ahrens */
2513a4b1beMatthew Ahrenstypedef struct indirect_split {
2523a4b1beMatthew Ahrens	list_node_t is_node; /* link on iv_splits */
2533a4b1beMatthew Ahrens
2543a4b1beMatthew Ahrens	/*
2553a4b1beMatthew Ahrens	 * is_split_offset is the offset into the i/o.
2563a4b1beMatthew Ahrens	 * This is the sum of the previous splits' is_size's.
2573a4b1beMatthew Ahrens	 */
2583a4b1beMatthew Ahrens	uint64_t is_split_offset;
2593a4b1beMatthew Ahrens
2603a4b1beMatthew Ahrens	vdev_t *is_vdev; /* top-level vdev */
2613a4b1beMatthew Ahrens	uint64_t is_target_offset; /* offset on is_vdev */
2623a4b1beMatthew Ahrens	uint64_t is_size;
2633a4b1beMatthew Ahrens	int is_children; /* number of entries in is_child[] */
264a21fe34Brian Behlendorf	int is_unique_children; /* number of entries in is_unique_child */
265a21fe34Brian Behlendorf	list_t is_unique_child;
2663a4b1beMatthew Ahrens
2673a4b1beMatthew Ahrens	/*
2683a4b1beMatthew Ahrens	 * is_good_child is the child that we are currently using to
2693a4b1beMatthew Ahrens	 * attempt reconstruction.
2703a4b1beMatthew Ahrens	 */
271a21fe34Brian Behlendorf	indirect_child_t *is_good_child;
2723a4b1beMatthew Ahrens
2733a4b1beMatthew Ahrens	indirect_child_t is_child[1]; /* variable-length */
2743a4b1beMatthew Ahrens} indirect_split_t;
2753a4b1beMatthew Ahrens
2763a4b1beMatthew Ahrens/*
2773a4b1beMatthew Ahrens * The indirect_vsd_t is associated with each i/o to the indirect vdev.
2783a4b1beMatthew Ahrens * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
2793a4b1beMatthew Ahrens */
2803a4b1beMatthew Ahrenstypedef struct indirect_vsd {
2813a4b1beMatthew Ahrens	boolean_t iv_split_block;
2823a4b1beMatthew Ahrens	boolean_t iv_reconstruct;
283a21fe34Brian Behlendorf	uint64_t iv_unique_combinations;
284a21fe34Brian Behlendorf	uint64_t iv_attempts;
285a21fe34Brian Behlendorf	uint64_t iv_attempts_max;
2863a4b1beMatthew Ahrens
2873a4b1beMatthew Ahrens	list_t iv_splits; /* list of indirect_split_t's */
2883a4b1beMatthew Ahrens} indirect_vsd_t;
2893a4b1beMatthew Ahrens
2903a4b1beMatthew Ahrensstatic void
2913a4b1beMatthew Ahrensvdev_indirect_map_free(zio_t *zio)
2923a4b1beMatthew Ahrens{
2933a4b1beMatthew Ahrens	indirect_vsd_t *iv = zio->io_vsd;
2943a4b1beMatthew Ahrens
2953a4b1beMatthew Ahrens	indirect_split_t *is;
2963a4b1beMatthew Ahrens	while ((is = list_head(&iv->iv_splits)) != NULL) {
2973a4b1beMatthew Ahrens		for (int c = 0; c < is->is_children; c++) {
2983a4b1beMatthew Ahrens			indirect_child_t *ic = &is->is_child[c];
2993a4b1beMatthew Ahrens			if (ic->ic_data != NULL)
3003a4b1beMatthew Ahrens				abd_free(ic->ic_data);
3013a4b1beMatthew Ahrens		}
3023a4b1beMatthew Ahrens		list_remove(&iv->iv_splits, is);
303a21fe34Brian Behlendorf
304a21fe34Brian Behlendorf		indirect_child_t *ic;
305a21fe34Brian Behlendorf		while ((ic = list_head(&is->is_unique_child)) != NULL)
306a21fe34Brian Behlendorf			list_remove(&is->is_unique_child, ic);
307a21fe34Brian Behlendorf
308a21fe34Brian Behlendorf		list_destroy(&is->is_unique_child);
309a21fe34Brian Behlendorf
3103a4b1beMatthew Ahrens		kmem_free(is,
3113a4b1beMatthew Ahrens		    offsetof(indirect_split_t, is_child[is->is_children]));
3123a4b1beMatthew Ahrens	}
3133a4b1beMatthew Ahrens	kmem_free(iv, sizeof (*iv));
3143a4b1beMatthew Ahrens}
3153a4b1beMatthew Ahrens
3163a4b1beMatthew Ahrensstatic const zio_vsd_ops_t vdev_indirect_vsd_ops = {
3173a4b1beMatthew Ahrens	vdev_indirect_map_free,
3183a4b1beMatthew Ahrens	zio_vsd_default_cksum_report
3193a4b1beMatthew Ahrens};
3203a4b1beMatthew Ahrens/*
3218671400Serapheim Dimitropoulos * Mark the given offset and size as being obsolete.
3225cabbc6Prashanth Sreenivasa */
3235cabbc6Prashanth Sreenivasavoid
3248671400Serapheim Dimitropoulosvdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
3255cabbc6Prashanth Sreenivasa{
3265cabbc6Prashanth Sreenivasa	spa_t *spa = vd->vdev_spa;
3278671400Serapheim Dimitropoulos
3285cabbc6Prashanth Sreenivasa	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
3295cabbc6Prashanth Sreenivasa	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
3305cabbc6Prashanth Sreenivasa	ASSERT(size > 0);
3315cabbc6Prashanth Sreenivasa	VERIFY(vdev_indirect_mapping_entry_for_offset(
3325cabbc6Prashanth Sreenivasa	    vd->vdev_indirect_mapping, offset) != NULL);
3335cabbc6Prashanth Sreenivasa
3345cabbc6Prashanth Sreenivasa	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
3355cabbc6Prashanth Sreenivasa		mutex_enter(&vd->vdev_obsolete_lock);
3365cabbc6Prashanth Sreenivasa		range_tree_add(vd->vdev_obsolete_segments, offset, size);
3375cabbc6Prashanth Sreenivasa		mutex_exit(&vd->vdev_obsolete_lock);
3388671400Serapheim Dimitropoulos		vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
3395cabbc6Prashanth Sreenivasa	}
3405cabbc6Prashanth Sreenivasa}
3415cabbc6Prashanth Sreenivasa
3425cabbc6Prashanth Sreenivasa/*
3435cabbc6Prashanth Sreenivasa * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
3445cabbc6Prashanth Sreenivasa * wrapper is provided because the DMU does not know about vdev_t's and
3455cabbc6Prashanth Sreenivasa * cannot directly call vdev_indirect_mark_obsolete.
3465cabbc6Prashanth Sreenivasa */
3475cabbc6Prashanth Sreenivasavoid
3485cabbc6Prashanth Sreenivasaspa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
3495cabbc6Prashanth Sreenivasa    uint64_t size, dmu_tx_t *tx)
3505cabbc6Prashanth Sreenivasa{
3515cabbc6Prashanth Sreenivasa	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
3525cabbc6Prashanth Sreenivasa	ASSERT(dmu_tx_is_syncing(tx));
3535cabbc6Prashanth Sreenivasa
3545cabbc6Prashanth Sreenivasa	/* The DMU can only remap indirect vdevs. */
3555cabbc6Prashanth Sreenivasa	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
3568671400Serapheim Dimitropoulos	vdev_indirect_mark_obsolete(vd, offset, size);
3575cabbc6Prashanth Sreenivasa}
3585cabbc6Prashanth Sreenivasa
3595cabbc6Prashanth Sreenivasastatic spa_condensing_indirect_t *
3605cabbc6Prashanth Sreenivasaspa_condensing_indirect_create(spa_t *spa)
3615cabbc6Prashanth Sreenivasa{
3625cabbc6Prashanth Sreenivasa	spa_condensing_indirect_phys_t *scip =
3635cabbc6Prashanth Sreenivasa	    &spa->spa_condensing_indirect_phys;
3645cabbc6Prashanth Sreenivasa	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
3655cabbc6Prashanth Sreenivasa	objset_t *mos = spa->spa_meta_objset;
3665cabbc6Prashanth Sreenivasa
3675cabbc6Prashanth Sreenivasa	for (int i = 0; i < TXG_SIZE; i++) {
3685cabbc6Prashanth Sreenivasa		list_create(&sci->sci_new_mapping_entries[i],
3695cabbc6Prashanth Sreenivasa		    sizeof (vdev_indirect_mapping_entry_t),
3705cabbc6Prashanth Sreenivasa		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
3715cabbc6Prashanth Sreenivasa	}
3725cabbc6Prashanth Sreenivasa
3735cabbc6Prashanth Sreenivasa	sci->sci_new_mapping =
3745cabbc6Prashanth Sreenivasa	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
3755cabbc6Prashanth Sreenivasa
3765cabbc6Prashanth Sreenivasa	return (sci);
3775cabbc6Prashanth Sreenivasa}
3785cabbc6Prashanth Sreenivasa
3795cabbc6Prashanth Sreenivasastatic void
3805cabbc6Prashanth Sreenivasaspa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
3815cabbc6Prashanth Sreenivasa{
3825cabbc6Prashanth Sreenivasa	for (int i = 0; i < TXG_SIZE; i++)
3835cabbc6Prashanth Sreenivasa		list_destroy(&sci->sci_new_mapping_entries[i]);
3845cabbc6Prashanth Sreenivasa
3855cabbc6Prashanth Sreenivasa	if (sci->sci_new_mapping != NULL)
3865cabbc6Prashanth Sreenivasa		vdev_indirect_mapping_close(sci->sci_new_mapping);
3875cabbc6Prashanth Sreenivasa
3885cabbc6Prashanth Sreenivasa	kmem_free(sci, sizeof (*sci));
3895cabbc6Prashanth Sreenivasa}
3905cabbc6Prashanth Sreenivasa
3915cabbc6Prashanth Sreenivasaboolean_t
3925cabbc6Prashanth Sreenivasavdev_indirect_should_condense(vdev_t *vd)
3935cabbc6Prashanth Sreenivasa{
3945cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3955cabbc6Prashanth Sreenivasa	spa_t *spa = vd->vdev_spa;
3965cabbc6Prashanth Sreenivasa
3975cabbc6Prashanth Sreenivasa	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
3985cabbc6Prashanth Sreenivasa
3995cabbc6Prashanth Sreenivasa	if (!zfs_condense_indirect_vdevs_enable)
4005cabbc6Prashanth Sreenivasa		return (B_FALSE);
4015cabbc6Prashanth Sreenivasa
4025cabbc6Prashanth Sreenivasa	/*
4035cabbc6Prashanth Sreenivasa	 * We can only condense one indirect vdev at a time.
4045cabbc6Prashanth Sreenivasa	 */
4055cabbc6Prashanth Sreenivasa	if (spa->spa_condensing_indirect != NULL)
4065cabbc6Prashanth Sreenivasa		return (B_FALSE);
4075cabbc6Prashanth Sreenivasa
4085cabbc6Prashanth Sreenivasa	if (spa_shutting_down(spa))
4095cabbc6Prashanth Sreenivasa		return (B_FALSE);
4105cabbc6Prashanth Sreenivasa
4115cabbc6Prashanth Sreenivasa	/*
4125cabbc6Prashanth Sreenivasa	 * The mapping object size must not change while we are
4135cabbc6Prashanth Sreenivasa	 * condensing, so we can only condense indirect vdevs
4145cabbc6Prashanth Sreenivasa	 * (not vdevs that are still in the middle of being removed).
4155cabbc6Prashanth Sreenivasa	 */
4165cabbc6Prashanth Sreenivasa	if (vd->vdev_ops != &vdev_indirect_ops)
4175cabbc6Prashanth Sreenivasa		return (B_FALSE);
4185cabbc6Prashanth Sreenivasa
4195cabbc6Prashanth Sreenivasa	/*
4205cabbc6Prashanth Sreenivasa	 * If nothing new has been marked obsolete, there is no
4215cabbc6Prashanth Sreenivasa	 * point in condensing.
4225cabbc6Prashanth Sreenivasa	 */
4235cabbc6Prashanth Sreenivasa	if (vd->vdev_obsolete_sm == NULL) {
4245cabbc6Prashanth Sreenivasa		ASSERT0(vdev_obsolete_sm_object(vd));
4255cabbc6Prashanth Sreenivasa		return (B_FALSE);
4265cabbc6Prashanth Sreenivasa	}
4275cabbc6Prashanth Sreenivasa
4285cabbc6Prashanth Sreenivasa	ASSERT(vd->vdev_obsolete_sm != NULL);
4295cabbc6Prashanth Sreenivasa
4305cabbc6Prashanth Sreenivasa	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
4315cabbc6Prashanth Sreenivasa	    space_map_object(vd->vdev_obsolete_sm));
4325cabbc6Prashanth Sreenivasa
4335cabbc6Prashanth Sreenivasa	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
4345cabbc6Prashanth Sreenivasa	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
4355cabbc6Prashanth Sreenivasa	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
4365cabbc6Prashanth Sreenivasa	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
4375cabbc6Prashanth Sreenivasa
4385cabbc6Prashanth Sreenivasa	ASSERT3U(bytes_obsolete, <=, bytes_mapped);
4395cabbc6Prashanth Sreenivasa
4405cabbc6Prashanth Sreenivasa	/*
4415cabbc6Prashanth Sreenivasa	 * If a high percentage of the bytes that are mapped have become
4425cabbc6Prashanth Sreenivasa	 * obsolete, condense (unless the mapping is already small enough).
4435cabbc6Prashanth Sreenivasa	 * This has a good chance of reducing the amount of memory used
4445cabbc6Prashanth Sreenivasa	 * by the mapping.
4455cabbc6Prashanth Sreenivasa	 */
4465cabbc6Prashanth Sreenivasa	if (bytes_obsolete * 100 / bytes_mapped >=
4475cabbc6Prashanth Sreenivasa	    zfs_indirect_condense_obsolete_pct &&
4485cabbc6Prashanth Sreenivasa	    mapping_size > zfs_condense_min_mapping_bytes) {
4495cabbc6Prashanth Sreenivasa		zfs_dbgmsg("should condense vdev %llu because obsolete "
4505cabbc6Prashanth Sreenivasa		    "spacemap covers %d%% of %lluMB mapping",
4515cabbc6Prashanth Sreenivasa		    (u_longlong_t)vd->vdev_id,
4525cabbc6Prashanth Sreenivasa		    (int)(bytes_obsolete * 100 / bytes_mapped),
4535cabbc6Prashanth Sreenivasa		    (u_longlong_t)bytes_mapped / 1024 / 1024);
4545cabbc6Prashanth Sreenivasa		return (B_TRUE);
4555cabbc6Prashanth Sreenivasa	}
4565cabbc6Prashanth Sreenivasa
4575cabbc6Prashanth Sreenivasa	/*
4585cabbc6Prashanth Sreenivasa	 * If the obsolete space map takes up too much space on disk,
4595cabbc6Prashanth Sreenivasa	 * condense in order to free up this disk space.
4605cabbc6Prashanth Sreenivasa	 */
4615cabbc6Prashanth Sreenivasa	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
4625cabbc6Prashanth Sreenivasa		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
4635cabbc6Prashanth Sreenivasa		    "length %lluMB >= max size %lluMB",
4645cabbc6Prashanth Sreenivasa		    (u_longlong_t)vd->vdev_id,
4655cabbc6Prashanth Sreenivasa		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
4665cabbc6Prashanth Sreenivasa		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
4675cabbc6Prashanth Sreenivasa		    1024 / 1024);
4685cabbc6Prashanth Sreenivasa		return (B_TRUE);
4695cabbc6Prashanth Sreenivasa	}
4705cabbc6Prashanth Sreenivasa
4715cabbc6Prashanth Sreenivasa	return (B_FALSE);
4725cabbc6Prashanth Sreenivasa}
4735cabbc6Prashanth Sreenivasa
4745cabbc6Prashanth Sreenivasa/*
4755cabbc6Prashanth Sreenivasa * This sync task completes (finishes) a condense, deleting the old
4765cabbc6Prashanth Sreenivasa * mapping and replacing it with the new one.
4775cabbc6Prashanth Sreenivasa */
4785cabbc6Prashanth Sreenivasastatic void
4795cabbc6Prashanth Sreenivasaspa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
4805cabbc6Prashanth Sreenivasa{
4815cabbc6Prashanth Sreenivasa	spa_condensing_indirect_t *sci = arg;
4825cabbc6Prashanth Sreenivasa	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4835cabbc6Prashanth Sreenivasa	spa_condensing_indirect_phys_t *scip =
4845cabbc6Prashanth Sreenivasa	    &spa->spa_condensing_indirect_phys;
4855cabbc6Prashanth Sreenivasa	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
4865cabbc6Prashanth Sreenivasa	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
4875cabbc6Prashanth Sreenivasa	objset_t *mos = spa->spa_meta_objset;
4885cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
4895cabbc6Prashanth Sreenivasa	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
4905cabbc6Prashanth Sreenivasa	uint64_t new_count =
4915cabbc6Prashanth Sreenivasa	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
4925cabbc6Prashanth Sreenivasa
4935cabbc6Prashanth Sreenivasa	ASSERT(dmu_tx_is_syncing(tx));
4945cabbc6Prashanth Sreenivasa	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
4955cabbc6Prashanth Sreenivasa	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
4965cabbc6Prashanth Sreenivasa	for (int i = 0; i < TXG_SIZE; i++) {
4975cabbc6Prashanth Sreenivasa		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
4985cabbc6Prashanth Sreenivasa	}
4995cabbc6Prashanth Sreenivasa	ASSERT(vic->vic_mapping_object != 0);
5005cabbc6Prashanth Sreenivasa	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
5015cabbc6Prashanth Sreenivasa	ASSERT(scip->scip_next_mapping_object != 0);
5025cabbc6Prashanth Sreenivasa	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
5035cabbc6Prashanth Sreenivasa
5045cabbc6Prashanth Sreenivasa	/*
5055cabbc6Prashanth Sreenivasa	 * Reset vdev_indirect_mapping to refer to the new object.
5065cabbc6Prashanth Sreenivasa	 */
5075cabbc6Prashanth Sreenivasa	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
5085cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
5095cabbc6Prashanth Sreenivasa	vd->vdev_indirect_mapping = sci->sci_new_mapping;
5105cabbc6Prashanth Sreenivasa	rw_exit(&vd->vdev_indirect_rwlock);
5115cabbc6Prashanth Sreenivasa
5125cabbc6Prashanth Sreenivasa	sci->sci_new_mapping = NULL;
5135cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
5145cabbc6Prashanth Sreenivasa	vic->vic_mapping_object = scip->scip_next_mapping_object;
5155cabbc6Prashanth Sreenivasa	scip->scip_next_mapping_object = 0;
5165cabbc6Prashanth Sreenivasa
5175cabbc6Prashanth Sreenivasa	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
5185cabbc6Prashanth Sreenivasa	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
5195cabbc6Prashanth Sreenivasa	scip->scip_prev_obsolete_sm_object = 0;
5205cabbc6Prashanth Sreenivasa
5215cabbc6Prashanth Sreenivasa	scip->scip_vdev = 0;
5225cabbc6Prashanth Sreenivasa
5235cabbc6Prashanth Sreenivasa	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
5245cabbc6Prashanth Sreenivasa	    DMU_POOL_CONDENSING_INDIRECT, tx));
5255cabbc6Prashanth Sreenivasa	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
5265cabbc6Prashanth Sreenivasa	spa->spa_condensing_indirect = NULL;
5275cabbc6Prashanth Sreenivasa
5285cabbc6Prashanth Sreenivasa	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
5295cabbc6Prashanth Sreenivasa	    "new mapping object %llu has %llu entries "
5305cabbc6Prashanth Sreenivasa	    "(was %llu entries)",
5315cabbc6Prashanth Sreenivasa	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
5325cabbc6Prashanth Sreenivasa	    new_count, old_count);
5335cabbc6Prashanth Sreenivasa
5345cabbc6Prashanth Sreenivasa	vdev_config_dirty(spa->spa_root_vdev);
5355cabbc6Prashanth Sreenivasa}
5365cabbc6Prashanth Sreenivasa
5375cabbc6Prashanth Sreenivasa/*
5385cabbc6Prashanth Sreenivasa * This sync task appends entries to the new mapping object.
5395cabbc6Prashanth Sreenivasa */
5405cabbc6Prashanth Sreenivasastatic void
5415cabbc6Prashanth Sreenivasaspa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
5425cabbc6Prashanth Sreenivasa{
5435cabbc6Prashanth Sreenivasa	spa_condensing_indirect_t *sci = arg;
5445cabbc6Prashanth Sreenivasa	uint64_t txg = dmu_tx_get_txg(tx);
5455cabbc6Prashanth Sreenivasa	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5465cabbc6Prashanth Sreenivasa
5475cabbc6Prashanth Sreenivasa	ASSERT(dmu_tx_is_syncing(tx));
5485cabbc6Prashanth Sreenivasa	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
5495cabbc6Prashanth Sreenivasa
5505cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
5515cabbc6Prashanth Sreenivasa	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
5525cabbc6Prashanth Sreenivasa	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
5535cabbc6Prashanth Sreenivasa}
5545cabbc6Prashanth Sreenivasa
5555cabbc6Prashanth Sreenivasa/*
5565cabbc6Prashanth Sreenivasa * Open-context function to add one entry to the new mapping.  The new
5575cabbc6Prashanth Sreenivasa * entry will be remembered and written from syncing context.
5585cabbc6Prashanth Sreenivasa */
5595cabbc6Prashanth Sreenivasastatic void
5605cabbc6Prashanth Sreenivasaspa_condense_indirect_commit_entry(spa_t *spa,
5615cabbc6Prashanth Sreenivasa    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
5625cabbc6Prashanth Sreenivasa{
5635cabbc6Prashanth Sreenivasa	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
5645cabbc6Prashanth Sreenivasa
5655cabbc6Prashanth Sreenivasa	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
5665cabbc6Prashanth Sreenivasa
5675cabbc6Prashanth Sreenivasa	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5685cabbc6Prashanth Sreenivasa	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
5695cabbc6Prashanth Sreenivasa	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
5705cabbc6Prashanth Sreenivasa	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
5715cabbc6Prashanth Sreenivasa
5725cabbc6Prashanth Sreenivasa	/*
5735cabbc6Prashanth Sreenivasa	 * If we are the first entry committed this txg, kick off the sync
5745cabbc6Prashanth Sreenivasa	 * task to write to the MOS on our behalf.
5755cabbc6Prashanth Sreenivasa	 */
5765cabbc6Prashanth Sreenivasa	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
5775cabbc6Prashanth Sreenivasa		dsl_sync_task_nowait(dmu_tx_pool(tx),
5785cabbc6Prashanth Sreenivasa		    spa_condense_indirect_commit_sync, sci,
5795cabbc6Prashanth Sreenivasa		    0, ZFS_SPACE_CHECK_NONE, tx);
5805cabbc6Prashanth Sreenivasa	}
5815cabbc6Prashanth Sreenivasa
5825cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_entry_t *vime =
5835cabbc6Prashanth Sreenivasa	    kmem_alloc(sizeof (*vime), KM_SLEEP);
5845cabbc6Prashanth Sreenivasa	vime->vime_mapping = *vimep;
5855cabbc6Prashanth Sreenivasa	vime->vime_obsolete_count = count;
5865cabbc6Prashanth Sreenivasa	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
5875cabbc6Prashanth Sreenivasa
5885cabbc6Prashanth Sreenivasa	dmu_tx_commit(tx);
5895cabbc6Prashanth Sreenivasa}
5905cabbc6Prashanth Sreenivasa
5915cabbc6Prashanth Sreenivasastatic void
5925cabbc6Prashanth Sreenivasaspa_condense_indirect_generate_new_mapping(vdev_t *vd,
593667ec66Serapheim Dimitropoulos    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
5945cabbc6Prashanth Sreenivasa{
5955cabbc6Prashanth Sreenivasa	spa_t *spa = vd->vdev_spa;
5965cabbc6Prashanth Sreenivasa	uint64_t mapi = start_index;
5975cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
5985cabbc6Prashanth Sreenivasa	uint64_t old_num_entries =
5995cabbc6Prashanth Sreenivasa	    vdev_indirect_mapping_num_entries(old_mapping);
6005cabbc6Prashanth Sreenivasa
6015cabbc6Prashanth Sreenivasa	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
6025cabbc6Prashanth Sreenivasa	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
6035cabbc6Prashanth Sreenivasa
6045cabbc6Prashanth Sreenivasa	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
6055cabbc6Prashanth Sreenivasa	    (u_longlong_t)vd->vdev_id,
6065cabbc6Prashanth Sreenivasa	    (u_longlong_t)mapi);
6075cabbc6Prashanth Sreenivasa
608667ec66Serapheim Dimitropoulos	while (mapi < old_num_entries) {
609667ec66Serapheim Dimitropoulos
610667ec66Serapheim Dimitropoulos		if (zthr_iscancelled(zthr)) {
611667ec66Serapheim Dimitropoulos			zfs_dbgmsg("pausing condense of vdev %llu "
612667ec66Serapheim Dimitropoulos			    "at index %llu", (u_longlong_t)vd->vdev_id,
613667ec66Serapheim Dimitropoulos			    (u_longlong_t)mapi);
614667ec66Serapheim Dimitropoulos			break;
615667ec66Serapheim Dimitropoulos		}
616667ec66Serapheim Dimitropoulos
6175cabbc6Prashanth Sreenivasa		vdev_indirect_mapping_entry_phys_t *entry =
6185cabbc6Prashanth Sreenivasa		    &old_mapping->vim_entries[mapi];
6195cabbc6Prashanth Sreenivasa		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
6205cabbc6Prashanth Sreenivasa		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
6215cabbc6Prashanth Sreenivasa		if (obsolete_counts[mapi] < entry_size) {
6225cabbc6Prashanth Sreenivasa			spa_condense_indirect_commit_entry(spa, entry,
6235cabbc6Prashanth Sreenivasa			    obsolete_counts[mapi]);
6245cabbc6Prashanth Sreenivasa
6255cabbc6Prashanth Sreenivasa			/*
6265cabbc6Prashanth Sreenivasa			 * This delay may be requested for testing, debugging,
6275cabbc6Prashanth Sreenivasa			 * or performance reasons.
6285cabbc6Prashanth Sreenivasa			 */
6295cabbc6Prashanth Sreenivasa			delay(zfs_condense_indirect_commit_entry_delay_ticks);
6305cabbc6Prashanth Sreenivasa		}
6315cabbc6Prashanth Sreenivasa
6325cabbc6Prashanth Sreenivasa		mapi++;
6335cabbc6Prashanth Sreenivasa	}
6345cabbc6Prashanth Sreenivasa}
6355cabbc6Prashanth Sreenivasa
636667ec66Serapheim Dimitropoulos/* ARGSUSED */
637667ec66Serapheim Dimitropoulosstatic boolean_t
638667ec66Serapheim Dimitropoulosspa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
6395cabbc6Prashanth Sreenivasa{
640667ec66Serapheim Dimitropoulos	spa_t *spa = arg;
641667ec66Serapheim Dimitropoulos
642667ec66Serapheim Dimitropoulos	return (spa->spa_condensing_indirect != NULL);
643667ec66Serapheim Dimitropoulos}
644667ec66Serapheim Dimitropoulos
645667ec66Serapheim Dimitropoulos/* ARGSUSED */
6466a316e1Serapheim Dimitropoulosstatic void
647667ec66Serapheim Dimitropoulosspa_condense_indirect_thread(void *arg, zthr_t *zthr)
648667ec66Serapheim Dimitropoulos{
649667ec66Serapheim Dimitropoulos	spa_t *spa = arg;
650667ec66Serapheim Dimitropoulos	vdev_t *vd;
651667ec66Serapheim Dimitropoulos
652667ec66Serapheim Dimitropoulos	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
653667ec66Serapheim Dimitropoulos	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
654667ec66Serapheim Dimitropoulos	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
655667ec66Serapheim Dimitropoulos	ASSERT3P(vd, !=, NULL);
656667ec66Serapheim Dimitropoulos	spa_config_exit(spa, SCL_VDEV, FTAG);
657667ec66Serapheim Dimitropoulos
6585cabbc6Prashanth Sreenivasa	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
6595cabbc6Prashanth Sreenivasa	spa_condensing_indirect_phys_t *scip =
6605cabbc6Prashanth Sreenivasa	    &spa->spa_condensing_indirect_phys;
6615cabbc6Prashanth Sreenivasa	uint32_t *counts;
6625cabbc6Prashanth Sreenivasa	uint64_t start_index;
6635cabbc6Prashanth Sreenivasa	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
6645cabbc6Prashanth Sreenivasa	space_map_t *prev_obsolete_sm = NULL;
6655cabbc6Prashanth Sreenivasa
6665cabbc6Prashanth Sreenivasa	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
6675cabbc6Prashanth Sreenivasa	ASSERT(scip->scip_next_mapping_object != 0);
6685cabbc6Prashanth Sreenivasa	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
6695cabbc6Prashanth Sreenivasa	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
6705cabbc6Prashanth Sreenivasa
6715cabbc6Prashanth Sreenivasa	for (int i = 0; i < TXG_SIZE; i++) {
6725cabbc6Prashanth Sreenivasa		/*
6735cabbc6Prashanth Sreenivasa		 * The list must start out empty in order for the
6745cabbc6Prashanth Sreenivasa		 * _commit_sync() sync task to be properly registered
6755cabbc6Prashanth Sreenivasa		 * on the first call to _commit_entry(); so it's wise
6765cabbc6Prashanth Sreenivasa		 * to double check and ensure we actually are starting
6775cabbc6Prashanth Sreenivasa		 * with empty lists.
6785cabbc6Prashanth Sreenivasa		 */
6795cabbc6Prashanth Sreenivasa		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
6805cabbc6Prashanth Sreenivasa	}
6815cabbc6Prashanth Sreenivasa
6825cabbc6Prashanth Sreenivasa	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
6835cabbc6Prashanth Sreenivasa	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
6845cabbc6Prashanth Sreenivasa	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
6855cabbc6Prashanth Sreenivasa	if (prev_obsolete_sm != NULL) {
6865cabbc6Prashanth Sreenivasa		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,