/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed.  Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA.  Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location.  As a consequence, not all DVAs can be
 * translated to an equivalent new DVA.  Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location.  This function is used in multiple ways:
 *
 *  - i/os to this vdev use the callback to determine where the
 *    data is now located, and issue child i/os for each segment's new
 *    location.
 *
 *  - frees and claims to this vdev use the callback to free or claim
 *    each mapped segment.  (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */
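
/*
 * For example (an illustrative sketch, not the exact callback signature):
 * a single 96K DVA on the removed vdev whose copy was broken across two
 * mapping entries results in two callback invocations, one per contiguous
 * segment of the new location:
 *
 *	split_offset 0,   new vdev A, new offset X, size 64K
 *	split_offset 64K, new vdev B, new offset Y, size 32K
 *
 * The vdevs, offsets, and sizes above are made up; the real values come
 * from the indirect mapping entries covering that DVA.
 */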

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
 *    in the pool that need to be marked obsolete.  When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *   references; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */
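
/*
 * Worked example (a sketch; the offsets and sizes are made up): freeing a
 * 128K block whose DVA is on an indirect vdev at offset 0x1000000 results
 * in a call like:
 *
 *	vdev_indirect_mark_obsolete(vd, 0x1000000, 0x20000);
 *
 * which adds the range [0x1000000, 0x1020000) to vd->vdev_obsolete_segments
 * in memory.  During the next spa sync, vdev_indirect_sync_obsolete()
 * appends that range to the vdev's obsolete space map, and a later condense
 * folds the space map into the per-entry obsolete counts, dropping any
 * mapping entries that have become fully obsolete.
 */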

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts.  Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */
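
/*
 * For instance (an illustrative sketch), if a condense was interrupted after
 * the new mapping had been written up to a maximum offset of 1G on the
 * removed vdev, then on restart spa_condense_indirect_thread() reloads the
 * precise counts, calls vdev_indirect_mapping_max_offset() on the new
 * mapping to get 1G, looks up the old-mapping entry at or after that offset,
 * and resumes spa_condense_indirect_generate_new_mapping() from that
 * entry's index.
 */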

boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete.  With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses.  Higher values will condense less often (causing less
 * i/o); lower values will reduce the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;
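
/*
 * (Each condense removes at least 25% of what remains mapped, so after
 * 16 condenses at most 0.75^16 ~= 0.010, i.e. about 1%, of the original
 * mapping can still be mapped.)
 */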

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically).  This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory.  The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly).  If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;

/*
 * If an indirect split block contains more than this many possible unique
 * combinations when being reconstructed, consider it too computationally
 * expensive to check them all.  Instead, try at most this many
 * randomly-selected combinations each time the block is accessed.  This
 * allows all segment copies to participate fairly in the reconstruction
 * when all combinations cannot be checked and prevents repeated use of
 * one bad copy.
 */
int zfs_reconstruct_indirect_combinations_max = 256;
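
/*
 * For example (illustrative numbers): a block split into 4 segments, each
 * with 2 surviving copies, has 2 * 2 * 2 * 2 = 16 unique combinations to
 * try, which is below the default limit of 256, so all of them are
 * enumerated.  A block split into 10 segments with 3 copies each has
 * 3^10 = 59049 combinations, so it is instead probed with randomly-selected
 * combinations, at most zfs_reconstruct_indirect_combinations_max per
 * access.
 */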

/*
 * Enable to simulate damaged segments and validate reconstruction.
 * Used by ztest.
 */
unsigned long zfs_reconstruct_indirect_damage_fraction = 0;

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	abd_t *ic_data;
	vdev_t *ic_vdev;

	/*
	 * ic_duplicate is NULL when the ic_data contents are unique; when it
	 * is determined to be a duplicate, it references the primary child.
	 */
	struct indirect_child *ic_duplicate;
	list_node_t ic_node; /* node on is_unique_child */
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size;
	int is_children; /* number of entries in is_child[] */
	int is_unique_children; /* number of entries in is_unique_child */
	list_t is_unique_child;

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	indirect_child_t *is_good_child;

	indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;
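
/*
 * For example (a sketch with made-up sizes): a 32K read whose mapping was
 * split into two segments produces two indirect_split_t's on iv_splits:
 * the first with is_split_offset == 0 and is_size == 16K, the second with
 * is_split_offset == 16K (the sum of the previous splits' sizes) and
 * is_size == 16K, each pointing at its own is_vdev/is_target_offset.
 */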

/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;
	uint64_t iv_unique_combinations;
	uint64_t iv_attempts;
	uint64_t iv_attempts_max;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;

static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;

	indirect_split_t *is;
	while ((is = list_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			if (ic->ic_data != NULL)
				abd_free(ic->ic_data);
		}
		list_remove(&iv->iv_splits, is);

		indirect_child_t *ic;
		while ((ic = list_head(&is->is_unique_child)) != NULL)
			list_remove(&is->is_unique_child, ic);

		list_destroy(&is->is_unique_child);

		kmem_free(is,
		    offsetof(indirect_split_t, is_child[is->is_children]));
	}
	kmem_free(iv, sizeof (*iv));
}

static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
	vdev_indirect_map_free,
	zio_vsd_default_cksum_report
};

/*
 * Mark the given offset and size as being obsolete.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
	}
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size);
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping.  The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
}

/* ARGSUSED */
static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;

	return (spa->spa_condensing_indirect != NULL);
}

/* ARGSUSED */
static void
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping.  Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. _entry_for_offset_or_next()
		 * returns a pointer into the vim_entries array. If
		 * max_offset is greater than any of the mappings
		 * contained in the table, NULL will be returned,
		 * indicating that we've exhausted our iteration of the
		 * old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts,
	    start_index, zthr);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * If the zthr has received a cancellation signal while running
	 * in generate_new_mapping() or at any point after that, then bail
	 * early. We don't want to complete the condense if the spa is
	 * shutting down.
	 */
	if (zthr_iscancelled(zthr))
		return;

	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
	    spa_condense_indirect_complete_sync, sci, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

/*
 * Sync task to begin the condensing process.
 */
void
spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;

	ASSERT0(scip->scip_next_mapping_object);
	ASSERT0(scip->scip_prev_obsolete_sm_object);
	ASSERT0(scip->scip_vdev);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));

	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
	ASSERT(obsolete_sm_obj != 0);

	scip->scip_vdev = vd->vdev_id;
	scip->scip_next_mapping_object =
	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);

	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;

	/*
	 * We don't need to allocate a new space map object, since
	 * vdev_indirect_sync_obsolete will allocate one when needed.
	 */
	space_map_close(vd->vdev_obsolete_sm);
	vd->vdev_obsolete_sm = NULL;
	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));

	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (*scip) / sizeof (uint64_t), scip, tx));

	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);

	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
	    "posm=%llu nm=%llu",
	    vd->vdev_id, dmu_tx_get_txg(tx),
	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
	    (u_longlong_t)scip->scip_next_mapping_object);

	zthr_wakeup(spa->spa_condense_zthr);
}

/*
 * Sync to the given vdev's obsolete space map any segments that are no longer
 * referenced as of the given txg.
 *
 * If the obsolete space map doesn't exist yet, create and open it.
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

	ASSERT3U(vic->vic_mapping_object, !=, 0);
	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

	if (vdev_obsolete_sm_object(vd) == 0) {
		uint64_t obsolete_sm_object =
		    space_map_alloc(spa->spa_meta_objset,
		    zfs_vdev_standard_sm_blksz, tx);

		ASSERT(vd->vdev_top_zap != 0);
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);

		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
		    spa->spa_meta_objset, obsolete_sm_object,
		    0, vd->vdev_asize, 0));
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);
	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	space_map_write(vd->vdev_obsolete_sm,
	    vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

int
spa_condense_init(spa_t *spa)
{
	int error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	    &spa->spa_condensing_indirect_phys);
	if (error == 0) {
		if (spa_writeable(spa)) {
			spa->spa_condensing_indirect =
			    spa_condensing_indirect_create(spa);
		}
		return (0);
	} else if (error == ENOENT) {
		return (0);
	} else {
		return (error);
	}
}

void
spa_condense_fini(spa_t *spa)
{
	if (spa->spa_condensing_indirect != NULL) {
		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
		spa->spa_condensing_indirect = NULL;
	}
}

void
spa_start_indirect_condensing_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
	spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
	    spa_condense_indirect_thread, spa);
}

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.
 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
 * exist yet.
 */
int
vdev_obsolete_sm_object(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (0);
	}

	uint64_t sm_obj = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);

	ASSERT(err == 0 || err == ENOENT);

	return (sm_obj);
}

boolean_t
vdev_obsolete_counts_are_precise(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (B_FALSE);
	}

	uint64_t val = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);

	ASSERT(err == 0 || err == ENOENT);
9175cabbc6bSPrashanth Sreenivasa 
9185cabbc6bSPrashanth Sreenivasa 	return (val != 0);
9195cabbc6bSPrashanth Sreenivasa }
9205cabbc6bSPrashanth Sreenivasa 
9215cabbc6bSPrashanth Sreenivasa /* ARGSUSED */
9225cabbc6bSPrashanth Sreenivasa static void
9235cabbc6bSPrashanth Sreenivasa vdev_indirect_close(vdev_t *vd)
9245cabbc6bSPrashanth Sreenivasa {
9255cabbc6bSPrashanth Sreenivasa }
9265cabbc6bSPrashanth Sreenivasa 
9275cabbc6bSPrashanth Sreenivasa /* ARGSUSED */
9285cabbc6bSPrashanth Sreenivasa static int
9295cabbc6bSPrashanth Sreenivasa vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
9305cabbc6bSPrashanth Sreenivasa     uint64_t *ashift)
9315cabbc6bSPrashanth Sreenivasa {
9325cabbc6bSPrashanth Sreenivasa 	*psize = *max_psize = vd->vdev_asize +
9335cabbc6bSPrashanth Sreenivasa 	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
9345cabbc6bSPrashanth Sreenivasa 	*ashift = vd->vdev_ashift;
9355cabbc6bSPrashanth Sreenivasa 	return (0);
9365cabbc6bSPrashanth Sreenivasa }
9375cabbc6bSPrashanth Sreenivasa 
9385cabbc6bSPrashanth Sreenivasa typedef struct remap_segment {
9395cabbc6bSPrashanth Sreenivasa 	vdev_t *rs_vd;
9405cabbc6bSPrashanth Sreenivasa 	uint64_t rs_offset;
9415cabbc6bSPrashanth Sreenivasa 	uint64_t rs_asize;
9425cabbc6bSPrashanth Sreenivasa 	uint64_t rs_split_offset;
9435cabbc6bSPrashanth Sreenivasa 	list_node_t rs_node;
9445cabbc6bSPrashanth Sreenivasa } remap_segment_t;
9455cabbc6bSPrashanth Sreenivasa 
9465cabbc6bSPrashanth Sreenivasa remap_segment_t *
9475cabbc6bSPrashanth Sreenivasa rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
9485cabbc6bSPrashanth Sreenivasa {
9495cabbc6bSPrashanth Sreenivasa 	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
9505cabbc6bSPrashanth Sreenivasa 	rs->rs_vd = vd;
9515cabbc6bSPrashanth Sreenivasa 	rs->rs_offset = offset;
9525cabbc6bSPrashanth Sreenivasa 	rs->rs_asize = asize;
9535cabbc6bSPrashanth Sreenivasa 	rs->rs_split_offset = split_offset;
9545cabbc6bSPrashanth Sreenivasa 	return (rs);
9555cabbc6bSPrashanth Sreenivasa }
9565cabbc6bSPrashanth Sreenivasa 
957bdfded42SSerapheim Dimitropoulos /*
958bdfded42SSerapheim Dimitropoulos  * Given an indirect vdev and an extent on that vdev, this function duplicates
959bdfded42SSerapheim Dimitropoulos  * the physical entries of the indirect mapping that correspond to the extent
960bdfded42SSerapheim Dimitropoulos  * into a new array and returns a pointer to it. In addition, copied_entries
961bdfded42SSerapheim Dimitropoulos  * is populated with the number of mapping entries that were duplicated.
962bdfded42SSerapheim Dimitropoulos  *
963bdfded42SSerapheim Dimitropoulos  * Note that the function assumes that the caller holds vdev_indirect_rwlock.
964bdfded42SSerapheim Dimitropoulos  * This ensures that the mapping won't change due to condensing as we
965bdfded42SSerapheim Dimitropoulos  * copy over its contents.
966bdfded42SSerapheim Dimitropoulos  *
967bdfded42SSerapheim Dimitropoulos  * Finally, since we are doing an allocation, it is up to the caller to
968bdfded42SSerapheim Dimitropoulos  * free the array allocated in this function.
969bdfded42SSerapheim Dimitropoulos  */
970bdfded42SSerapheim Dimitropoulos vdev_indirect_mapping_entry_phys_t *
971bdfded42SSerapheim Dimitropoulos vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
972bdfded42SSerapheim Dimitropoulos     uint64_t asize, uint64_t *copied_entries)
973bdfded42SSerapheim Dimitropoulos {
974bdfded42SSerapheim Dimitropoulos 	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
975bdfded42SSerapheim Dimitropoulos 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
976bdfded42SSerapheim Dimitropoulos 	uint64_t entries = 0;
977bdfded42SSerapheim Dimitropoulos 
978bdfded42SSerapheim Dimitropoulos 	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
979bdfded42SSerapheim Dimitropoulos 
980bdfded42SSerapheim Dimitropoulos 	vdev_indirect_mapping_entry_phys_t *first_mapping =
981bdfded42SSerapheim Dimitropoulos 	    vdev_indirect_mapping_entry_for_offset(vim, offset);
982bdfded42SSerapheim Dimitropoulos 	ASSERT3P(first_mapping, !=, NULL);
983bdfded42SSerapheim Dimitropoulos 
984bdfded42SSerapheim Dimitropoulos 	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
985bdfded42SSerapheim Dimitropoulos 	while (asize > 0) {
986bdfded42SSerapheim Dimitropoulos 		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
987bdfded42SSerapheim Dimitropoulos 
988bdfded42SSerapheim Dimitropoulos 		ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
989bdfded42SSerapheim Dimitropoulos 		ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
990bdfded42SSerapheim Dimitropoulos 
991bdfded42SSerapheim Dimitropoulos 		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
992bdfded42SSerapheim Dimitropoulos 		uint64_t inner_size = MIN(asize, size - inner_offset);
993bdfded42SSerapheim Dimitropoulos 
994bdfded42SSerapheim Dimitropoulos 		offset += inner_size;
995bdfded42SSerapheim Dimitropoulos 		asize -= inner_size;
996bdfded42SSerapheim Dimitropoulos 		entries++;
997bdfded42SSerapheim Dimitropoulos 		m++;
998bdfded42SSerapheim Dimitropoulos 	}
999bdfded42SSerapheim Dimitropoulos 
1000bdfded42SSerapheim Dimitropoulos 	size_t copy_length = entries * sizeof (*first_mapping);
1001bdfded42SSerapheim Dimitropoulos 	duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
1002bdfded42SSerapheim Dimitropoulos 	bcopy(first_mapping, duplicate_mappings, copy_length);
1003bdfded42SSerapheim Dimitropoulos 	*copied_entries = entries;
1004bdfded42SSerapheim Dimitropoulos 
1005bdfded42SSerapheim Dimitropoulos 	return (duplicate_mappings);
1006bdfded42SSerapheim Dimitropoulos }
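
/*
 * Illustrative sketch of the calling pattern for the function above,
 * under the stated assumptions: the copy is taken while holding the
 * read lock, and may be consumed and freed after the lock is dropped.
 * The function name below is hypothetical.
 */
#if 0	/* example only, not compiled */
static void
example_walk_mapping_copy(vdev_t *vd, uint64_t offset, uint64_t asize)
{
	uint64_t entries = 0;

	rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
	vdev_indirect_mapping_entry_phys_t *copy =
	    vdev_indirect_mapping_duplicate_adjacent_entries(vd,
	    offset, asize, &entries);
	rw_exit(&vd->vdev_indirect_rwlock);

	/* The private copy remains valid even if condensing proceeds. */
	for (uint64_t i = 0; i < entries; i++) {
		/* ... consume copy[i], e.g. DVA_GET_ASIZE(&copy[i].vimep_dst) ... */
	}

	/* The caller owns the duplicated array and must free it. */
	kmem_free(copy, entries * sizeof (*copy));
}
#endif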
1007bdfded42SSerapheim Dimitropoulos 
10085cabbc6bSPrashanth Sreenivasa /*
10095cabbc6bSPrashanth Sreenivasa  * Goes through the relevant indirect mappings until it hits a concrete vdev
10105cabbc6bSPrashanth Sreenivasa  * and issues the callback. On the way to the concrete vdev, if any other
10115cabbc6bSPrashanth Sreenivasa  * indirect vdevs are encountered, then the callback will also be called on
10125cabbc6bSPrashanth Sreenivasa  * each of those indirect vdevs. For example, if the segment is mapped to
10135cabbc6bSPrashanth Sreenivasa  * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
10145cabbc6bSPrashanth Sreenivasa  * mapped to segment B on concrete vdev 2, then the callback will be called on
10155cabbc6bSPrashanth Sreenivasa  * both vdev 1 and vdev 2.
10165cabbc6bSPrashanth Sreenivasa  *
10175cabbc6bSPrashanth Sreenivasa  * While the callback passed to vdev_indirect_remap() is called on every vdev
10185cabbc6bSPrashanth Sreenivasa  * the function encounters, certain callbacks only care about concrete vdevs.
10195cabbc6bSPrashanth Sreenivasa  * These types of callbacks should return immediately and explicitly when they
10205cabbc6bSPrashanth Sreenivasa  * are called on an indirect vdev.
10215cabbc6bSPrashanth Sreenivasa  *
10225cabbc6bSPrashanth Sreenivasa  * Because there is a possibility that a DVA section in the indirect device
10235cabbc6bSPrashanth Sreenivasa  * has been split into multiple sections in our mapping, we keep track
10245cabbc6bSPrashanth Sreenivasa  * of the relevant contiguous segments of the new location (remap_segment_t)
10255cabbc6bSPrashanth Sreenivasa  * in a stack. This way we can call the callback for each of the new sections
10265cabbc6bSPrashanth Sreenivasa  * created by a single section of the indirect device. Note though, that in
10275cabbc6bSPrashanth Sreenivasa  * this scenario the callbacks in each split block won't occur in-order in
10285cabbc6bSPrashanth Sreenivasa  * terms of offset, so callers should not make any assumptions about that.
10295cabbc6bSPrashanth Sreenivasa  *
10305cabbc6bSPrashanth Sreenivasa  * For callbacks that don't handle split blocks and immediately return when
10315cabbc6bSPrashanth Sreenivasa  * they encounter them (as is the case for remap_blkptr_cb), the caller can
10325cabbc6bSPrashanth Sreenivasa  * assume that its callback will be applied from the first indirect vdev
10335cabbc6bSPrashanth Sreenivasa  * encountered to the last one and then the concrete vdev, in that order.
10345cabbc6bSPrashanth Sreenivasa  */
10355cabbc6bSPrashanth Sreenivasa static void
10365cabbc6bSPrashanth Sreenivasa vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
10375cabbc6bSPrashanth Sreenivasa     void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
10385cabbc6bSPrashanth Sreenivasa {
10395cabbc6bSPrashanth Sreenivasa 	list_t stack;
10405cabbc6bSPrashanth Sreenivasa 	spa_t *spa = vd->vdev_spa;
10415cabbc6bSPrashanth Sreenivasa 
10425cabbc6bSPrashanth Sreenivasa 	list_create(&stack, sizeof (remap_segment_t),
10435cabbc6bSPrashanth Sreenivasa 	    offsetof(remap_segment_t, rs_node));
10445cabbc6bSPrashanth Sreenivasa 
10455cabbc6bSPrashanth Sreenivasa 	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
10465cabbc6bSPrashanth Sreenivasa 	    rs != NULL; rs = list_remove_head(&stack)) {
10475cabbc6bSPrashanth Sreenivasa 		vdev_t *v = rs->rs_vd;
1048bdfded42SSerapheim Dimitropoulos 		uint64_t num_entries = 0;
1049bdfded42SSerapheim Dimitropoulos 
1050bdfded42SSerapheim Dimitropoulos 		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1051bdfded42SSerapheim Dimitropoulos 		ASSERT(rs->rs_asize > 0);
10525cabbc6bSPrashanth Sreenivasa 
10535cabbc6bSPrashanth Sreenivasa 		/*
1054bdfded42SSerapheim Dimitropoulos 		 * Note: As this function can be called from open context
1055bdfded42SSerapheim Dimitropoulos 		 * (e.g. zio_read()), we need the following rwlock to
1056bdfded42SSerapheim Dimitropoulos 		 * prevent the mapping from being changed by condensing.
1057bdfded42SSerapheim Dimitropoulos 		 *
1058bdfded42SSerapheim Dimitropoulos 		 * So we grab the lock and we make a copy of the entries
1059bdfded42SSerapheim Dimitropoulos 		 * that are relevant to the extent that we are working on.
1060bdfded42SSerapheim Dimitropoulos 		 * Once that is done, we drop the lock and iterate over
1061bdfded42SSerapheim Dimitropoulos 		 * our copy of the mapping. Once we are done with the
1062bdfded42SSerapheim Dimitropoulos 		 * remap segment and we free it, we also free our copy
1063bdfded42SSerapheim Dimitropoulos 		 * of the indirect mapping entries that are relevant to it.
1064bdfded42SSerapheim Dimitropoulos 		 *
1065bdfded42SSerapheim Dimitropoulos 		 * This way we don't need to wait until the function is
1066bdfded42SSerapheim Dimitropoulos 		 * finished with a segment, to condense it. In addition, we
1067bdfded42SSerapheim Dimitropoulos 		 * don't need a recursive rwlock for the case that a call to
1068bdfded42SSerapheim Dimitropoulos 		 * vdev_indirect_remap() needs to call itself (through the
1069bdfded42SSerapheim Dimitropoulos 		 * codepath of its callback) for the same vdev in the middle
1070bdfded42SSerapheim Dimitropoulos 		 * of its execution.
10715cabbc6bSPrashanth Sreenivasa 		 */
10725cabbc6bSPrashanth Sreenivasa 		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
10735cabbc6bSPrashanth Sreenivasa 		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
10745cabbc6bSPrashanth Sreenivasa 		ASSERT3P(vim, !=, NULL);
10755cabbc6bSPrashanth Sreenivasa 
10765cabbc6bSPrashanth Sreenivasa 		vdev_indirect_mapping_entry_phys_t *mapping =
1077bdfded42SSerapheim Dimitropoulos 		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
1078bdfded42SSerapheim Dimitropoulos 		    rs->rs_offset, rs->rs_asize, &num_entries);
10795cabbc6bSPrashanth Sreenivasa 		ASSERT3P(mapping, !=, NULL);
1080bdfded42SSerapheim Dimitropoulos 		ASSERT3U(num_entries, >, 0);
1081bdfded42SSerapheim Dimitropoulos 		rw_exit(&v->vdev_indirect_rwlock);
10825cabbc6bSPrashanth Sreenivasa 
1083bdfded42SSerapheim Dimitropoulos 		for (uint64_t i = 0; i < num_entries; i++) {
10845cabbc6bSPrashanth Sreenivasa 			/*
10855cabbc6bSPrashanth Sreenivasa 			 * Note: the vdev_indirect_mapping can not change
10865cabbc6bSPrashanth Sreenivasa 			 * while we are running.  It only changes while the
10875cabbc6bSPrashanth Sreenivasa 			 * removal is in progress, and then only from syncing
10885cabbc6bSPrashanth Sreenivasa 			 * context. While a removal is in progress, this
10895cabbc6bSPrashanth Sreenivasa 			 * function is only called for frees, which also only
10905cabbc6bSPrashanth Sreenivasa 			 * happen from syncing context.
10915cabbc6bSPrashanth Sreenivasa 			 */
1092bdfded42SSerapheim Dimitropoulos 			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
1093bdfded42SSerapheim Dimitropoulos 
1094bdfded42SSerapheim Dimitropoulos 			ASSERT3P(m, !=, NULL);
1095bdfded42SSerapheim Dimitropoulos 			ASSERT3U(rs->rs_asize, >, 0);
10965cabbc6bSPrashanth Sreenivasa 
1097bdfded42SSerapheim Dimitropoulos 			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
1098bdfded42SSerapheim Dimitropoulos 			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
1099bdfded42SSerapheim Dimitropoulos 			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
11005cabbc6bSPrashanth Sreenivasa 
11015cabbc6bSPrashanth Sreenivasa 			ASSERT3U(rs->rs_offset, >=,
1102bdfded42SSerapheim Dimitropoulos 			    DVA_MAPPING_GET_SRC_OFFSET(m));
11035cabbc6bSPrashanth Sreenivasa 			ASSERT3U(rs->rs_offset, <,
1104bdfded42SSerapheim Dimitropoulos 			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
11055cabbc6bSPrashanth Sreenivasa 			ASSERT3U(dst_vdev, !=, v->vdev_id);
11065cabbc6bSPrashanth Sreenivasa 
11075cabbc6bSPrashanth Sreenivasa 			uint64_t inner_offset = rs->rs_offset -
1108bdfded42SSerapheim Dimitropoulos 			    DVA_MAPPING_GET_SRC_OFFSET(m);
11095cabbc6bSPrashanth Sreenivasa 			uint64_t inner_size =
11105cabbc6bSPrashanth Sreenivasa 			    MIN(rs->rs_asize, size - inner_offset);
11115cabbc6bSPrashanth Sreenivasa 
11125cabbc6bSPrashanth Sreenivasa 			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
11135cabbc6bSPrashanth Sreenivasa 			ASSERT3P(dst_v, !=, NULL);
11145cabbc6bSPrashanth Sreenivasa 
11155cabbc6bSPrashanth Sreenivasa 			if (dst_v->vdev_ops == &vdev_indirect_ops) {
11165cabbc6bSPrashanth Sreenivasa 				list_insert_head(&stack,
11175cabbc6bSPrashanth Sreenivasa 				    rs_alloc(dst_v, dst_offset + inner_offset,
11185cabbc6bSPrashanth Sreenivasa 				    inner_size, rs->rs_split_offset));
11195cabbc6bSPrashanth Sreenivasa 
11205cabbc6bSPrashanth Sreenivasa 			}
11215cabbc6bSPrashanth Sreenivasa 
11225cabbc6bSPrashanth Sreenivasa 			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
11235cabbc6bSPrashanth Sreenivasa 			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
11245cabbc6bSPrashanth Sreenivasa 				/*
11255cabbc6bSPrashanth Sreenivasa 				 * Note: This clause exists solely for
11265cabbc6bSPrashanth Sreenivasa 				 * testing purposes. We use it to ensure that
11275cabbc6bSPrashanth Sreenivasa 				 * split blocks work and that the callbacks
11285cabbc6bSPrashanth Sreenivasa 				 * using them yield the same result if issued
11295cabbc6bSPrashanth Sreenivasa 				 * in reverse order.
11305cabbc6bSPrashanth Sreenivasa 				 */
11315cabbc6bSPrashanth Sreenivasa 				uint64_t inner_half = inner_size / 2;
11325cabbc6bSPrashanth Sreenivasa 
11335cabbc6bSPrashanth Sreenivasa 				func(rs->rs_split_offset + inner_half, dst_v,
11345cabbc6bSPrashanth Sreenivasa 				    dst_offset + inner_offset + inner_half,
11355cabbc6bSPrashanth Sreenivasa 				    inner_half, arg);
11365cabbc6bSPrashanth Sreenivasa 
11375cabbc6bSPrashanth Sreenivasa 				func(rs->rs_split_offset, dst_v,
11385cabbc6bSPrashanth Sreenivasa 				    dst_offset + inner_offset,
11395cabbc6bSPrashanth Sreenivasa 				    inner_half, arg);
11405cabbc6bSPrashanth Sreenivasa 			} else {
11415cabbc6bSPrashanth Sreenivasa 				func(rs->rs_split_offset, dst_v,
11425cabbc6bSPrashanth Sreenivasa 				    dst_offset + inner_offset,
11435cabbc6bSPrashanth Sreenivasa 				    inner_size, arg);
11445cabbc6bSPrashanth Sreenivasa 			}
11455cabbc6bSPrashanth Sreenivasa 
11465cabbc6bSPrashanth Sreenivasa 			rs->rs_offset += inner_size;
11475cabbc6bSPrashanth Sreenivasa 			rs->rs_asize -= inner_size;
11485cabbc6bSPrashanth Sreenivasa 			rs->rs_split_offset += inner_size;
11495cabbc6bSPrashanth Sreenivasa 		}
1150bdfded42SSerapheim Dimitropoulos 		VERIFY0(rs->rs_asize);
11515cabbc6bSPrashanth Sreenivasa 
1152bdfded42SSerapheim Dimitropoulos 		kmem_free(mapping, num_entries * sizeof (*mapping));
11535cabbc6bSPrashanth Sreenivasa 		kmem_free(rs, sizeof (remap_segment_t));
11545cabbc6bSPrashanth Sreenivasa 	}
11555cabbc6bSPrashanth Sreenivasa 	list_destroy(&stack);
11565cabbc6bSPrashanth Sreenivasa }
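
/*
 * Illustrative sketch of a "concrete vdevs only" callback of the kind
 * described in the comment above vdev_indirect_remap().  It sums the
 * mapped sizes of the concrete segments; the type and function names
 * are hypothetical.
 */
#if 0	/* example only, not compiled */
typedef struct example_size_sum {
	uint64_t ess_total;
} example_size_sum_t;

static void
example_concrete_size_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	example_size_sum_t *ess = arg;

	/* Per the contract above, skip intermediate indirect vdevs. */
	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	ess->ess_total += size;
}

/*
 * A caller holding the spa config lock as reader could then do:
 *
 *	example_size_sum_t ess = { 0 };
 *	vdev_indirect_remap(vd, offset, asize,
 *	    example_concrete_size_cb, &ess);
 */
#endif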
11575cabbc6bSPrashanth Sreenivasa 
11585cabbc6bSPrashanth Sreenivasa static void
11595cabbc6bSPrashanth Sreenivasa vdev_indirect_child_io_done(zio_t *zio)
11605cabbc6bSPrashanth Sreenivasa {
11615cabbc6bSPrashanth Sreenivasa 	zio_t *pio = zio->io_private;
11625cabbc6bSPrashanth Sreenivasa 
11635cabbc6bSPrashanth Sreenivasa 	mutex_enter(&pio->io_lock);
11645cabbc6bSPrashanth Sreenivasa 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
11655cabbc6bSPrashanth Sreenivasa 	mutex_exit(&pio->io_lock);
11665cabbc6bSPrashanth Sreenivasa 
11675cabbc6bSPrashanth Sreenivasa 	abd_put(zio->io_abd);
11685cabbc6bSPrashanth Sreenivasa }
11695cabbc6bSPrashanth Sreenivasa 
11703a4b1be9SMatthew Ahrens /*
11713a4b1be9SMatthew Ahrens  * This is a callback for vdev_indirect_remap() which allocates an
11723a4b1be9SMatthew Ahrens  * indirect_split_t for each split segment and adds it to iv_splits.
11733a4b1be9SMatthew Ahrens  */
11745cabbc6bSPrashanth Sreenivasa static void
11753a4b1be9SMatthew Ahrens vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
11765cabbc6bSPrashanth Sreenivasa     uint64_t size, void *arg)
11775cabbc6bSPrashanth Sreenivasa {
11785cabbc6bSPrashanth Sreenivasa 	zio_t *zio = arg;
11793a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
11805cabbc6bSPrashanth Sreenivasa 
11815cabbc6bSPrashanth Sreenivasa 	ASSERT3P(vd, !=, NULL);
11825cabbc6bSPrashanth Sreenivasa 
11835cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_ops == &vdev_indirect_ops)
11845cabbc6bSPrashanth Sreenivasa 		return;
11855cabbc6bSPrashanth Sreenivasa 
11863a4b1be9SMatthew Ahrens 	int n = 1;
11873a4b1be9SMatthew Ahrens 	if (vd->vdev_ops == &vdev_mirror_ops)
11883a4b1be9SMatthew Ahrens 		n = vd->vdev_children;
11893a4b1be9SMatthew Ahrens 
11903a4b1be9SMatthew Ahrens 	indirect_split_t *is =
11913a4b1be9SMatthew Ahrens 	    kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
11923a4b1be9SMatthew Ahrens 
11933a4b1be9SMatthew Ahrens 	is->is_children = n;
11943a4b1be9SMatthew Ahrens 	is->is_size = size;
11953a4b1be9SMatthew Ahrens 	is->is_split_offset = split_offset;
11963a4b1be9SMatthew Ahrens 	is->is_target_offset = offset;
11973a4b1be9SMatthew Ahrens 	is->is_vdev = vd;
1198a21fe349SBrian Behlendorf 	list_create(&is->is_unique_child, sizeof (indirect_child_t),
1199a21fe349SBrian Behlendorf 	    offsetof(indirect_child_t, ic_node));
12003a4b1be9SMatthew Ahrens 
12013a4b1be9SMatthew Ahrens 	/*
12023a4b1be9SMatthew Ahrens 	 * Note that we only consider multiple copies of the data for
12033a4b1be9SMatthew Ahrens 	 * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
12043a4b1be9SMatthew Ahrens 	 * though they use the same ops as mirror, because there's only one
12053a4b1be9SMatthew Ahrens 	 * "good" copy under the replacing/spare.
12063a4b1be9SMatthew Ahrens 	 */
12073a4b1be9SMatthew Ahrens 	if (vd->vdev_ops == &vdev_mirror_ops) {
12083a4b1be9SMatthew Ahrens 		for (int i = 0; i < n; i++) {
12093a4b1be9SMatthew Ahrens 			is->is_child[i].ic_vdev = vd->vdev_child[i];
1210a21fe349SBrian Behlendorf 			list_link_init(&is->is_child[i].ic_node);
12113a4b1be9SMatthew Ahrens 		}
12123a4b1be9SMatthew Ahrens 	} else {
12133a4b1be9SMatthew Ahrens 		is->is_child[0].ic_vdev = vd;
12143a4b1be9SMatthew Ahrens 	}
12153a4b1be9SMatthew Ahrens 
12163a4b1be9SMatthew Ahrens 	list_insert_tail(&iv->iv_splits, is);
12173a4b1be9SMatthew Ahrens }
12183a4b1be9SMatthew Ahrens 
12193a4b1be9SMatthew Ahrens static void
12203a4b1be9SMatthew Ahrens vdev_indirect_read_split_done(zio_t *zio)
12213a4b1be9SMatthew Ahrens {
12223a4b1be9SMatthew Ahrens 	indirect_child_t *ic = zio->io_private;
12233a4b1be9SMatthew Ahrens 
12243a4b1be9SMatthew Ahrens 	if (zio->io_error != 0) {
12253a4b1be9SMatthew Ahrens 		/*
12263a4b1be9SMatthew Ahrens 		 * Clear ic_data to indicate that we do not have data for this
12273a4b1be9SMatthew Ahrens 		 * child.
12283a4b1be9SMatthew Ahrens 		 */
12293a4b1be9SMatthew Ahrens 		abd_free(ic->ic_data);
12303a4b1be9SMatthew Ahrens 		ic->ic_data = NULL;
12313a4b1be9SMatthew Ahrens 	}
12323a4b1be9SMatthew Ahrens }
12333a4b1be9SMatthew Ahrens 
12343a4b1be9SMatthew Ahrens /*
12353a4b1be9SMatthew Ahrens  * Issue reads for all copies (mirror children) of all splits.
12363a4b1be9SMatthew Ahrens  */
12373a4b1be9SMatthew Ahrens static void
12383a4b1be9SMatthew Ahrens vdev_indirect_read_all(zio_t *zio)
12393a4b1be9SMatthew Ahrens {
12403a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
12413a4b1be9SMatthew Ahrens 
1242e4c795beSTom Caputi 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
1243e4c795beSTom Caputi 
12443a4b1be9SMatthew Ahrens 	for (indirect_split_t *is = list_head(&iv->iv_splits);
12453a4b1be9SMatthew Ahrens 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
12463a4b1be9SMatthew Ahrens 		for (int i = 0; i < is->is_children; i++) {
12473a4b1be9SMatthew Ahrens 			indirect_child_t *ic = &is->is_child[i];
12483a4b1be9SMatthew Ahrens 
12493a4b1be9SMatthew Ahrens 			if (!vdev_readable(ic->ic_vdev))
12503a4b1be9SMatthew Ahrens 				continue;
12513a4b1be9SMatthew Ahrens 
12523a4b1be9SMatthew Ahrens 			/*
12533a4b1be9SMatthew Ahrens 			 * Note, we may read from a child whose DTL
12543a4b1be9SMatthew Ahrens 			 * indicates that the data may not be present here.
12553a4b1be9SMatthew Ahrens 			 * While this might result in a few i/os that will
12563a4b1be9SMatthew Ahrens 			 * likely return incorrect data, it simplifies the
12573a4b1be9SMatthew Ahrens 			 * code since we can treat scrub and resilver
12583a4b1be9SMatthew Ahrens 			 * identically.  (The incorrect data will be
12593a4b1be9SMatthew Ahrens 			 * detected and ignored when we verify the
12603a4b1be9SMatthew Ahrens 			 * checksum.)
12613a4b1be9SMatthew Ahrens 			 */
12623a4b1be9SMatthew Ahrens 
12633a4b1be9SMatthew Ahrens 			ic->ic_data = abd_alloc_sametype(zio->io_abd,
12643a4b1be9SMatthew Ahrens 			    is->is_size);
1265a21fe349SBrian Behlendorf 			ic->ic_duplicate = NULL;
12663a4b1be9SMatthew Ahrens 
12673a4b1be9SMatthew Ahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
12683a4b1be9SMatthew Ahrens 			    ic->ic_vdev, is->is_target_offset, ic->ic_data,
12693a4b1be9SMatthew Ahrens 			    is->is_size, zio->io_type, zio->io_priority, 0,
12703a4b1be9SMatthew Ahrens 			    vdev_indirect_read_split_done, ic));
12713a4b1be9SMatthew Ahrens 		}
12723a4b1be9SMatthew Ahrens 	}
12733a4b1be9SMatthew Ahrens 	iv->iv_reconstruct = B_TRUE;
12745cabbc6bSPrashanth Sreenivasa }
12755cabbc6bSPrashanth Sreenivasa 
12765cabbc6bSPrashanth Sreenivasa static void
12775cabbc6bSPrashanth Sreenivasa vdev_indirect_io_start(zio_t *zio)
12785cabbc6bSPrashanth Sreenivasa {
12795cabbc6bSPrashanth Sreenivasa 	spa_t *spa = zio->io_spa;
12803a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
12813a4b1be9SMatthew Ahrens 	list_create(&iv->iv_splits,
12823a4b1be9SMatthew Ahrens 	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
12833a4b1be9SMatthew Ahrens 
12843a4b1be9SMatthew Ahrens 	zio->io_vsd = iv;
12853a4b1be9SMatthew Ahrens 	zio->io_vsd_ops = &vdev_indirect_vsd_ops;
12865cabbc6bSPrashanth Sreenivasa 
12875cabbc6bSPrashanth Sreenivasa 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
12885cabbc6bSPrashanth Sreenivasa 	if (zio->io_type != ZIO_TYPE_READ) {
12895cabbc6bSPrashanth Sreenivasa 		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
12903a4b1be9SMatthew Ahrens 		/*
12913a4b1be9SMatthew Ahrens 		 * Note: this code can handle other kinds of writes,
12923a4b1be9SMatthew Ahrens 		 * but we don't expect them.
12933a4b1be9SMatthew Ahrens 		 */
12943a4b1be9SMatthew Ahrens 		ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
12953a4b1be9SMatthew Ahrens 		    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
12965cabbc6bSPrashanth Sreenivasa 	}
12975cabbc6bSPrashanth Sreenivasa 
12985cabbc6bSPrashanth Sreenivasa 	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
12993a4b1be9SMatthew Ahrens 	    vdev_indirect_gather_splits, zio);
13003a4b1be9SMatthew Ahrens 
13013a4b1be9SMatthew Ahrens 	indirect_split_t *first = list_head(&iv->iv_splits);
13023a4b1be9SMatthew Ahrens 	if (first->is_size == zio->io_size) {
13033a4b1be9SMatthew Ahrens 		/*
13043a4b1be9SMatthew Ahrens 		 * This is not a split block; we are pointing to the entire
13053a4b1be9SMatthew Ahrens 		 * data, which will checksum the same as the original data.
13063a4b1be9SMatthew Ahrens 		 * Pass the BP down so that the child i/o can verify the
13073a4b1be9SMatthew Ahrens 		 * checksum, and try a different location if available
13083a4b1be9SMatthew Ahrens 		 * (e.g. on a mirror).
13093a4b1be9SMatthew Ahrens 		 *
13103a4b1be9SMatthew Ahrens 		 * While this special case could be handled the same as the
13113a4b1be9SMatthew Ahrens 		 * general (split block) case, doing it this way ensures
13123a4b1be9SMatthew Ahrens 		 * that the vast majority of blocks on indirect vdevs
13133a4b1be9SMatthew Ahrens 		 * (which are not split) are handled identically to blocks
13143a4b1be9SMatthew Ahrens 		 * on non-indirect vdevs.  This allows us to be less strict
13153a4b1be9SMatthew Ahrens 		 * about performance in the general (but rare) case.
13163a4b1be9SMatthew Ahrens 		 */
13173a4b1be9SMatthew Ahrens 		ASSERT0(first->is_split_offset);
13183a4b1be9SMatthew Ahrens 		ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
13193a4b1be9SMatthew Ahrens 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
13203a4b1be9SMatthew Ahrens 		    first->is_vdev, first->is_target_offset,
13213a4b1be9SMatthew Ahrens 		    abd_get_offset(zio->io_abd, 0),
13223a4b1be9SMatthew Ahrens 		    zio->io_size, zio->io_type, zio->io_priority, 0,
13233a4b1be9SMatthew Ahrens 		    vdev_indirect_child_io_done, zio));
13243a4b1be9SMatthew Ahrens 	} else {
13253a4b1be9SMatthew Ahrens 		iv->iv_split_block = B_TRUE;
1326e4c795beSTom Caputi 		if (zio->io_type == ZIO_TYPE_READ &&
1327e4c795beSTom Caputi 		    zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
13283a4b1be9SMatthew Ahrens 			/*
13293a4b1be9SMatthew Ahrens 			 * Read all copies.  Note that for simplicity,
13303a4b1be9SMatthew Ahrens 			 * we don't bother consulting the DTL in the
13313a4b1be9SMatthew Ahrens 			 * resilver case.
13323a4b1be9SMatthew Ahrens 			 */
13333a4b1be9SMatthew Ahrens 			vdev_indirect_read_all(zio);
13343a4b1be9SMatthew Ahrens 		} else {
13353a4b1be9SMatthew Ahrens 			/*
1336e4c795beSTom Caputi 			 * If this is a read zio, we read one copy of each
1337e4c795beSTom Caputi 			 * split segment, from the top-level vdev.  Since
1338e4c795beSTom Caputi 			 * we don't know the checksum of each split
1339e4c795beSTom Caputi 			 * individually, the child zio can't ensure that
1340e4c795beSTom Caputi 			 * we get the right data. E.g. if it's a mirror,
1341e4c795beSTom Caputi 			 * it will just read from a random (healthy) leaf
1342e4c795beSTom Caputi 			 * vdev. We have to verify the checksum in
1343e4c795beSTom Caputi 			 * vdev_indirect_io_done().
1344e4c795beSTom Caputi 			 *
1345e4c795beSTom Caputi 			 * For write zios, the vdev code will ensure we write
1346e4c795beSTom Caputi 			 * to all children.
13473a4b1be9SMatthew Ahrens 			 */
13483a4b1be9SMatthew Ahrens 			for (indirect_split_t *is = list_head(&iv->iv_splits);
13493a4b1be9SMatthew Ahrens 			    is != NULL; is = list_next(&iv->iv_splits, is)) {
13503a4b1be9SMatthew Ahrens 				zio_nowait(zio_vdev_child_io(zio, NULL,
13513a4b1be9SMatthew Ahrens 				    is->is_vdev, is->is_target_offset,
13523a4b1be9SMatthew Ahrens 				    abd_get_offset(zio->io_abd,
13533a4b1be9SMatthew Ahrens 				    is->is_split_offset),
13543a4b1be9SMatthew Ahrens 				    is->is_size, zio->io_type,
13553a4b1be9SMatthew Ahrens 				    zio->io_priority, 0,
13563a4b1be9SMatthew Ahrens 				    vdev_indirect_child_io_done, zio));
13573a4b1be9SMatthew Ahrens 			}
13583a4b1be9SMatthew Ahrens 		}
13593a4b1be9SMatthew Ahrens 	}
13605cabbc6bSPrashanth Sreenivasa 
13615cabbc6bSPrashanth Sreenivasa 	zio_execute(zio);
13625cabbc6bSPrashanth Sreenivasa }
13635cabbc6bSPrashanth Sreenivasa 
13643a4b1be9SMatthew Ahrens /*
13653a4b1be9SMatthew Ahrens  * Report a checksum error for a child.
13663a4b1be9SMatthew Ahrens  */
13673a4b1be9SMatthew Ahrens static void
13683a4b1be9SMatthew Ahrens vdev_indirect_checksum_error(zio_t *zio,
13693a4b1be9SMatthew Ahrens     indirect_split_t *is, indirect_child_t *ic)
13703a4b1be9SMatthew Ahrens {
13713a4b1be9SMatthew Ahrens 	vdev_t *vd = ic->ic_vdev;
13723a4b1be9SMatthew Ahrens 
13733a4b1be9SMatthew Ahrens 	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
13743a4b1be9SMatthew Ahrens 		return;
13753a4b1be9SMatthew Ahrens 
13763a4b1be9SMatthew Ahrens 	mutex_enter(&vd->vdev_stat_lock);
13773a4b1be9SMatthew Ahrens 	vd->vdev_stat.vs_checksum_errors++;
13783a4b1be9SMatthew Ahrens 	mutex_exit(&vd->vdev_stat_lock);
13793a4b1be9SMatthew Ahrens 
13803a4b1be9SMatthew Ahrens 	zio_bad_cksum_t zbc = { 0 };
13813a4b1be9SMatthew Ahrens 	void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
1382a21fe349SBrian Behlendorf 	abd_t *good_abd = is->is_good_child->ic_data;
13833a4b1be9SMatthew Ahrens 	void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
1384eb633035STom Caputi 	zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio,
13853a4b1be9SMatthew Ahrens 	    is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
13863a4b1be9SMatthew Ahrens 	abd_return_buf(ic->ic_data, bad_buf, is->is_size);
13873a4b1be9SMatthew Ahrens 	abd_return_buf(good_abd, good_buf, is->is_size);
13883a4b1be9SMatthew Ahrens }
13893a4b1be9SMatthew Ahrens 
13903a4b1be9SMatthew Ahrens /*
13913a4b1be9SMatthew Ahrens  * Issue repair i/os for any incorrect copies.  We do this by comparing
13923a4b1be9SMatthew Ahrens  * each split segment's correct data (is_good_child's ic_data) with each
13933a4b1be9SMatthew Ahrens  * other copy of the data.  If they differ, then we overwrite the bad data
13943a4b1be9SMatthew Ahrens  * with the good copy.  Note that we do this without regard for the DTL's,
13953a4b1be9SMatthew Ahrens  * which simplifies this code and also issues the optimal number of writes
13963a4b1be9SMatthew Ahrens  * (based on which copies actually read bad data, as opposed to which we
13973a4b1be9SMatthew Ahrens  * think might be wrong).  For the same reason, we always use
13983a4b1be9SMatthew Ahrens  * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
13993a4b1be9SMatthew Ahrens  */
14003a4b1be9SMatthew Ahrens static void
14013a4b1be9SMatthew Ahrens vdev_indirect_repair(zio_t *zio)
14023a4b1be9SMatthew Ahrens {
14033a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
14043a4b1be9SMatthew Ahrens 
14053a4b1be9SMatthew Ahrens 	enum zio_flag flags = ZIO_FLAG_IO_REPAIR;
14063a4b1be9SMatthew Ahrens 
14073a4b1be9SMatthew Ahrens 	if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))
14083a4b1be9SMatthew Ahrens 		flags |= ZIO_FLAG_SELF_HEAL;
14093a4b1be9SMatthew Ahrens 
14103a4b1be9SMatthew Ahrens 	if (!spa_writeable(zio->io_spa))
14113a4b1be9SMatthew Ahrens 		return;
14123a4b1be9SMatthew Ahrens 
14133a4b1be9SMatthew Ahrens 	for (indirect_split_t *is = list_head(&iv->iv_splits);
14143a4b1be9SMatthew Ahrens 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
14153a4b1be9SMatthew Ahrens 		for (int c = 0; c < is->is_children; c++) {
14163a4b1be9SMatthew Ahrens 			indirect_child_t *ic = &is->is_child[c];
1417a21fe349SBrian Behlendorf 			if (ic == is->is_good_child)
14183a4b1be9SMatthew Ahrens 				continue;
14193a4b1be9SMatthew Ahrens 			if (ic->ic_data == NULL)
14203a4b1be9SMatthew Ahrens 				continue;
1421a21fe349SBrian Behlendorf 			if (ic->ic_duplicate == is->is_good_child)
14223a4b1be9SMatthew Ahrens 				continue;
14233a4b1be9SMatthew Ahrens 
14243a4b1be9SMatthew Ahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
14253a4b1be9SMatthew Ahrens 			    ic->ic_vdev, is->is_target_offset,
1426a21fe349SBrian Behlendorf 			    is->is_good_child->ic_data, is->is_size,
14273a4b1be9SMatthew Ahrens 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
14283a4b1be9SMatthew Ahrens 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
14293a4b1be9SMatthew Ahrens 			    NULL, NULL));
14303a4b1be9SMatthew Ahrens 
14313a4b1be9SMatthew Ahrens 			vdev_indirect_checksum_error(zio, is, ic);
14323a4b1be9SMatthew Ahrens 		}
14333a4b1be9SMatthew Ahrens 	}
14343a4b1be9SMatthew Ahrens }
14353a4b1be9SMatthew Ahrens 
14363a4b1be9SMatthew Ahrens /*
14373a4b1be9SMatthew Ahrens  * Report checksum errors on all children that we read from.
14383a4b1be9SMatthew Ahrens  */
14393a4b1be9SMatthew Ahrens static void
14403a4b1be9SMatthew Ahrens vdev_indirect_all_checksum_errors(zio_t *zio)
14413a4b1be9SMatthew Ahrens {
14423a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
14433a4b1be9SMatthew Ahrens 
14443a4b1be9SMatthew Ahrens 	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
14453a4b1be9SMatthew Ahrens 		return;
14463a4b1be9SMatthew Ahrens 
14473a4b1be9SMatthew Ahrens 	for (indirect_split_t *is = list_head(&iv->iv_splits);
14483a4b1be9SMatthew Ahrens 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
14493a4b1be9SMatthew Ahrens 		for (int c = 0; c < is->is_children; c++) {
14503a4b1be9SMatthew Ahrens 			indirect_child_t *ic = &is->is_child[c];
14513a4b1be9SMatthew Ahrens 
14523a4b1be9SMatthew Ahrens 			if (ic->ic_data == NULL)
14533a4b1be9SMatthew Ahrens 				continue;
14543a4b1be9SMatthew Ahrens 
14553a4b1be9SMatthew Ahrens 			vdev_t *vd = ic->ic_vdev;
14563a4b1be9SMatthew Ahrens 
14573a4b1be9SMatthew Ahrens 			mutex_enter(&vd->vdev_stat_lock);
14583a4b1be9SMatthew Ahrens 			vd->vdev_stat.vs_checksum_errors++;
14593a4b1be9SMatthew Ahrens 			mutex_exit(&vd->vdev_stat_lock);
14603a4b1be9SMatthew Ahrens 
1461eb633035STom Caputi 			zfs_ereport_post_checksum(zio->io_spa, vd,
1462eb633035STom Caputi 			    &zio->io_bookmark, zio, is->is_target_offset,
1463eb633035STom Caputi 			    is->is_size, NULL, NULL, NULL);
14643a4b1be9SMatthew Ahrens 		}
14653a4b1be9SMatthew Ahrens 	}
14663a4b1be9SMatthew Ahrens }
14673a4b1be9SMatthew Ahrens 
1468a21fe349SBrian Behlendorf /*
1469a21fe349SBrian Behlendorf  * Copy data from all the splits to the main zio, then validate the checksum.
1470a21fe349SBrian Behlendorf  * If the checksum is successfully validated, return success.
1471a21fe349SBrian Behlendorf  */
1472a21fe349SBrian Behlendorf static int
1473a21fe349SBrian Behlendorf vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
1474a21fe349SBrian Behlendorf {
1475a21fe349SBrian Behlendorf 	zio_bad_cksum_t zbc;
1476a21fe349SBrian Behlendorf 
1477a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1478a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1479a21fe349SBrian Behlendorf 
1480a21fe349SBrian Behlendorf 		ASSERT3P(is->is_good_child->ic_data, !=, NULL);
1481a21fe349SBrian Behlendorf 		ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
1482a21fe349SBrian Behlendorf 
1483a21fe349SBrian Behlendorf 		abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
1484a21fe349SBrian Behlendorf 		    is->is_split_offset, 0, is->is_size);
1485a21fe349SBrian Behlendorf 	}
1486a21fe349SBrian Behlendorf 
1487a21fe349SBrian Behlendorf 	return (zio_checksum_error(zio, &zbc));
1488a21fe349SBrian Behlendorf }
1489a21fe349SBrian Behlendorf 
1490a21fe349SBrian Behlendorf /*
1491a21fe349SBrian Behlendorf  * There are relatively few possible combinations, making it feasible to
1492a21fe349SBrian Behlendorf  * deterministically check them all.  We do this by setting the good_child
1493a21fe349SBrian Behlendorf  * to the next unique split version.  If we reach the end of the list, we
1494a21fe349SBrian Behlendorf  * "carry over" to the next split segment (like counting in base
1495a21fe349SBrian Behlendorf  * is_unique_children, but each digit can have a different base).
1496a21fe349SBrian Behlendorf  */
1497a21fe349SBrian Behlendorf static int
1498a21fe349SBrian Behlendorf vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
1499a21fe349SBrian Behlendorf {
1500a21fe349SBrian Behlendorf 	boolean_t more = B_TRUE;
1501a21fe349SBrian Behlendorf 
1502a21fe349SBrian Behlendorf 	iv->iv_attempts = 0;
1503a21fe349SBrian Behlendorf 
1504a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1505a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is))
1506a21fe349SBrian Behlendorf 		is->is_good_child = list_head(&is->is_unique_child);
1507a21fe349SBrian Behlendorf 
1508a21fe349SBrian Behlendorf 	while (more == B_TRUE) {
1509a21fe349SBrian Behlendorf 		iv->iv_attempts++;
1510a21fe349SBrian Behlendorf 		more = B_FALSE;
1511a21fe349SBrian Behlendorf 
1512a21fe349SBrian Behlendorf 		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
1513a21fe349SBrian Behlendorf 			return (0);
1514a21fe349SBrian Behlendorf 
1515a21fe349SBrian Behlendorf 		for (indirect_split_t *is = list_head(&iv->iv_splits);
1516a21fe349SBrian Behlendorf 		    is != NULL; is = list_next(&iv->iv_splits, is)) {
1517a21fe349SBrian Behlendorf 			is->is_good_child = list_next(&is->is_unique_child,
1518a21fe349SBrian Behlendorf 			    is->is_good_child);
1519a21fe349SBrian Behlendorf 			if (is->is_good_child != NULL) {
1520a21fe349SBrian Behlendorf 				more = B_TRUE;
1521a21fe349SBrian Behlendorf 				break;
1522a21fe349SBrian Behlendorf 			}
1523a21fe349SBrian Behlendorf 
1524a21fe349SBrian Behlendorf 			is->is_good_child = list_head(&is->is_unique_child);
1525a21fe349SBrian Behlendorf 		}
1526a21fe349SBrian Behlendorf 	}
1527a21fe349SBrian Behlendorf 
1528a21fe349SBrian Behlendorf 	ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
1529a21fe349SBrian Behlendorf 
1530a21fe349SBrian Behlendorf 	return (SET_ERROR(ECKSUM));
1531a21fe349SBrian Behlendorf }
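
/*
 * Worked example, for illustration: with three splits whose unique-child
 * counts are 2, 3, and 2, the loop above checks at most 2 * 3 * 2 = 12
 * combinations.  The first split's is_good_child advances on every
 * failed attempt (the "low digit"); when it wraps back to the head of
 * its is_unique_child list, the next split's is_good_child advances,
 * and so on, exactly as in mixed-radix counting.
 */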
1532a21fe349SBrian Behlendorf 
1533a21fe349SBrian Behlendorf /*
1534a21fe349SBrian Behlendorf  * There are too many combinations to try all of them in a reasonable amount
1535a21fe349SBrian Behlendorf  * of time.  So try a fixed number of random combinations from the unique
1536a21fe349SBrian Behlendorf  * split versions, after which we'll consider the block unrecoverable.
1537a21fe349SBrian Behlendorf  */
1538a21fe349SBrian Behlendorf static int
1539a21fe349SBrian Behlendorf vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
1540a21fe349SBrian Behlendorf {
1541a21fe349SBrian Behlendorf 	iv->iv_attempts = 0;
1542a21fe349SBrian Behlendorf 
1543a21fe349SBrian Behlendorf 	while (iv->iv_attempts < iv->iv_attempts_max) {
1544a21fe349SBrian Behlendorf 		iv->iv_attempts++;
1545a21fe349SBrian Behlendorf 
1546a21fe349SBrian Behlendorf 		for (indirect_split_t *is = list_head(&iv->iv_splits);
1547a21fe349SBrian Behlendorf 		    is != NULL; is = list_next(&iv->iv_splits, is)) {
1548a21fe349SBrian Behlendorf 			indirect_child_t *ic = list_head(&is->is_unique_child);
1549a21fe349SBrian Behlendorf 			int children = is->is_unique_children;
1550a21fe349SBrian Behlendorf 
1551a21fe349SBrian Behlendorf 			for (int i = spa_get_random(children); i > 0; i--)
1552a21fe349SBrian Behlendorf 				ic = list_next(&is->is_unique_child, ic);
1553a21fe349SBrian Behlendorf 
1554a21fe349SBrian Behlendorf 			ASSERT3P(ic, !=, NULL);
1555a21fe349SBrian Behlendorf 			is->is_good_child = ic;
1556a21fe349SBrian Behlendorf 		}
1557a21fe349SBrian Behlendorf 
1558a21fe349SBrian Behlendorf 		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
1559a21fe349SBrian Behlendorf 			return (0);
1560a21fe349SBrian Behlendorf 	}
1561a21fe349SBrian Behlendorf 
1562a21fe349SBrian Behlendorf 	return (SET_ERROR(ECKSUM));
1563a21fe349SBrian Behlendorf }
1564a21fe349SBrian Behlendorf 
1565a21fe349SBrian Behlendorf /*
1566a21fe349SBrian Behlendorf  * This is a validation function for reconstruction.  It randomly selects
1567a21fe349SBrian Behlendorf  * a good combination, if one can be found, and then it intentionally
1568a21fe349SBrian Behlendorf  * damages all other segment copies by zeroing them.  This forces the
1569a21fe349SBrian Behlendorf  * reconstruction algorithm to locate the one remaining known good copy.
1570a21fe349SBrian Behlendorf  */
1571a21fe349SBrian Behlendorf static int
1572a21fe349SBrian Behlendorf vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
1573a21fe349SBrian Behlendorf {
1574a21fe349SBrian Behlendorf 	/* Presume all the copies are unique for initial selection. */
1575a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1576a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1577a21fe349SBrian Behlendorf 		is->is_unique_children = 0;
1578a21fe349SBrian Behlendorf 
1579a21fe349SBrian Behlendorf 		for (int i = 0; i < is->is_children; i++) {
1580a21fe349SBrian Behlendorf 			indirect_child_t *ic = &is->is_child[i];
1581a21fe349SBrian Behlendorf 			if (ic->ic_data != NULL) {
1582a21fe349SBrian Behlendorf 				is->is_unique_children++;
1583a21fe349SBrian Behlendorf 				list_insert_tail(&is->is_unique_child, ic);
1584a21fe349SBrian Behlendorf 			}
1585a21fe349SBrian Behlendorf 		}
1586a21fe349SBrian Behlendorf 	}
1587a21fe349SBrian Behlendorf 
1588a21fe349SBrian Behlendorf 	/*
1589a21fe349SBrian Behlendorf 	 * Set each is_good_child to a randomly-selected child which
1590a21fe349SBrian Behlendorf 	 * is known to contain validated data.
1591a21fe349SBrian Behlendorf 	 */
1592a21fe349SBrian Behlendorf 	int error = vdev_indirect_splits_enumerate_randomly(iv, zio);
1593a21fe349SBrian Behlendorf 	if (error)
1594a21fe349SBrian Behlendorf 		goto out;
1595a21fe349SBrian Behlendorf 
1596a21fe349SBrian Behlendorf 	/*
1597a21fe349SBrian Behlendorf 	 * Damage all but the known good copy by zeroing it.  This will
1598a21fe349SBrian Behlendorf 	 * result in at most two unique copies per indirect_split_t.
1599a21fe349SBrian Behlendorf 	 * Both may need to be checked in order to reconstruct the block.
1600a21fe349SBrian Behlendorf 	 * Set iv->iv_attempts_max such that all unique combinations will
1601a21fe349SBrian Behlendorf 	 * be enumerated, but limit the damage to at most 16 indirect splits.
1602a21fe349SBrian Behlendorf 	 */
1603a21fe349SBrian Behlendorf 	iv->iv_attempts_max = 1;
1604a21fe349SBrian Behlendorf 
1605a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1606a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1607a21fe349SBrian Behlendorf 		for (int c = 0; c < is->is_children; c++) {
1608a21fe349SBrian Behlendorf 			indirect_child_t *ic = &is->is_child[c];
1609a21fe349SBrian Behlendorf 
1610a21fe349SBrian Behlendorf 			if (ic == is->is_good_child)
1611a21fe349SBrian Behlendorf 				continue;
1612a21fe349SBrian Behlendorf 			if (ic->ic_data == NULL)
1613a21fe349SBrian Behlendorf 				continue;
1614a21fe349SBrian Behlendorf 
1615a21fe349SBrian Behlendorf 			abd_zero(ic->ic_data, ic->ic_data->abd_size);
1616a21fe349SBrian Behlendorf 		}
1617a21fe349SBrian Behlendorf 
1618a21fe349SBrian Behlendorf 		iv->iv_attempts_max *= 2;
1619a21fe349SBrian Behlendorf 		if (iv->iv_attempts_max > (1ULL << 16)) {
1620a21fe349SBrian Behlendorf 			iv->iv_attempts_max = UINT64_MAX;
1621a21fe349SBrian Behlendorf 			break;
1622a21fe349SBrian Behlendorf 		}
1623a21fe349SBrian Behlendorf 	}
1624a21fe349SBrian Behlendorf 
1625a21fe349SBrian Behlendorf out:
1626a21fe349SBrian Behlendorf 	/* Empty the unique children lists so they can be reconstructed. */
1627a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1628a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1629a21fe349SBrian Behlendorf 		indirect_child_t *ic;
1630a21fe349SBrian Behlendorf 		while ((ic = list_head(&is->is_unique_child)) != NULL)
1631a21fe349SBrian Behlendorf 			list_remove(&is->is_unique_child, ic);
1632a21fe349SBrian Behlendorf 
1633a21fe349SBrian Behlendorf 		is->is_unique_children = 0;
1634a21fe349SBrian Behlendorf 	}
1635a21fe349SBrian Behlendorf 
1636a21fe349SBrian Behlendorf 	return (error);
1637a21fe349SBrian Behlendorf }
1638a21fe349SBrian Behlendorf 
16393a4b1be9SMatthew Ahrens /*
16403a4b1be9SMatthew Ahrens  * This function is called when we have read all copies of the data and need
16413a4b1be9SMatthew Ahrens  * to try to find a combination of copies that gives us the right checksum.
16423a4b1be9SMatthew Ahrens  *
16433a4b1be9SMatthew Ahrens  * If we pointed to any mirror vdevs, this effectively does the job of the
16443a4b1be9SMatthew Ahrens  * mirror.  The mirror vdev code can't do its own job because we don't know
1645a21fe349SBrian Behlendorf  * the checksum of each split segment individually.
16463a4b1be9SMatthew Ahrens  *
1647a21fe349SBrian Behlendorf  * We have to try every unique combination of copies of split segments, until
1648a21fe349SBrian Behlendorf  * we find one that checksums correctly.  Duplicate segment copies are first
1649a21fe349SBrian Behlendorf  * identified and later skipped during reconstruction.  This optimization
1650a21fe349SBrian Behlendorf  * reduces the search space and ensures that of the remaining combinations
1651a21fe349SBrian Behlendorf  * at most one is correct.
1652a21fe349SBrian Behlendorf  *
1653a21fe349SBrian Behlendorf  * When the total number of combinations is small they can all be checked.
1654a21fe349SBrian Behlendorf  * For example, if we have 3 segments in the split, and each points to a
1655a21fe349SBrian Behlendorf  * 2-way mirror with unique copies, we will have the following pieces of data:
16563a4b1be9SMatthew Ahrens  *
16573a4b1be9SMatthew Ahrens  *       |     mirror child
16583a4b1be9SMatthew Ahrens  * split |     [0]        [1]
16593a4b1be9SMatthew Ahrens  * ======|=====================
16603a4b1be9SMatthew Ahrens  *   A   |  data_A_0   data_A_1
16613a4b1be9SMatthew Ahrens  *   B   |  data_B_0   data_B_1
16623a4b1be9SMatthew Ahrens  *   C   |  data_C_0   data_C_1
16633a4b1be9SMatthew Ahrens  *
16643a4b1be9SMatthew Ahrens  * We will try the following (mirror children)^(number of splits) (2^3=8)
16653a4b1be9SMatthew Ahrens  * combinations, which is similar to bitwise-little-endian counting in
16663a4b1be9SMatthew Ahrens  * binary.  In general each "digit" corresponds to a split segment, and the
16673a4b1be9SMatthew Ahrens  * base of each digit is is_children, which can be different for each
16683a4b1be9SMatthew Ahrens  * digit.
16693a4b1be9SMatthew Ahrens  *
16703a4b1be9SMatthew Ahrens  * "low bit"        "high bit"
16713a4b1be9SMatthew Ahrens  *        v                 v
16723a4b1be9SMatthew Ahrens  * data_A_0 data_B_0 data_C_0
16733a4b1be9SMatthew Ahrens  * data_A_1 data_B_0 data_C_0
16743a4b1be9SMatthew Ahrens  * data_A_0 data_B_1 data_C_0
16753a4b1be9SMatthew Ahrens  * data_A_1 data_B_1 data_C_0
16763a4b1be9SMatthew Ahrens  * data_A_0 data_B_0 data_C_1
16773a4b1be9SMatthew Ahrens  * data_A_1 data_B_0 data_C_1
16783a4b1be9SMatthew Ahrens  * data_A_0 data_B_1 data_C_1
16793a4b1be9SMatthew Ahrens  * data_A_1 data_B_1 data_C_1
16803a4b1be9SMatthew Ahrens  *
16813a4b1be9SMatthew Ahrens  * Note that the split segments may be on the same or different top-level
1682a21fe349SBrian Behlendorf  * vdevs. In either case, we may need to try lots of combinations (see
1683a21fe349SBrian Behlendorf  * zfs_reconstruct_indirect_combinations_max).  This ensures that if a mirror
1684a21fe349SBrian Behlendorf  * has small silent errors on all of its children, we can still reconstruct
1685a21fe349SBrian Behlendorf  * the correct data, as long as those errors are at sufficiently-separated
16863a4b1be9SMatthew Ahrens  * offsets (specifically, separated by the largest block size - default of
16873a4b1be9SMatthew Ahrens  * 128KB, but up to 16MB).
16883a4b1be9SMatthew Ahrens  */
16893a4b1be9SMatthew Ahrens static void
16903a4b1be9SMatthew Ahrens vdev_indirect_reconstruct_io_done(zio_t *zio)
16913a4b1be9SMatthew Ahrens {
16923a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
1693a21fe349SBrian Behlendorf 	boolean_t known_good = B_FALSE;
1694a21fe349SBrian Behlendorf 	int error;
1695a21fe349SBrian Behlendorf 
1696a21fe349SBrian Behlendorf 	iv->iv_unique_combinations = 1;
1697a21fe349SBrian Behlendorf 	iv->iv_attempts_max = UINT64_MAX;
1698a21fe349SBrian Behlendorf 
1699a21fe349SBrian Behlendorf 	if (zfs_reconstruct_indirect_combinations_max > 0)
1700a21fe349SBrian Behlendorf 		iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
1701a21fe349SBrian Behlendorf 
1702a21fe349SBrian Behlendorf 	/*
1703a21fe349SBrian Behlendorf 	 * If nonzero, every 1/x blocks will be damaged, in order to validate
1704a21fe349SBrian Behlendorf 	 * reconstruction when there are split segments with damaged copies.
1705a21fe349SBrian Behlendorf 	 * known_good will be TRUE when reconstruction is known to be possible.
1706a21fe349SBrian Behlendorf 	 */
1707a21fe349SBrian Behlendorf 	if (zfs_reconstruct_indirect_damage_fraction != 0 &&
1708a21fe349SBrian Behlendorf 	    spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
1709a21fe349SBrian Behlendorf 		known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
17103a4b1be9SMatthew Ahrens 
1711a21fe349SBrian Behlendorf 	/*
1712a21fe349SBrian Behlendorf 	 * Determine the unique children for a split segment and add them
1713a21fe349SBrian Behlendorf 	 * to the is_unique_child list.  By restricting reconstruction
1714a21fe349SBrian Behlendorf 	 * to these children, only unique combinations will be considered.
1715a21fe349SBrian Behlendorf 	 * This can vastly reduce the search space when there are a large
1716a21fe349SBrian Behlendorf 	 * number of indirect splits.
1717a21fe349SBrian Behlendorf 	 */
17183a4b1be9SMatthew Ahrens 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1719a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1720a21fe349SBrian Behlendorf 		is->is_unique_children = 0;
17213a4b1be9SMatthew Ahrens 
1722a21fe349SBrian Behlendorf 		for (int i = 0; i < is->is_children; i++) {
1723a21fe349SBrian Behlendorf 			indirect_child_t *ic_i = &is->is_child[i];
17243a4b1be9SMatthew Ahrens 
1725a21fe349SBrian Behlendorf 			if (ic_i->ic_data == NULL ||
1726a21fe349SBrian Behlendorf 			    ic_i->ic_duplicate != NULL)
1727a21fe349SBrian Behlendorf 				continue;
17283a4b1be9SMatthew Ahrens 
1729a21fe349SBrian Behlendorf 			for (int j = i + 1; j < is->is_children; j++) {
1730a21fe349SBrian Behlendorf 				indirect_child_t *ic_j = &is->is_child[j];
17313a4b1be9SMatthew Ahrens 
1732a21fe349SBrian Behlendorf 				if (ic_j->ic_data == NULL ||
1733a21fe349SBrian Behlendorf 				    ic_j->ic_duplicate != NULL)
1734a21fe349SBrian Behlendorf 					continue;
17353a4b1be9SMatthew Ahrens 
1736a21fe349SBrian Behlendorf 				if (abd_cmp(ic_i->ic_data, ic_j->ic_data,
1737a21fe349SBrian Behlendorf 				    is->is_size) == 0) {
1738a21fe349SBrian Behlendorf 					ic_j->ic_duplicate = ic_i;
17393a4b1be9SMatthew Ahrens 				}
17403a4b1be9SMatthew Ahrens 			}
1741a21fe349SBrian Behlendorf 
1742a21fe349SBrian Behlendorf 			is->is_unique_children++;
1743a21fe349SBrian Behlendorf 			list_insert_tail(&is->is_unique_child, ic_i);
17443a4b1be9SMatthew Ahrens 		}
1745a21fe349SBrian Behlendorf 
1746a21fe349SBrian Behlendorf 		/* Reconstruction is impossible, no valid children */
1747a21fe349SBrian Behlendorf 		EQUIV(list_is_empty(&is->is_unique_child),
1748a21fe349SBrian Behlendorf 		    is->is_unique_children == 0);
1749a21fe349SBrian Behlendorf 		if (list_is_empty(&is->is_unique_child)) {
1750a21fe349SBrian Behlendorf 			zio->io_error = EIO;
17513a4b1be9SMatthew Ahrens 			vdev_indirect_all_checksum_errors(zio);
17523a4b1be9SMatthew Ahrens 			zio_checksum_verified(zio);
17533a4b1be9SMatthew Ahrens 			return;
17543a4b1be9SMatthew Ahrens 		}
1755a21fe349SBrian Behlendorf 
1756a21fe349SBrian Behlendorf 		iv->iv_unique_combinations *= is->is_unique_children;
1757a21fe349SBrian Behlendorf 	}
1758a21fe349SBrian Behlendorf 
1759a21fe349SBrian Behlendorf 	if (iv->iv_unique_combinations <= iv->iv_attempts_max)
1760a21fe349SBrian Behlendorf 		error = vdev_indirect_splits_enumerate_all(iv, zio);
1761a21fe349SBrian Behlendorf 	else
1762a21fe349SBrian Behlendorf 		error = vdev_indirect_splits_enumerate_randomly(iv, zio);
1763a21fe349SBrian Behlendorf 
1764a21fe349SBrian Behlendorf 	if (error != 0) {
1765a21fe349SBrian Behlendorf 		/* All attempted combinations failed. */
1766a21fe349SBrian Behlendorf 		ASSERT3B(known_good, ==, B_FALSE);
1767a21fe349SBrian Behlendorf 		zio->io_error = error;
1768a21fe349SBrian Behlendorf 		vdev_indirect_all_checksum_errors(zio);
1769a21fe349SBrian Behlendorf 	} else {
1770a21fe349SBrian Behlendorf 		/*
1771a21fe349SBrian Behlendorf 		 * The checksum has been successfully validated.  Issue
1772a21fe349SBrian Behlendorf 		 * repair I/Os to any copies of splits which don't match
1773a21fe349SBrian Behlendorf 		 * the validated version.
1774a21fe349SBrian Behlendorf 		 */
1775a21fe349SBrian Behlendorf 		ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
1776a21fe349SBrian Behlendorf 		vdev_indirect_repair(zio);
1777a21fe349SBrian Behlendorf 		zio_checksum_verified(zio);
17783a4b1be9SMatthew Ahrens 	}
17793a4b1be9SMatthew Ahrens }
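
/*
 * Worked example, for illustration: if a split's three children read
 * back data A, A, and B, the duplicate-detection loop above marks the
 * second child as a duplicate of the first (ic_duplicate), leaving two
 * unique children.  With four such splits the search space drops from
 * 3^4 = 81 raw combinations to 2^4 = 16 unique ones.
 */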
17803a4b1be9SMatthew Ahrens 
17813a4b1be9SMatthew Ahrens static void
17823a4b1be9SMatthew Ahrens vdev_indirect_io_done(zio_t *zio)
17833a4b1be9SMatthew Ahrens {
17843a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
17853a4b1be9SMatthew Ahrens 
17863a4b1be9SMatthew Ahrens 	if (iv->iv_reconstruct) {
17873a4b1be9SMatthew Ahrens 		/*
17883a4b1be9SMatthew Ahrens 		 * We have read all copies of the data (e.g. from mirrors),
17893a4b1be9SMatthew Ahrens 		 * either because this was a scrub/resilver, or because the
17903a4b1be9SMatthew Ahrens 		 * one-copy read didn't checksum correctly.
17913a4b1be9SMatthew Ahrens 		 */
17923a4b1be9SMatthew Ahrens 		vdev_indirect_reconstruct_io_done(zio);
17933a4b1be9SMatthew Ahrens 		return;
17943a4b1be9SMatthew Ahrens 	}
17953a4b1be9SMatthew Ahrens 
17963a4b1be9SMatthew Ahrens 	if (!iv->iv_split_block) {
17973a4b1be9SMatthew Ahrens 		/*
17983a4b1be9SMatthew Ahrens 		 * This was not a split block, so we passed the BP down,
17993a4b1be9SMatthew Ahrens 		 * and the checksum was handled by the (one) child zio.
18003a4b1be9SMatthew Ahrens 		 */
18013a4b1be9SMatthew Ahrens 		return;
18023a4b1be9SMatthew Ahrens 	}
18033a4b1be9SMatthew Ahrens 
18043a4b1be9SMatthew Ahrens 	zio_bad_cksum_t zbc;
18053a4b1be9SMatthew Ahrens 	int ret = zio_checksum_error(zio, &zbc);
18063a4b1be9SMatthew Ahrens 	if (ret == 0) {
18073a4b1be9SMatthew Ahrens 		zio_checksum_verified(zio);
18083a4b1be9SMatthew Ahrens 		return;
18093a4b1be9SMatthew Ahrens 	}
18103a4b1be9SMatthew Ahrens 
18113a4b1be9SMatthew Ahrens 	/*
18123a4b1be9SMatthew Ahrens 	 * The checksum didn't match.  Read all copies of all splits, and
18133a4b1be9SMatthew Ahrens 	 * then we will try to reconstruct.  The next time
18143a4b1be9SMatthew Ahrens 	 * vdev_indirect_io_done() is called, iv_reconstruct will be set.
18153a4b1be9SMatthew Ahrens 	 */
18163a4b1be9SMatthew Ahrens 	vdev_indirect_read_all(zio);
18173a4b1be9SMatthew Ahrens 
18183a4b1be9SMatthew Ahrens 	zio_vdev_io_redone(zio);
18193a4b1be9SMatthew Ahrens }
18203a4b1be9SMatthew Ahrens 
18215cabbc6bSPrashanth Sreenivasa vdev_ops_t vdev_indirect_ops = {
1822a3874b8bSToomas Soome 	.vdev_op_open = vdev_indirect_open,
1823a3874b8bSToomas Soome 	.vdev_op_close = vdev_indirect_close,
1824a3874b8bSToomas Soome 	.vdev_op_asize = vdev_default_asize,
1825a3874b8bSToomas Soome 	.vdev_op_io_start = vdev_indirect_io_start,
1826a3874b8bSToomas Soome 	.vdev_op_io_done = vdev_indirect_io_done,
1827a3874b8bSToomas Soome 	.vdev_op_state_change = NULL,
1828a3874b8bSToomas Soome 	.vdev_op_need_resilver = NULL,
1829a3874b8bSToomas Soome 	.vdev_op_hold = NULL,
1830a3874b8bSToomas Soome 	.vdev_op_rele = NULL,
1831a3874b8bSToomas Soome 	.vdev_op_remap = vdev_indirect_remap,
1832a3874b8bSToomas Soome 	.vdev_op_xlate = NULL,
1833a3874b8bSToomas Soome 	.vdev_op_type = VDEV_TYPE_INDIRECT,	/* name of this vdev type */
1834a3874b8bSToomas Soome 	.vdev_op_leaf = B_FALSE			/* leaf vdev */
18355cabbc6bSPrashanth Sreenivasa };
1836