/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed.  Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA.  Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location.  As a consequence, not all DVAs can be
 * translated to an equivalent new DVA.  Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location.  This function is used in multiple ways:
 *
 *  - i/os to this vdev use the callback to determine where the
 *    data is now located, and issue child i/os for each segment's new
 *    location.
 *
 *  - frees and claims to this vdev use the callback to free or claim
 *    each mapped segment.  (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */
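
/*
 * Illustrative sketch (not part of the driver logic): the remap operation
 * described above is implemented by vdev_indirect_remap() later in this
 * file.  A caller that only needs to know where each piece of a remapped
 * DVA now lives supplies a small callback that is invoked once per
 * contiguous segment; the callback shape shown below is an approximation
 * for illustration only (see the actual prototype where it is defined):
 *
 *	static void
 *	print_segment(uint64_t split_offset, vdev_t *vd, uint64_t offset,
 *	    uint64_t size, void *arg)
 *	{
 *		zfs_dbgmsg("+%llu: vdev %llu offset %llu size %llu",
 *		    (u_longlong_t)split_offset, (u_longlong_t)vd->vdev_id,
 *		    (u_longlong_t)offset, (u_longlong_t)size);
 *	}
 *
 * A DVA that was split across two destinations therefore produces two
 * callback invocations, one per contiguous segment.
 */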

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
 *    in the pool that need to be marked obsolete.  When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *   references; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */
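
/*
 * Worked example of the flow described above: suppose a block with a DVA
 * on indirect vdev 2 is remapped (dsl_dataset_block_remapped()) while a
 * previous snapshot still references it.  The old DVA goes on the
 * dataset's ds_remap_deadlist.  When the last snapshot referencing it is
 * destroyed, the entry moves to the pool's dp_obsolete_bpobj, and a later
 * spa sync turns it into an alloc record in vdev 2's
 * vic_obsolete_sm_object.  The next condense of vdev 2 folds that record
 * into the corresponding vimp_counts_object entry; once that count equals
 * the mapped length of the entry, the entry is omitted from the new
 * (condensed) mapping.
 */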

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts.  Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */
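
/*
 * For example, if the system crashes after the new mapping already
 * contains entries covering old-device offsets [0, X), then on the next
 * import we redo step 2 (rebuild the precise counts from the now
 * read-only previous obsolete space map) and resume step 3 at the
 * old-mapping entry for offset X; the code below finds that entry with
 * vdev_indirect_mapping_entry_for_offset_or_next().
 */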

boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete.  With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses (since 0.75^16 is approximately 0.01).  Higher values
 * will condense less often (causing less i/o); lower values will
 * reduce the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically).  This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory.  The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly).  If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;

/*
 * If an indirect split block contains more than this many possible unique
 * combinations when being reconstructed, consider it too computationally
 * expensive to check them all.  Instead, try at most
 * zfs_reconstruct_indirect_combinations_max randomly-selected combinations
 * each time the block is accessed.  This allows all segment copies to
 * participate fairly in the reconstruction when all combinations
 * cannot be checked and prevents repeated use of one bad copy.
 */
int zfs_reconstruct_indirect_combinations_max = 256;
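
/*
 * For example, a block whose mapping was split into 5 segments, each of
 * which now lands on a 2-way mirror with two distinct copies, has
 * 2^5 = 32 unique combinations, so all of them can be checked.  A block
 * split into 12 such segments has 2^12 = 4096 combinations, which exceeds
 * the default limit of 256, so each access instead samples 256
 * randomly-chosen combinations.
 */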


/*
 * Enable to simulate damaged segments and validate reconstruction.
 * Used by ztest.
 */
unsigned long zfs_reconstruct_indirect_damage_fraction = 0;

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	abd_t *ic_data;
	vdev_t *ic_vdev;

	/*
	 * ic_duplicate is NULL when the ic_data contents are unique; when it
	 * is determined to be a duplicate, it references the primary child.
	 */
	struct indirect_child *ic_duplicate;
	list_node_t ic_node; /* node on is_unique_child */
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size;
	int is_children; /* number of entries in is_child[] */
	int is_unique_children; /* number of entries in is_unique_child */
	list_t is_unique_child;

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	indirect_child_t *is_good_child;

	indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;

/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;
	uint64_t iv_unique_combinations;
	uint64_t iv_attempts;
	uint64_t iv_attempts_max;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;
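
/*
 * Illustrative layout of the three structures above for one read zio
 * whose DVA was split into two segments, each remapped onto a 2-way
 * mirror:
 *
 *	zio->io_vsd -> indirect_vsd_t
 *	    iv_splits -> indirect_split_t #0 (is_split_offset = 0)
 *	                   is_children = 2 (one indirect_child_t per copy)
 *	                 indirect_split_t #1 (is_split_offset = size of #0)
 *	                   is_children = 2
 *
 * A block that is not split and whose destination is not a mirror would
 * have a single indirect_split_t with is_children == 1.
 */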

static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;

	indirect_split_t *is;
	while ((is = list_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			if (ic->ic_data != NULL)
				abd_free(ic->ic_data);
		}
		list_remove(&iv->iv_splits, is);

		indirect_child_t *ic;
		while ((ic = list_head(&is->is_unique_child)) != NULL)
			list_remove(&is->is_unique_child, ic);

		list_destroy(&is->is_unique_child);

		kmem_free(is,
		    offsetof(indirect_split_t, is_child[is->is_children]));
	}
	kmem_free(iv, sizeof (*iv));
}

static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
	vdev_indirect_map_free,
	zio_vsd_default_cksum_report
};
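
/*
 * A minimal sketch of how this VSD gets attached to a zio (the actual
 * wiring lives in the indirect vdev's io_start path later in this file;
 * io_vsd and io_vsd_ops are the generic zio hooks):
 *
 *	indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
 *	list_create(&iv->iv_splits, sizeof (indirect_split_t),
 *	    offsetof(indirect_split_t, is_node));
 *	zio->io_vsd = iv;
 *	zio->io_vsd_ops = &vdev_indirect_vsd_ops;
 *
 * With the ops table above installed, vdev_indirect_map_free() runs when
 * the zio is finished with its VSD, freeing each split along with any
 * per-child data buffers.
 */
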
/*
 * Mark the given offset and size as being obsolete.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
	}
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size);
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}
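
/*
 * With the default tunables, for example, a mapping covering 10GB of
 * which 3GB has become obsolete (30% >= zfs_indirect_condense_obsolete_pct)
 * will be condensed provided the mapping itself is larger than
 * zfs_condense_min_mapping_bytes (128KB); independently, an obsolete
 * space map whose on-disk length has grown past
 * zfs_condense_max_obsolete_bytes (1GB) triggers a condense even if the
 * obsolete percentage is still small.
 */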

/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping.  The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
}

/* ARGSUSED */
static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;

	return (spa->spa_condensing_indirect != NULL);
}

/* ARGSUSED */
static void
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping.  Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. _entry_for_offset()
		 * returns a pointer into the vim_entries array. If
		 * max_offset is greater than any of the mappings
		 * contained in the table, NULL will be returned and
		 * that indicates we've exhausted our iteration of the
		 * old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts,
	    start_index, zthr);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * If the zthr has received a cancellation signal while running
	 * in generate_new_mapping() or at any point after that, then bail
	 * early. We don't want to complete the condense if the spa is
	 * shutting down.
	 */
	if (zthr_iscancelled(zthr))
		return;

	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
	    spa_condense_indirect_complete_sync, sci, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
7485cabbc6bSPrashanth Sreenivasa 
7495cabbc6bSPrashanth Sreenivasa /*
7505cabbc6bSPrashanth Sreenivasa  * Sync task to begin the condensing process.
7515cabbc6bSPrashanth Sreenivasa  */
7525cabbc6bSPrashanth Sreenivasa void
spa_condense_indirect_start_sync(vdev_t * vd,dmu_tx_t * tx)7535cabbc6bSPrashanth Sreenivasa spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
7545cabbc6bSPrashanth Sreenivasa {
7555cabbc6bSPrashanth Sreenivasa 	spa_t *spa = vd->vdev_spa;
7565cabbc6bSPrashanth Sreenivasa 	spa_condensing_indirect_phys_t *scip =
7575cabbc6bSPrashanth Sreenivasa 	    &spa->spa_condensing_indirect_phys;
7585cabbc6bSPrashanth Sreenivasa 
7595cabbc6bSPrashanth Sreenivasa 	ASSERT0(scip->scip_next_mapping_object);
7605cabbc6bSPrashanth Sreenivasa 	ASSERT0(scip->scip_prev_obsolete_sm_object);
7615cabbc6bSPrashanth Sreenivasa 	ASSERT0(scip->scip_vdev);
7625cabbc6bSPrashanth Sreenivasa 	ASSERT(dmu_tx_is_syncing(tx));
7635cabbc6bSPrashanth Sreenivasa 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
7645cabbc6bSPrashanth Sreenivasa 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
7655cabbc6bSPrashanth Sreenivasa 	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
7665cabbc6bSPrashanth Sreenivasa 
7675cabbc6bSPrashanth Sreenivasa 	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
7685cabbc6bSPrashanth Sreenivasa 	ASSERT(obsolete_sm_obj != 0);
7695cabbc6bSPrashanth Sreenivasa 
7705cabbc6bSPrashanth Sreenivasa 	scip->scip_vdev = vd->vdev_id;
7715cabbc6bSPrashanth Sreenivasa 	scip->scip_next_mapping_object =
7725cabbc6bSPrashanth Sreenivasa 	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
7735cabbc6bSPrashanth Sreenivasa 
7745cabbc6bSPrashanth Sreenivasa 	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
7755cabbc6bSPrashanth Sreenivasa 
7765cabbc6bSPrashanth Sreenivasa 	/*
7775cabbc6bSPrashanth Sreenivasa 	 * We don't need to allocate a new space map object, since
7785cabbc6bSPrashanth Sreenivasa 	 * vdev_indirect_sync_obsolete will allocate one when needed.
7795cabbc6bSPrashanth Sreenivasa 	 */
7805cabbc6bSPrashanth Sreenivasa 	space_map_close(vd->vdev_obsolete_sm);
7815cabbc6bSPrashanth Sreenivasa 	vd->vdev_obsolete_sm = NULL;
7825cabbc6bSPrashanth Sreenivasa 	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
7835cabbc6bSPrashanth Sreenivasa 	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
7845cabbc6bSPrashanth Sreenivasa 
7855cabbc6bSPrashanth Sreenivasa 	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
7865cabbc6bSPrashanth Sreenivasa 	    DMU_POOL_DIRECTORY_OBJECT,
7875cabbc6bSPrashanth Sreenivasa 	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
7885cabbc6bSPrashanth Sreenivasa 	    sizeof (*scip) / sizeof (uint64_t), scip, tx));
7895cabbc6bSPrashanth Sreenivasa 
7905cabbc6bSPrashanth Sreenivasa 	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
7915cabbc6bSPrashanth Sreenivasa 	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
7925cabbc6bSPrashanth Sreenivasa 
7935cabbc6bSPrashanth Sreenivasa 	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
7945cabbc6bSPrashanth Sreenivasa 	    "posm=%llu nm=%llu",
7955cabbc6bSPrashanth Sreenivasa 	    vd->vdev_id, dmu_tx_get_txg(tx),
7965cabbc6bSPrashanth Sreenivasa 	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
7975cabbc6bSPrashanth Sreenivasa 	    (u_longlong_t)scip->scip_next_mapping_object);
7985cabbc6bSPrashanth Sreenivasa 
799667ec66fSSerapheim Dimitropoulos 	zthr_wakeup(spa->spa_condense_zthr);
8005cabbc6bSPrashanth Sreenivasa }
8015cabbc6bSPrashanth Sreenivasa 
8025cabbc6bSPrashanth Sreenivasa /*
8035cabbc6bSPrashanth Sreenivasa  * Sync to the given vdev's obsolete space map any segments that are no longer
8045cabbc6bSPrashanth Sreenivasa  * referenced as of the given txg.
8055cabbc6bSPrashanth Sreenivasa  *
8065cabbc6bSPrashanth Sreenivasa  * If the obsolete space map doesn't exist yet, create and open it.
8075cabbc6bSPrashanth Sreenivasa  */
8085cabbc6bSPrashanth Sreenivasa void
vdev_indirect_sync_obsolete(vdev_t * vd,dmu_tx_t * tx)8095cabbc6bSPrashanth Sreenivasa vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
8105cabbc6bSPrashanth Sreenivasa {
8115cabbc6bSPrashanth Sreenivasa 	spa_t *spa = vd->vdev_spa;
8125cabbc6bSPrashanth Sreenivasa 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
8135cabbc6bSPrashanth Sreenivasa 
8145cabbc6bSPrashanth Sreenivasa 	ASSERT3U(vic->vic_mapping_object, !=, 0);
8155cabbc6bSPrashanth Sreenivasa 	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
8165cabbc6bSPrashanth Sreenivasa 	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
8175cabbc6bSPrashanth Sreenivasa 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
8185cabbc6bSPrashanth Sreenivasa 
8195cabbc6bSPrashanth Sreenivasa 	if (vdev_obsolete_sm_object(vd) == 0) {
8205cabbc6bSPrashanth Sreenivasa 		uint64_t obsolete_sm_object =
82186714001SSerapheim Dimitropoulos 		    space_map_alloc(spa->spa_meta_objset,
822814dcd43SSerapheim Dimitropoulos 		    zfs_vdev_standard_sm_blksz, tx);
8235cabbc6bSPrashanth Sreenivasa 
8245cabbc6bSPrashanth Sreenivasa 		ASSERT(vd->vdev_top_zap != 0);
8255cabbc6bSPrashanth Sreenivasa 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
8265cabbc6bSPrashanth Sreenivasa 		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
8275cabbc6bSPrashanth Sreenivasa 		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
8285cabbc6bSPrashanth Sreenivasa 		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
8295cabbc6bSPrashanth Sreenivasa 
8305cabbc6bSPrashanth Sreenivasa 		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
8315cabbc6bSPrashanth Sreenivasa 		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
8325cabbc6bSPrashanth Sreenivasa 		    spa->spa_meta_objset, obsolete_sm_object,
8335cabbc6bSPrashanth Sreenivasa 		    0, vd->vdev_asize, 0));
8345cabbc6bSPrashanth Sreenivasa 	}
8355cabbc6bSPrashanth Sreenivasa 
8365cabbc6bSPrashanth Sreenivasa 	ASSERT(vd->vdev_obsolete_sm != NULL);
8375cabbc6bSPrashanth Sreenivasa 	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
8385cabbc6bSPrashanth Sreenivasa 	    space_map_object(vd->vdev_obsolete_sm));
8395cabbc6bSPrashanth Sreenivasa 
8405cabbc6bSPrashanth Sreenivasa 	space_map_write(vd->vdev_obsolete_sm,
84117f11284SSerapheim Dimitropoulos 	    vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
8425cabbc6bSPrashanth Sreenivasa 	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
8435cabbc6bSPrashanth Sreenivasa }
8445cabbc6bSPrashanth Sreenivasa 
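/*
 * Load any in-progress condense state from the MOS pool directory at pool
 * open time.  If a condense of an indirect vdev mapping was interrupted,
 * recreate the in-core spa_condensing_indirect_t (on writeable pools) so
 * the condense can be resumed.  A missing entry (ENOENT) simply means no
 * condense was in progress.
 */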
8455cabbc6bSPrashanth Sreenivasa int
8465cabbc6bSPrashanth Sreenivasa spa_condense_init(spa_t *spa)
8475cabbc6bSPrashanth Sreenivasa {
8485cabbc6bSPrashanth Sreenivasa 	int error = zap_lookup(spa->spa_meta_objset,
8495cabbc6bSPrashanth Sreenivasa 	    DMU_POOL_DIRECTORY_OBJECT,
8505cabbc6bSPrashanth Sreenivasa 	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
8515cabbc6bSPrashanth Sreenivasa 	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
8525cabbc6bSPrashanth Sreenivasa 	    &spa->spa_condensing_indirect_phys);
8535cabbc6bSPrashanth Sreenivasa 	if (error == 0) {
8545cabbc6bSPrashanth Sreenivasa 		if (spa_writeable(spa)) {
8555cabbc6bSPrashanth Sreenivasa 			spa->spa_condensing_indirect =
8565cabbc6bSPrashanth Sreenivasa 			    spa_condensing_indirect_create(spa);
8575cabbc6bSPrashanth Sreenivasa 		}
8585cabbc6bSPrashanth Sreenivasa 		return (0);
8595cabbc6bSPrashanth Sreenivasa 	} else if (error == ENOENT) {
8605cabbc6bSPrashanth Sreenivasa 		return (0);
8615cabbc6bSPrashanth Sreenivasa 	} else {
8625cabbc6bSPrashanth Sreenivasa 		return (error);
8635cabbc6bSPrashanth Sreenivasa 	}
8645cabbc6bSPrashanth Sreenivasa }
8655cabbc6bSPrashanth Sreenivasa 
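/*
 * Release the in-core condensing state, if any.
 */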
8665cabbc6bSPrashanth Sreenivasa void
8675cabbc6bSPrashanth Sreenivasa spa_condense_fini(spa_t *spa)
8685cabbc6bSPrashanth Sreenivasa {
8695cabbc6bSPrashanth Sreenivasa 	if (spa->spa_condensing_indirect != NULL) {
8705cabbc6bSPrashanth Sreenivasa 		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
8715cabbc6bSPrashanth Sreenivasa 		spa->spa_condensing_indirect = NULL;
8725cabbc6bSPrashanth Sreenivasa 	}
8735cabbc6bSPrashanth Sreenivasa }
8745cabbc6bSPrashanth Sreenivasa 
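/*
 * Create the zthr that condenses indirect vdev mappings in the background.
 * The thread runs spa_condense_indirect_thread() whenever
 * spa_condense_indirect_thread_check() reports that there is work to do.
 */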
8755cabbc6bSPrashanth Sreenivasa void
876667ec66fSSerapheim Dimitropoulos spa_start_indirect_condensing_thread(spa_t *spa)
8775cabbc6bSPrashanth Sreenivasa {
878667ec66fSSerapheim Dimitropoulos 	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
879667ec66fSSerapheim Dimitropoulos 	spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
880667ec66fSSerapheim Dimitropoulos 	    spa_condense_indirect_thread, spa);
8815cabbc6bSPrashanth Sreenivasa }
8825cabbc6bSPrashanth Sreenivasa 
8835cabbc6bSPrashanth Sreenivasa /*
8845cabbc6bSPrashanth Sreenivasa  * Gets the obsolete spacemap object from the vdev's ZAP.
8855cabbc6bSPrashanth Sreenivasa  * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
8865cabbc6bSPrashanth Sreenivasa  * exist yet.
8875cabbc6bSPrashanth Sreenivasa  */
8885cabbc6bSPrashanth Sreenivasa int
8895cabbc6bSPrashanth Sreenivasa vdev_obsolete_sm_object(vdev_t *vd)
8905cabbc6bSPrashanth Sreenivasa {
8915cabbc6bSPrashanth Sreenivasa 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
8925cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_top_zap == 0) {
8935cabbc6bSPrashanth Sreenivasa 		return (0);
8945cabbc6bSPrashanth Sreenivasa 	}
8955cabbc6bSPrashanth Sreenivasa 
8965cabbc6bSPrashanth Sreenivasa 	uint64_t sm_obj = 0;
8975cabbc6bSPrashanth Sreenivasa 	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
8985cabbc6bSPrashanth Sreenivasa 	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
8995cabbc6bSPrashanth Sreenivasa 
9005cabbc6bSPrashanth Sreenivasa 	ASSERT(err == 0 || err == ENOENT);
9015cabbc6bSPrashanth Sreenivasa 
9025cabbc6bSPrashanth Sreenivasa 	return (sm_obj);
9035cabbc6bSPrashanth Sreenivasa }
9045cabbc6bSPrashanth Sreenivasa 
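/*
 * Returns whether the obsolete counts for this vdev are precise, as recorded
 * by the VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE entry in the vdev's
 * top-level ZAP.  Returns B_FALSE if the ZAP (or the entry) doesn't exist.
 */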
9055cabbc6bSPrashanth Sreenivasa boolean_t
9065cabbc6bSPrashanth Sreenivasa vdev_obsolete_counts_are_precise(vdev_t *vd)
9075cabbc6bSPrashanth Sreenivasa {
9085cabbc6bSPrashanth Sreenivasa 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
9095cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_top_zap == 0) {
9105cabbc6bSPrashanth Sreenivasa 		return (B_FALSE);
9115cabbc6bSPrashanth Sreenivasa 	}
9125cabbc6bSPrashanth Sreenivasa 
9135cabbc6bSPrashanth Sreenivasa 	uint64_t val = 0;
9145cabbc6bSPrashanth Sreenivasa 	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
9155cabbc6bSPrashanth Sreenivasa 	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
9165cabbc6bSPrashanth Sreenivasa 
9175cabbc6bSPrashanth Sreenivasa 	ASSERT(err == 0 || err == ENOENT);
9185cabbc6bSPrashanth Sreenivasa 
9195cabbc6bSPrashanth Sreenivasa 	return (val != 0);
9205cabbc6bSPrashanth Sreenivasa }
9215cabbc6bSPrashanth Sreenivasa 
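/*
 * An indirect vdev has no underlying device, so there is nothing to close.
 */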
9225cabbc6bSPrashanth Sreenivasa /* ARGSUSED */
9235cabbc6bSPrashanth Sreenivasa static void
9245cabbc6bSPrashanth Sreenivasa vdev_indirect_close(vdev_t *vd)
9255cabbc6bSPrashanth Sreenivasa {
9265cabbc6bSPrashanth Sreenivasa }
9275cabbc6bSPrashanth Sreenivasa 
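/*
 * There is no leaf device to open; just report the size and ashift recorded
 * in the in-core vdev state.
 */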
9285cabbc6bSPrashanth Sreenivasa /* ARGSUSED */
9295cabbc6bSPrashanth Sreenivasa static int
9305cabbc6bSPrashanth Sreenivasa vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
9315cabbc6bSPrashanth Sreenivasa     uint64_t *ashift)
9325cabbc6bSPrashanth Sreenivasa {
9335cabbc6bSPrashanth Sreenivasa 	*psize = *max_psize = vd->vdev_asize +
9345cabbc6bSPrashanth Sreenivasa 	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
9355cabbc6bSPrashanth Sreenivasa 	*ashift = vd->vdev_ashift;
9365cabbc6bSPrashanth Sreenivasa 	return (0);
9375cabbc6bSPrashanth Sreenivasa }
9385cabbc6bSPrashanth Sreenivasa 
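/*
 * A remap_segment_t describes a portion of an extent that still needs to be
 * remapped.  vdev_indirect_remap() keeps a stack (list) of these while
 * walking through chained indirect mappings.
 */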
9395cabbc6bSPrashanth Sreenivasa typedef struct remap_segment {
9405cabbc6bSPrashanth Sreenivasa 	vdev_t *rs_vd;
9415cabbc6bSPrashanth Sreenivasa 	uint64_t rs_offset;
9425cabbc6bSPrashanth Sreenivasa 	uint64_t rs_asize;
9435cabbc6bSPrashanth Sreenivasa 	uint64_t rs_split_offset;
9445cabbc6bSPrashanth Sreenivasa 	list_node_t rs_node;
9455cabbc6bSPrashanth Sreenivasa } remap_segment_t;
9465cabbc6bSPrashanth Sreenivasa 
9475cabbc6bSPrashanth Sreenivasa remap_segment_t *
9485cabbc6bSPrashanth Sreenivasa rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
9495cabbc6bSPrashanth Sreenivasa {
9505cabbc6bSPrashanth Sreenivasa 	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
9515cabbc6bSPrashanth Sreenivasa 	rs->rs_vd = vd;
9525cabbc6bSPrashanth Sreenivasa 	rs->rs_offset = offset;
9535cabbc6bSPrashanth Sreenivasa 	rs->rs_asize = asize;
9545cabbc6bSPrashanth Sreenivasa 	rs->rs_split_offset = split_offset;
9555cabbc6bSPrashanth Sreenivasa 	return (rs);
9565cabbc6bSPrashanth Sreenivasa }
9575cabbc6bSPrashanth Sreenivasa 
958bdfded42SSerapheim Dimitropoulos /*
959bdfded42SSerapheim Dimitropoulos  * Given an indirect vdev and an extent on that vdev, this function duplicates
960bdfded42SSerapheim Dimitropoulos  * the physical entries of the indirect mapping that correspond to the extent
961bdfded42SSerapheim Dimitropoulos  * into a new array and returns a pointer to it. In addition, copied_entries
962bdfded42SSerapheim Dimitropoulos  * is populated with the number of mapping entries that were duplicated.
963bdfded42SSerapheim Dimitropoulos  *
964bdfded42SSerapheim Dimitropoulos  * Note that the function assumes that the caller holds vdev_indirect_rwlock.
965bdfded42SSerapheim Dimitropoulos  * This ensures that the mapping won't change due to condensing as we
966bdfded42SSerapheim Dimitropoulos  * copy over its contents.
967bdfded42SSerapheim Dimitropoulos  *
968bdfded42SSerapheim Dimitropoulos  * Finally, since we are doing an allocation, it is up to the caller to
969bdfded42SSerapheim Dimitropoulos  * free the array allocated in this function.
970bdfded42SSerapheim Dimitropoulos  */
971bdfded42SSerapheim Dimitropoulos vdev_indirect_mapping_entry_phys_t *
972bdfded42SSerapheim Dimitropoulos vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
973bdfded42SSerapheim Dimitropoulos     uint64_t asize, uint64_t *copied_entries)
974bdfded42SSerapheim Dimitropoulos {
975bdfded42SSerapheim Dimitropoulos 	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
976bdfded42SSerapheim Dimitropoulos 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
977bdfded42SSerapheim Dimitropoulos 	uint64_t entries = 0;
978bdfded42SSerapheim Dimitropoulos 
979bdfded42SSerapheim Dimitropoulos 	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
980bdfded42SSerapheim Dimitropoulos 
981bdfded42SSerapheim Dimitropoulos 	vdev_indirect_mapping_entry_phys_t *first_mapping =
982bdfded42SSerapheim Dimitropoulos 	    vdev_indirect_mapping_entry_for_offset(vim, offset);
983bdfded42SSerapheim Dimitropoulos 	ASSERT3P(first_mapping, !=, NULL);
984bdfded42SSerapheim Dimitropoulos 
985bdfded42SSerapheim Dimitropoulos 	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
986bdfded42SSerapheim Dimitropoulos 	while (asize > 0) {
987bdfded42SSerapheim Dimitropoulos 		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
988bdfded42SSerapheim Dimitropoulos 
989bdfded42SSerapheim Dimitropoulos 		ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
990bdfded42SSerapheim Dimitropoulos 		ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
991bdfded42SSerapheim Dimitropoulos 
992bdfded42SSerapheim Dimitropoulos 		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
993bdfded42SSerapheim Dimitropoulos 		uint64_t inner_size = MIN(asize, size - inner_offset);
994bdfded42SSerapheim Dimitropoulos 
995bdfded42SSerapheim Dimitropoulos 		offset += inner_size;
996bdfded42SSerapheim Dimitropoulos 		asize -= inner_size;
997bdfded42SSerapheim Dimitropoulos 		entries++;
998bdfded42SSerapheim Dimitropoulos 		m++;
999bdfded42SSerapheim Dimitropoulos 	}
1000bdfded42SSerapheim Dimitropoulos 
1001bdfded42SSerapheim Dimitropoulos 	size_t copy_length = entries * sizeof (*first_mapping);
1002bdfded42SSerapheim Dimitropoulos 	duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
1003bdfded42SSerapheim Dimitropoulos 	bcopy(first_mapping, duplicate_mappings, copy_length);
1004bdfded42SSerapheim Dimitropoulos 	*copied_entries = entries;
1005bdfded42SSerapheim Dimitropoulos 
1006bdfded42SSerapheim Dimitropoulos 	return (duplicate_mappings);
1007bdfded42SSerapheim Dimitropoulos }
1008bdfded42SSerapheim Dimitropoulos 
10095cabbc6bSPrashanth Sreenivasa /*
10105cabbc6bSPrashanth Sreenivasa  * Goes through the relevant indirect mappings until it hits a concrete vdev
10115cabbc6bSPrashanth Sreenivasa  * and issues the callback. On the way to the concrete vdev, if any other
10125cabbc6bSPrashanth Sreenivasa  * indirect vdevs are encountered, then the callback will also be called on
10135cabbc6bSPrashanth Sreenivasa  * each of those indirect vdevs. For example, if the segment is mapped to
10145cabbc6bSPrashanth Sreenivasa  * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
10155cabbc6bSPrashanth Sreenivasa  * mapped to segment B on concrete vdev 2, then the callback will be called on
10165cabbc6bSPrashanth Sreenivasa  * both vdev 1 and vdev 2.
10175cabbc6bSPrashanth Sreenivasa  *
10185cabbc6bSPrashanth Sreenivasa  * While the callback passed to vdev_indirect_remap() is called on every vdev
10195cabbc6bSPrashanth Sreenivasa  * the function encounters, certain callbacks only care about concrete vdevs.
10205cabbc6bSPrashanth Sreenivasa  * These types of callbacks should return immediately and explicitly when they
10215cabbc6bSPrashanth Sreenivasa  * are called on an indirect vdev.
10225cabbc6bSPrashanth Sreenivasa  *
10235cabbc6bSPrashanth Sreenivasa  * Because there is a possibility that a DVA section in the indirect device
10245cabbc6bSPrashanth Sreenivasa  * has been split into multiple sections in our mapping, we keep track
10255cabbc6bSPrashanth Sreenivasa  * of the relevant contiguous segments of the new location (remap_segment_t)
10265cabbc6bSPrashanth Sreenivasa  * in a stack. This way we can call the callback for each of the new sections
10275cabbc6bSPrashanth Sreenivasa  * created by a single section of the indirect device. Note though, that in
10285cabbc6bSPrashanth Sreenivasa  * this scenario the callbacks in each split block won't occur in-order in
10295cabbc6bSPrashanth Sreenivasa  * terms of offset, so callers should not make any assumptions about that.
10305cabbc6bSPrashanth Sreenivasa  *
10315cabbc6bSPrashanth Sreenivasa  * For callbacks that don't handle split blocks and immediately return when
10325cabbc6bSPrashanth Sreenivasa  * they encounter them (as is the case for remap_blkptr_cb), the caller can
10335cabbc6bSPrashanth Sreenivasa  * assume that its callback will be applied from the first indirect vdev
10345cabbc6bSPrashanth Sreenivasa  * encountered to the last one and then the concrete vdev, in that order.
10355cabbc6bSPrashanth Sreenivasa  */
10365cabbc6bSPrashanth Sreenivasa static void
10375cabbc6bSPrashanth Sreenivasa vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
10385cabbc6bSPrashanth Sreenivasa     void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
10395cabbc6bSPrashanth Sreenivasa {
10405cabbc6bSPrashanth Sreenivasa 	list_t stack;
10415cabbc6bSPrashanth Sreenivasa 	spa_t *spa = vd->vdev_spa;
10425cabbc6bSPrashanth Sreenivasa 
10435cabbc6bSPrashanth Sreenivasa 	list_create(&stack, sizeof (remap_segment_t),
10445cabbc6bSPrashanth Sreenivasa 	    offsetof(remap_segment_t, rs_node));
10455cabbc6bSPrashanth Sreenivasa 
10465cabbc6bSPrashanth Sreenivasa 	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
10475cabbc6bSPrashanth Sreenivasa 	    rs != NULL; rs = list_remove_head(&stack)) {
10485cabbc6bSPrashanth Sreenivasa 		vdev_t *v = rs->rs_vd;
1049bdfded42SSerapheim Dimitropoulos 		uint64_t num_entries = 0;
1050bdfded42SSerapheim Dimitropoulos 
1051bdfded42SSerapheim Dimitropoulos 		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1052bdfded42SSerapheim Dimitropoulos 		ASSERT(rs->rs_asize > 0);
10535cabbc6bSPrashanth Sreenivasa 
10545cabbc6bSPrashanth Sreenivasa 		/*
1055bdfded42SSerapheim Dimitropoulos 		 * Note: As this function can be called from open context
1056bdfded42SSerapheim Dimitropoulos 		 * (e.g. zio_read()), we need the following rwlock to
1057bdfded42SSerapheim Dimitropoulos 		 * prevent the mapping from being changed by condensing.
1058bdfded42SSerapheim Dimitropoulos 		 *
1059bdfded42SSerapheim Dimitropoulos 		 * So we grab the lock and we make a copy of the entries
1060bdfded42SSerapheim Dimitropoulos 		 * that are relevant to the extent that we are working on.
1061bdfded42SSerapheim Dimitropoulos 		 * Once that is done, we drop the lock and iterate over
1062bdfded42SSerapheim Dimitropoulos 		 * our copy of the mapping. Once we are done with the
1063bdfded42SSerapheim Dimitropoulos 		 * remap segment and we free it, we also free our copy
1064bdfded42SSerapheim Dimitropoulos 		 * of the indirect mapping entries that are relevant to it.
1065bdfded42SSerapheim Dimitropoulos 		 *
1066bdfded42SSerapheim Dimitropoulos 		 * This way we don't need to wait until the function is
1067bdfded42SSerapheim Dimitropoulos 		 * finished with a segment, to condense it. In addition, we
1068bdfded42SSerapheim Dimitropoulos 		 * don't need a recursive rwlock for the case that a call to
1069bdfded42SSerapheim Dimitropoulos 		 * vdev_indirect_remap() needs to call itself (through the
1070bdfded42SSerapheim Dimitropoulos 		 * codepath of its callback) for the same vdev in the middle
1071bdfded42SSerapheim Dimitropoulos 		 * of its execution.
10725cabbc6bSPrashanth Sreenivasa 		 */
10735cabbc6bSPrashanth Sreenivasa 		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
10745cabbc6bSPrashanth Sreenivasa 		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
10755cabbc6bSPrashanth Sreenivasa 		ASSERT3P(vim, !=, NULL);
10765cabbc6bSPrashanth Sreenivasa 
10775cabbc6bSPrashanth Sreenivasa 		vdev_indirect_mapping_entry_phys_t *mapping =
1078bdfded42SSerapheim Dimitropoulos 		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
1079bdfded42SSerapheim Dimitropoulos 		    rs->rs_offset, rs->rs_asize, &num_entries);
10805cabbc6bSPrashanth Sreenivasa 		ASSERT3P(mapping, !=, NULL);
1081bdfded42SSerapheim Dimitropoulos 		ASSERT3U(num_entries, >, 0);
1082bdfded42SSerapheim Dimitropoulos 		rw_exit(&v->vdev_indirect_rwlock);
10835cabbc6bSPrashanth Sreenivasa 
1084bdfded42SSerapheim Dimitropoulos 		for (uint64_t i = 0; i < num_entries; i++) {
10855cabbc6bSPrashanth Sreenivasa 			/*
10865cabbc6bSPrashanth Sreenivasa 			 * Note: the vdev_indirect_mapping can not change
10875cabbc6bSPrashanth Sreenivasa 			 * while we are running.  It only changes while the
10885cabbc6bSPrashanth Sreenivasa 			 * removal is in progress, and then only from syncing
10895cabbc6bSPrashanth Sreenivasa 			 * context. While a removal is in progress, this
10905cabbc6bSPrashanth Sreenivasa 			 * function is only called for frees, which also only
10915cabbc6bSPrashanth Sreenivasa 			 * happen from syncing context.
10925cabbc6bSPrashanth Sreenivasa 			 */
1093bdfded42SSerapheim Dimitropoulos 			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
1094bdfded42SSerapheim Dimitropoulos 
1095bdfded42SSerapheim Dimitropoulos 			ASSERT3P(m, !=, NULL);
1096bdfded42SSerapheim Dimitropoulos 			ASSERT3U(rs->rs_asize, >, 0);
10975cabbc6bSPrashanth Sreenivasa 
1098bdfded42SSerapheim Dimitropoulos 			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
1099bdfded42SSerapheim Dimitropoulos 			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
1100bdfded42SSerapheim Dimitropoulos 			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
11015cabbc6bSPrashanth Sreenivasa 
11025cabbc6bSPrashanth Sreenivasa 			ASSERT3U(rs->rs_offset, >=,
1103bdfded42SSerapheim Dimitropoulos 			    DVA_MAPPING_GET_SRC_OFFSET(m));
11045cabbc6bSPrashanth Sreenivasa 			ASSERT3U(rs->rs_offset, <,
1105bdfded42SSerapheim Dimitropoulos 			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
11065cabbc6bSPrashanth Sreenivasa 			ASSERT3U(dst_vdev, !=, v->vdev_id);
11075cabbc6bSPrashanth Sreenivasa 
11085cabbc6bSPrashanth Sreenivasa 			uint64_t inner_offset = rs->rs_offset -
1109bdfded42SSerapheim Dimitropoulos 			    DVA_MAPPING_GET_SRC_OFFSET(m);
11105cabbc6bSPrashanth Sreenivasa 			uint64_t inner_size =
11115cabbc6bSPrashanth Sreenivasa 			    MIN(rs->rs_asize, size - inner_offset);
11125cabbc6bSPrashanth Sreenivasa 
11135cabbc6bSPrashanth Sreenivasa 			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
11145cabbc6bSPrashanth Sreenivasa 			ASSERT3P(dst_v, !=, NULL);
11155cabbc6bSPrashanth Sreenivasa 
11165cabbc6bSPrashanth Sreenivasa 			if (dst_v->vdev_ops == &vdev_indirect_ops) {
11175cabbc6bSPrashanth Sreenivasa 				list_insert_head(&stack,
11185cabbc6bSPrashanth Sreenivasa 				    rs_alloc(dst_v, dst_offset + inner_offset,
11195cabbc6bSPrashanth Sreenivasa 				    inner_size, rs->rs_split_offset));
11205cabbc6bSPrashanth Sreenivasa 
11215cabbc6bSPrashanth Sreenivasa 			}
11225cabbc6bSPrashanth Sreenivasa 
11235cabbc6bSPrashanth Sreenivasa 			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
11245cabbc6bSPrashanth Sreenivasa 			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
11255cabbc6bSPrashanth Sreenivasa 				/*
11265cabbc6bSPrashanth Sreenivasa 				 * Note: This clause exists solely for
11275cabbc6bSPrashanth Sreenivasa 				 * testing purposes. We use it to ensure that
11285cabbc6bSPrashanth Sreenivasa 				 * split blocks work and that the callbacks
11295cabbc6bSPrashanth Sreenivasa 				 * using them yield the same result if issued
11305cabbc6bSPrashanth Sreenivasa 				 * in reverse order.
11315cabbc6bSPrashanth Sreenivasa 				 */
11325cabbc6bSPrashanth Sreenivasa 				uint64_t inner_half = inner_size / 2;
11335cabbc6bSPrashanth Sreenivasa 
11345cabbc6bSPrashanth Sreenivasa 				func(rs->rs_split_offset + inner_half, dst_v,
11355cabbc6bSPrashanth Sreenivasa 				    dst_offset + inner_offset + inner_half,
11365cabbc6bSPrashanth Sreenivasa 				    inner_half, arg);
11375cabbc6bSPrashanth Sreenivasa 
11385cabbc6bSPrashanth Sreenivasa 				func(rs->rs_split_offset, dst_v,
11395cabbc6bSPrashanth Sreenivasa 				    dst_offset + inner_offset,
11405cabbc6bSPrashanth Sreenivasa 				    inner_half, arg);
11415cabbc6bSPrashanth Sreenivasa 			} else {
11425cabbc6bSPrashanth Sreenivasa 				func(rs->rs_split_offset, dst_v,
11435cabbc6bSPrashanth Sreenivasa 				    dst_offset + inner_offset,
11445cabbc6bSPrashanth Sreenivasa 				    inner_size, arg);
11455cabbc6bSPrashanth Sreenivasa 			}
11465cabbc6bSPrashanth Sreenivasa 
11475cabbc6bSPrashanth Sreenivasa 			rs->rs_offset += inner_size;
11485cabbc6bSPrashanth Sreenivasa 			rs->rs_asize -= inner_size;
11495cabbc6bSPrashanth Sreenivasa 			rs->rs_split_offset += inner_size;
11505cabbc6bSPrashanth Sreenivasa 		}
1151bdfded42SSerapheim Dimitropoulos 		VERIFY0(rs->rs_asize);
11525cabbc6bSPrashanth Sreenivasa 
1153bdfded42SSerapheim Dimitropoulos 		kmem_free(mapping, num_entries * sizeof (*mapping));
11545cabbc6bSPrashanth Sreenivasa 		kmem_free(rs, sizeof (remap_segment_t));
11555cabbc6bSPrashanth Sreenivasa 	}
11565cabbc6bSPrashanth Sreenivasa 	list_destroy(&stack);
11575cabbc6bSPrashanth Sreenivasa }
11585cabbc6bSPrashanth Sreenivasa 
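/*
 * Completion callback for the child i/os issued by vdev_indirect_io_start():
 * propagate the worst child error into the parent zio and release the abd
 * that was carved out of the parent's buffer.
 */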
11595cabbc6bSPrashanth Sreenivasa static void
11605cabbc6bSPrashanth Sreenivasa vdev_indirect_child_io_done(zio_t *zio)
11615cabbc6bSPrashanth Sreenivasa {
11625cabbc6bSPrashanth Sreenivasa 	zio_t *pio = zio->io_private;
11635cabbc6bSPrashanth Sreenivasa 
11645cabbc6bSPrashanth Sreenivasa 	mutex_enter(&pio->io_lock);
11655cabbc6bSPrashanth Sreenivasa 	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
11665cabbc6bSPrashanth Sreenivasa 	mutex_exit(&pio->io_lock);
11675cabbc6bSPrashanth Sreenivasa 
11685cabbc6bSPrashanth Sreenivasa 	abd_put(zio->io_abd);
11695cabbc6bSPrashanth Sreenivasa }
11705cabbc6bSPrashanth Sreenivasa 
11713a4b1be9SMatthew Ahrens /*
11723a4b1be9SMatthew Ahrens  * This is a callback for vdev_indirect_remap() which allocates an
11733a4b1be9SMatthew Ahrens  * indirect_split_t for each split segment and adds it to iv_splits.
11743a4b1be9SMatthew Ahrens  */
11755cabbc6bSPrashanth Sreenivasa static void
11763a4b1be9SMatthew Ahrens vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
11775cabbc6bSPrashanth Sreenivasa     uint64_t size, void *arg)
11785cabbc6bSPrashanth Sreenivasa {
11795cabbc6bSPrashanth Sreenivasa 	zio_t *zio = arg;
11803a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
11815cabbc6bSPrashanth Sreenivasa 
11825cabbc6bSPrashanth Sreenivasa 	ASSERT3P(vd, !=, NULL);
11835cabbc6bSPrashanth Sreenivasa 
11845cabbc6bSPrashanth Sreenivasa 	if (vd->vdev_ops == &vdev_indirect_ops)
11855cabbc6bSPrashanth Sreenivasa 		return;
11865cabbc6bSPrashanth Sreenivasa 
11873a4b1be9SMatthew Ahrens 	int n = 1;
11883a4b1be9SMatthew Ahrens 	if (vd->vdev_ops == &vdev_mirror_ops)
11893a4b1be9SMatthew Ahrens 		n = vd->vdev_children;
11903a4b1be9SMatthew Ahrens 
11913a4b1be9SMatthew Ahrens 	indirect_split_t *is =
11923a4b1be9SMatthew Ahrens 	    kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
11933a4b1be9SMatthew Ahrens 
11943a4b1be9SMatthew Ahrens 	is->is_children = n;
11953a4b1be9SMatthew Ahrens 	is->is_size = size;
11963a4b1be9SMatthew Ahrens 	is->is_split_offset = split_offset;
11973a4b1be9SMatthew Ahrens 	is->is_target_offset = offset;
11983a4b1be9SMatthew Ahrens 	is->is_vdev = vd;
1199a21fe349SBrian Behlendorf 	list_create(&is->is_unique_child, sizeof (indirect_child_t),
1200a21fe349SBrian Behlendorf 	    offsetof(indirect_child_t, ic_node));
12013a4b1be9SMatthew Ahrens 
12023a4b1be9SMatthew Ahrens 	/*
12033a4b1be9SMatthew Ahrens 	 * Note that we only consider multiple copies of the data for
12043a4b1be9SMatthew Ahrens 	 * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
12053a4b1be9SMatthew Ahrens 	 * though they use the same ops as mirror, because there's only one
12063a4b1be9SMatthew Ahrens 	 * "good" copy under the replacing/spare.
12073a4b1be9SMatthew Ahrens 	 */
12083a4b1be9SMatthew Ahrens 	if (vd->vdev_ops == &vdev_mirror_ops) {
12093a4b1be9SMatthew Ahrens 		for (int i = 0; i < n; i++) {
12103a4b1be9SMatthew Ahrens 			is->is_child[i].ic_vdev = vd->vdev_child[i];
1211a21fe349SBrian Behlendorf 			list_link_init(&is->is_child[i].ic_node);
12123a4b1be9SMatthew Ahrens 		}
12133a4b1be9SMatthew Ahrens 	} else {
12143a4b1be9SMatthew Ahrens 		is->is_child[0].ic_vdev = vd;
12153a4b1be9SMatthew Ahrens 	}
12163a4b1be9SMatthew Ahrens 
12173a4b1be9SMatthew Ahrens 	list_insert_tail(&iv->iv_splits, is);
12183a4b1be9SMatthew Ahrens }
12193a4b1be9SMatthew Ahrens 
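/*
 * Completion callback for the per-copy reads issued by
 * vdev_indirect_read_all().
 */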
12203a4b1be9SMatthew Ahrens static void
12213a4b1be9SMatthew Ahrens vdev_indirect_read_split_done(zio_t *zio)
12223a4b1be9SMatthew Ahrens {
12233a4b1be9SMatthew Ahrens 	indirect_child_t *ic = zio->io_private;
12243a4b1be9SMatthew Ahrens 
12253a4b1be9SMatthew Ahrens 	if (zio->io_error != 0) {
12263a4b1be9SMatthew Ahrens 		/*
12273a4b1be9SMatthew Ahrens 		 * Clear ic_data to indicate that we do not have data for this
12283a4b1be9SMatthew Ahrens 		 * child.
12293a4b1be9SMatthew Ahrens 		 */
12303a4b1be9SMatthew Ahrens 		abd_free(ic->ic_data);
12313a4b1be9SMatthew Ahrens 		ic->ic_data = NULL;
12323a4b1be9SMatthew Ahrens 	}
12333a4b1be9SMatthew Ahrens }
12343a4b1be9SMatthew Ahrens 
12353a4b1be9SMatthew Ahrens /*
12363a4b1be9SMatthew Ahrens  * Issue reads for all copies (mirror children) of all splits.
12373a4b1be9SMatthew Ahrens  */
12383a4b1be9SMatthew Ahrens static void
12393a4b1be9SMatthew Ahrens vdev_indirect_read_all(zio_t *zio)
12403a4b1be9SMatthew Ahrens {
12413a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
12423a4b1be9SMatthew Ahrens 
1243e4c795beSTom Caputi 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
1244e4c795beSTom Caputi 
12453a4b1be9SMatthew Ahrens 	for (indirect_split_t *is = list_head(&iv->iv_splits);
12463a4b1be9SMatthew Ahrens 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
12473a4b1be9SMatthew Ahrens 		for (int i = 0; i < is->is_children; i++) {
12483a4b1be9SMatthew Ahrens 			indirect_child_t *ic = &is->is_child[i];
12493a4b1be9SMatthew Ahrens 
12503a4b1be9SMatthew Ahrens 			if (!vdev_readable(ic->ic_vdev))
12513a4b1be9SMatthew Ahrens 				continue;
12523a4b1be9SMatthew Ahrens 
12533a4b1be9SMatthew Ahrens 			/*
12543a4b1be9SMatthew Ahrens 			 * Note, we may read from a child whose DTL
12553a4b1be9SMatthew Ahrens 			 * indicates that the data may not be present here.
12563a4b1be9SMatthew Ahrens 			 * While this might result in a few i/os that will
12573a4b1be9SMatthew Ahrens 			 * likely return incorrect data, it simplifies the
12583a4b1be9SMatthew Ahrens 			 * code since we can treat scrub and resilver
12593a4b1be9SMatthew Ahrens 			 * identically.  (The incorrect data will be
12603a4b1be9SMatthew Ahrens 			 * detected and ignored when we verify the
12613a4b1be9SMatthew Ahrens 			 * checksum.)
12623a4b1be9SMatthew Ahrens 			 */
12633a4b1be9SMatthew Ahrens 
12643a4b1be9SMatthew Ahrens 			ic->ic_data = abd_alloc_sametype(zio->io_abd,
12653a4b1be9SMatthew Ahrens 			    is->is_size);
1266a21fe349SBrian Behlendorf 			ic->ic_duplicate = NULL;
12673a4b1be9SMatthew Ahrens 
12683a4b1be9SMatthew Ahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
12693a4b1be9SMatthew Ahrens 			    ic->ic_vdev, is->is_target_offset, ic->ic_data,
12703a4b1be9SMatthew Ahrens 			    is->is_size, zio->io_type, zio->io_priority, 0,
12713a4b1be9SMatthew Ahrens 			    vdev_indirect_read_split_done, ic));
12723a4b1be9SMatthew Ahrens 		}
12733a4b1be9SMatthew Ahrens 	}
12743a4b1be9SMatthew Ahrens 	iv->iv_reconstruct = B_TRUE;
12755cabbc6bSPrashanth Sreenivasa }
12765cabbc6bSPrashanth Sreenivasa 
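/*
 * I/O entry point for an indirect vdev.  Remap the zio's offset and size
 * onto the vdev(s) that now hold the data, recording an indirect_split_t
 * for each contiguous segment, and issue child i/os accordingly.  Split
 * blocks are verified (and reconstructed if necessary) later, in
 * vdev_indirect_io_done().
 */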
12775cabbc6bSPrashanth Sreenivasa static void
12785cabbc6bSPrashanth Sreenivasa vdev_indirect_io_start(zio_t *zio)
12795cabbc6bSPrashanth Sreenivasa {
12805cabbc6bSPrashanth Sreenivasa 	spa_t *spa = zio->io_spa;
12813a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
12823a4b1be9SMatthew Ahrens 	list_create(&iv->iv_splits,
12833a4b1be9SMatthew Ahrens 	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
12843a4b1be9SMatthew Ahrens 
12853a4b1be9SMatthew Ahrens 	zio->io_vsd = iv;
12863a4b1be9SMatthew Ahrens 	zio->io_vsd_ops = &vdev_indirect_vsd_ops;
12875cabbc6bSPrashanth Sreenivasa 
12885cabbc6bSPrashanth Sreenivasa 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
12895cabbc6bSPrashanth Sreenivasa 	if (zio->io_type != ZIO_TYPE_READ) {
12905cabbc6bSPrashanth Sreenivasa 		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
12913a4b1be9SMatthew Ahrens 		/*
12923a4b1be9SMatthew Ahrens 		 * Note: this code can handle other kinds of writes,
12933a4b1be9SMatthew Ahrens 		 * but we don't expect them.
12943a4b1be9SMatthew Ahrens 		 */
12953a4b1be9SMatthew Ahrens 		ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
12963a4b1be9SMatthew Ahrens 		    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
12975cabbc6bSPrashanth Sreenivasa 	}
12985cabbc6bSPrashanth Sreenivasa 
12995cabbc6bSPrashanth Sreenivasa 	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
13003a4b1be9SMatthew Ahrens 	    vdev_indirect_gather_splits, zio);
13013a4b1be9SMatthew Ahrens 
13023a4b1be9SMatthew Ahrens 	indirect_split_t *first = list_head(&iv->iv_splits);
13033a4b1be9SMatthew Ahrens 	if (first->is_size == zio->io_size) {
13043a4b1be9SMatthew Ahrens 		/*
13053a4b1be9SMatthew Ahrens 		 * This is not a split block; we are pointing to the entire
13063a4b1be9SMatthew Ahrens 		 * data, which will checksum the same as the original data.
13073a4b1be9SMatthew Ahrens 		 * Pass the BP down so that the child i/o can verify the
13083a4b1be9SMatthew Ahrens 		 * checksum, and try a different location if available
13093a4b1be9SMatthew Ahrens 		 * (e.g. on a mirror).
13103a4b1be9SMatthew Ahrens 		 *
13113a4b1be9SMatthew Ahrens 		 * While this special case could be handled the same as the
13123a4b1be9SMatthew Ahrens 		 * general (split block) case, doing it this way ensures
13133a4b1be9SMatthew Ahrens 		 * that the vast majority of blocks on indirect vdevs
13143a4b1be9SMatthew Ahrens 		 * (which are not split) are handled identically to blocks
13153a4b1be9SMatthew Ahrens 		 * on non-indirect vdevs.  This allows us to be less strict
13163a4b1be9SMatthew Ahrens 		 * about performance in the general (but rare) case.
13173a4b1be9SMatthew Ahrens 		 */
13183a4b1be9SMatthew Ahrens 		ASSERT0(first->is_split_offset);
13193a4b1be9SMatthew Ahrens 		ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
13203a4b1be9SMatthew Ahrens 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
13213a4b1be9SMatthew Ahrens 		    first->is_vdev, first->is_target_offset,
13223a4b1be9SMatthew Ahrens 		    abd_get_offset(zio->io_abd, 0),
13233a4b1be9SMatthew Ahrens 		    zio->io_size, zio->io_type, zio->io_priority, 0,
13243a4b1be9SMatthew Ahrens 		    vdev_indirect_child_io_done, zio));
13253a4b1be9SMatthew Ahrens 	} else {
13263a4b1be9SMatthew Ahrens 		iv->iv_split_block = B_TRUE;
1327e4c795beSTom Caputi 		if (zio->io_type == ZIO_TYPE_READ &&
1328e4c795beSTom Caputi 		    zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
13293a4b1be9SMatthew Ahrens 			/*
13303a4b1be9SMatthew Ahrens 			 * Read all copies.  Note that for simplicity,
13313a4b1be9SMatthew Ahrens 			 * we don't bother consulting the DTL in the
13323a4b1be9SMatthew Ahrens 			 * resilver case.
13333a4b1be9SMatthew Ahrens 			 */
13343a4b1be9SMatthew Ahrens 			vdev_indirect_read_all(zio);
13353a4b1be9SMatthew Ahrens 		} else {
13363a4b1be9SMatthew Ahrens 			/*
1337e4c795beSTom Caputi 			 * If this is a read zio, we read one copy of each
1338e4c795beSTom Caputi 			 * split segment, from the top-level vdev.  Since
1339e4c795beSTom Caputi 			 * we don't know the checksum of each split
1340e4c795beSTom Caputi 			 * individually, the child zio can't ensure that
1341e4c795beSTom Caputi 			 * we get the right data. E.g. if it's a mirror,
1342e4c795beSTom Caputi 			 * it will just read from a random (healthy) leaf
1343e4c795beSTom Caputi 			 * vdev. We have to verify the checksum in
1344e4c795beSTom Caputi 			 * vdev_indirect_io_done().
1345e4c795beSTom Caputi 			 *
1346e4c795beSTom Caputi 			 * For write zios, the vdev code will ensure we write
1347e4c795beSTom Caputi 			 * to all children.
13483a4b1be9SMatthew Ahrens 			 */
13493a4b1be9SMatthew Ahrens 			for (indirect_split_t *is = list_head(&iv->iv_splits);
13503a4b1be9SMatthew Ahrens 			    is != NULL; is = list_next(&iv->iv_splits, is)) {
13513a4b1be9SMatthew Ahrens 				zio_nowait(zio_vdev_child_io(zio, NULL,
13523a4b1be9SMatthew Ahrens 				    is->is_vdev, is->is_target_offset,
13533a4b1be9SMatthew Ahrens 				    abd_get_offset(zio->io_abd,
13543a4b1be9SMatthew Ahrens 				    is->is_split_offset),
13553a4b1be9SMatthew Ahrens 				    is->is_size, zio->io_type,
13563a4b1be9SMatthew Ahrens 				    zio->io_priority, 0,
13573a4b1be9SMatthew Ahrens 				    vdev_indirect_child_io_done, zio));
13583a4b1be9SMatthew Ahrens 			}
13593a4b1be9SMatthew Ahrens 		}
13603a4b1be9SMatthew Ahrens 	}
13615cabbc6bSPrashanth Sreenivasa 
13625cabbc6bSPrashanth Sreenivasa 	zio_execute(zio);
13635cabbc6bSPrashanth Sreenivasa }
13645cabbc6bSPrashanth Sreenivasa 
13653a4b1be9SMatthew Ahrens /*
13663a4b1be9SMatthew Ahrens  * Report a checksum error for a child.
13673a4b1be9SMatthew Ahrens  */
13683a4b1be9SMatthew Ahrens static void
13693a4b1be9SMatthew Ahrens vdev_indirect_checksum_error(zio_t *zio,
13703a4b1be9SMatthew Ahrens     indirect_split_t *is, indirect_child_t *ic)
13713a4b1be9SMatthew Ahrens {
13723a4b1be9SMatthew Ahrens 	vdev_t *vd = ic->ic_vdev;
13733a4b1be9SMatthew Ahrens 
13743a4b1be9SMatthew Ahrens 	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
13753a4b1be9SMatthew Ahrens 		return;
13763a4b1be9SMatthew Ahrens 
13773a4b1be9SMatthew Ahrens 	mutex_enter(&vd->vdev_stat_lock);
13783a4b1be9SMatthew Ahrens 	vd->vdev_stat.vs_checksum_errors++;
13793a4b1be9SMatthew Ahrens 	mutex_exit(&vd->vdev_stat_lock);
13803a4b1be9SMatthew Ahrens 
13813a4b1be9SMatthew Ahrens 	zio_bad_cksum_t zbc = { 0 };
13823a4b1be9SMatthew Ahrens 	void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
1383a21fe349SBrian Behlendorf 	abd_t *good_abd = is->is_good_child->ic_data;
13843a4b1be9SMatthew Ahrens 	void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
1385*9b088140SToomas Soome 	(void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark,
1386*9b088140SToomas Soome 	    zio, is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
13873a4b1be9SMatthew Ahrens 	abd_return_buf(ic->ic_data, bad_buf, is->is_size);
13883a4b1be9SMatthew Ahrens 	abd_return_buf(good_abd, good_buf, is->is_size);
13893a4b1be9SMatthew Ahrens }
13903a4b1be9SMatthew Ahrens 
13913a4b1be9SMatthew Ahrens /*
13923a4b1be9SMatthew Ahrens  * Issue repair i/os for any incorrect copies.  We do this by comparing
13933a4b1be9SMatthew Ahrens  * each split segment's correct data (is_good_child's ic_data) with each
13943a4b1be9SMatthew Ahrens  * other copy of the data.  If they differ, then we overwrite the bad data
13953a4b1be9SMatthew Ahrens  * with the good copy.  Note that we do this without regard for the DTL's,
13963a4b1be9SMatthew Ahrens  * which simplifies this code and also issues the optimal number of writes
13973a4b1be9SMatthew Ahrens  * (based on which copies actually read bad data, as opposed to which we
13983a4b1be9SMatthew Ahrens  * think might be wrong).  For the same reason, we always use
13993a4b1be9SMatthew Ahrens  * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
14003a4b1be9SMatthew Ahrens  */
14013a4b1be9SMatthew Ahrens static void
14023a4b1be9SMatthew Ahrens vdev_indirect_repair(zio_t *zio)
14033a4b1be9SMatthew Ahrens {
14043a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
14053a4b1be9SMatthew Ahrens 
14063a4b1be9SMatthew Ahrens 	enum zio_flag flags = ZIO_FLAG_IO_REPAIR;
14073a4b1be9SMatthew Ahrens 
14083a4b1be9SMatthew Ahrens 	if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))
14093a4b1be9SMatthew Ahrens 		flags |= ZIO_FLAG_SELF_HEAL;
14103a4b1be9SMatthew Ahrens 
14113a4b1be9SMatthew Ahrens 	if (!spa_writeable(zio->io_spa))
14123a4b1be9SMatthew Ahrens 		return;
14133a4b1be9SMatthew Ahrens 
14143a4b1be9SMatthew Ahrens 	for (indirect_split_t *is = list_head(&iv->iv_splits);
14153a4b1be9SMatthew Ahrens 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
14163a4b1be9SMatthew Ahrens 		for (int c = 0; c < is->is_children; c++) {
14173a4b1be9SMatthew Ahrens 			indirect_child_t *ic = &is->is_child[c];
1418a21fe349SBrian Behlendorf 			if (ic == is->is_good_child)
14193a4b1be9SMatthew Ahrens 				continue;
14203a4b1be9SMatthew Ahrens 			if (ic->ic_data == NULL)
14213a4b1be9SMatthew Ahrens 				continue;
1422a21fe349SBrian Behlendorf 			if (ic->ic_duplicate == is->is_good_child)
14233a4b1be9SMatthew Ahrens 				continue;
14243a4b1be9SMatthew Ahrens 
14253a4b1be9SMatthew Ahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
14263a4b1be9SMatthew Ahrens 			    ic->ic_vdev, is->is_target_offset,
1427a21fe349SBrian Behlendorf 			    is->is_good_child->ic_data, is->is_size,
14283a4b1be9SMatthew Ahrens 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
14293a4b1be9SMatthew Ahrens 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
14303a4b1be9SMatthew Ahrens 			    NULL, NULL));
14313a4b1be9SMatthew Ahrens 
14323a4b1be9SMatthew Ahrens 			vdev_indirect_checksum_error(zio, is, ic);
14333a4b1be9SMatthew Ahrens 		}
14343a4b1be9SMatthew Ahrens 	}
14353a4b1be9SMatthew Ahrens }
14363a4b1be9SMatthew Ahrens 
14373a4b1be9SMatthew Ahrens /*
14383a4b1be9SMatthew Ahrens  * Report checksum errors on all children that we read from.
14393a4b1be9SMatthew Ahrens  */
14403a4b1be9SMatthew Ahrens static void
14413a4b1be9SMatthew Ahrens vdev_indirect_all_checksum_errors(zio_t *zio)
14423a4b1be9SMatthew Ahrens {
14433a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
14443a4b1be9SMatthew Ahrens 
14453a4b1be9SMatthew Ahrens 	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
14463a4b1be9SMatthew Ahrens 		return;
14473a4b1be9SMatthew Ahrens 
14483a4b1be9SMatthew Ahrens 	for (indirect_split_t *is = list_head(&iv->iv_splits);
14493a4b1be9SMatthew Ahrens 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
14503a4b1be9SMatthew Ahrens 		for (int c = 0; c < is->is_children; c++) {
14513a4b1be9SMatthew Ahrens 			indirect_child_t *ic = &is->is_child[c];
14523a4b1be9SMatthew Ahrens 
14533a4b1be9SMatthew Ahrens 			if (ic->ic_data == NULL)
14543a4b1be9SMatthew Ahrens 				continue;
14553a4b1be9SMatthew Ahrens 
14563a4b1be9SMatthew Ahrens 			vdev_t *vd = ic->ic_vdev;
14573a4b1be9SMatthew Ahrens 
14583a4b1be9SMatthew Ahrens 			mutex_enter(&vd->vdev_stat_lock);
14593a4b1be9SMatthew Ahrens 			vd->vdev_stat.vs_checksum_errors++;
14603a4b1be9SMatthew Ahrens 			mutex_exit(&vd->vdev_stat_lock);
14613a4b1be9SMatthew Ahrens 
1462*9b088140SToomas Soome 			(void) zfs_ereport_post_checksum(zio->io_spa, vd,
1463eb633035STom Caputi 			    &zio->io_bookmark, zio, is->is_target_offset,
1464eb633035STom Caputi 			    is->is_size, NULL, NULL, NULL);
14653a4b1be9SMatthew Ahrens 		}
14663a4b1be9SMatthew Ahrens 	}
14673a4b1be9SMatthew Ahrens }
14683a4b1be9SMatthew Ahrens 
1469a21fe349SBrian Behlendorf /*
1470a21fe349SBrian Behlendorf  * Copy data from all the splits to the main zio, then validate the checksum.
1471a21fe349SBrian Behlendorf  * If the checksum is successfully validated, return success.
1472a21fe349SBrian Behlendorf  */
1473a21fe349SBrian Behlendorf static int
1474a21fe349SBrian Behlendorf vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
1475a21fe349SBrian Behlendorf {
1476a21fe349SBrian Behlendorf 	zio_bad_cksum_t zbc;
1477a21fe349SBrian Behlendorf 
1478a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1479a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1480a21fe349SBrian Behlendorf 
1481a21fe349SBrian Behlendorf 		ASSERT3P(is->is_good_child->ic_data, !=, NULL);
1482a21fe349SBrian Behlendorf 		ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
1483a21fe349SBrian Behlendorf 
1484a21fe349SBrian Behlendorf 		abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
1485a21fe349SBrian Behlendorf 		    is->is_split_offset, 0, is->is_size);
1486a21fe349SBrian Behlendorf 	}
1487a21fe349SBrian Behlendorf 
1488a21fe349SBrian Behlendorf 	return (zio_checksum_error(zio, &zbc));
1489a21fe349SBrian Behlendorf }
1490a21fe349SBrian Behlendorf 
1491a21fe349SBrian Behlendorf /*
1492a21fe349SBrian Behlendorf  * There are relatively few possible combinations making it feasible to
1493a21fe349SBrian Behlendorf  * deterministically check them all.  We do this by setting the good_child
1494a21fe349SBrian Behlendorf  * to the next unique split version.  If we reach the end of the list then
1495a21fe349SBrian Behlendorf  * "carry over" to the next unique split version (like counting in base
1496a21fe349SBrian Behlendorf  * is_unique_children, but each digit can have a different base).
1497a21fe349SBrian Behlendorf  */
1498a21fe349SBrian Behlendorf static int
1499a21fe349SBrian Behlendorf vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
1500a21fe349SBrian Behlendorf {
1501a21fe349SBrian Behlendorf 	boolean_t more = B_TRUE;
1502a21fe349SBrian Behlendorf 
1503a21fe349SBrian Behlendorf 	iv->iv_attempts = 0;
1504a21fe349SBrian Behlendorf 
1505a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1506a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is))
1507a21fe349SBrian Behlendorf 		is->is_good_child = list_head(&is->is_unique_child);
1508a21fe349SBrian Behlendorf 
1509a21fe349SBrian Behlendorf 	while (more == B_TRUE) {
1510a21fe349SBrian Behlendorf 		iv->iv_attempts++;
1511a21fe349SBrian Behlendorf 		more = B_FALSE;
1512a21fe349SBrian Behlendorf 
1513a21fe349SBrian Behlendorf 		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
1514a21fe349SBrian Behlendorf 			return (0);
1515a21fe349SBrian Behlendorf 
1516a21fe349SBrian Behlendorf 		for (indirect_split_t *is = list_head(&iv->iv_splits);
1517a21fe349SBrian Behlendorf 		    is != NULL; is = list_next(&iv->iv_splits, is)) {
1518a21fe349SBrian Behlendorf 			is->is_good_child = list_next(&is->is_unique_child,
1519a21fe349SBrian Behlendorf 			    is->is_good_child);
1520a21fe349SBrian Behlendorf 			if (is->is_good_child != NULL) {
1521a21fe349SBrian Behlendorf 				more = B_TRUE;
1522a21fe349SBrian Behlendorf 				break;
1523a21fe349SBrian Behlendorf 			}
1524a21fe349SBrian Behlendorf 
1525a21fe349SBrian Behlendorf 			is->is_good_child = list_head(&is->is_unique_child);
1526a21fe349SBrian Behlendorf 		}
1527a21fe349SBrian Behlendorf 	}
1528a21fe349SBrian Behlendorf 
1529a21fe349SBrian Behlendorf 	ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
1530a21fe349SBrian Behlendorf 
1531a21fe349SBrian Behlendorf 	return (SET_ERROR(ECKSUM));
1532a21fe349SBrian Behlendorf }
1533a21fe349SBrian Behlendorf 
1534a21fe349SBrian Behlendorf /*
1535a21fe349SBrian Behlendorf  * There are too many combinations to try all of them in a reasonable amount
1536a21fe349SBrian Behlendorf  * of time.  So try a fixed number of random combinations from the unique
1537a21fe349SBrian Behlendorf  * split versions, after which we'll consider the block unrecoverable.
1538a21fe349SBrian Behlendorf  */
1539a21fe349SBrian Behlendorf static int
1540a21fe349SBrian Behlendorf vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
1541a21fe349SBrian Behlendorf {
1542a21fe349SBrian Behlendorf 	iv->iv_attempts = 0;
1543a21fe349SBrian Behlendorf 
1544a21fe349SBrian Behlendorf 	while (iv->iv_attempts < iv->iv_attempts_max) {
1545a21fe349SBrian Behlendorf 		iv->iv_attempts++;
1546a21fe349SBrian Behlendorf 
1547a21fe349SBrian Behlendorf 		for (indirect_split_t *is = list_head(&iv->iv_splits);
1548a21fe349SBrian Behlendorf 		    is != NULL; is = list_next(&iv->iv_splits, is)) {
1549a21fe349SBrian Behlendorf 			indirect_child_t *ic = list_head(&is->is_unique_child);
1550a21fe349SBrian Behlendorf 			int children = is->is_unique_children;
1551a21fe349SBrian Behlendorf 
1552a21fe349SBrian Behlendorf 			for (int i = spa_get_random(children); i > 0; i--)
1553a21fe349SBrian Behlendorf 				ic = list_next(&is->is_unique_child, ic);
1554a21fe349SBrian Behlendorf 
1555a21fe349SBrian Behlendorf 			ASSERT3P(ic, !=, NULL);
1556a21fe349SBrian Behlendorf 			is->is_good_child = ic;
1557a21fe349SBrian Behlendorf 		}
1558a21fe349SBrian Behlendorf 
1559a21fe349SBrian Behlendorf 		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
1560a21fe349SBrian Behlendorf 			return (0);
1561a21fe349SBrian Behlendorf 	}
1562a21fe349SBrian Behlendorf 
1563a21fe349SBrian Behlendorf 	return (SET_ERROR(ECKSUM));
1564a21fe349SBrian Behlendorf }
1565a21fe349SBrian Behlendorf 
1566a21fe349SBrian Behlendorf /*
1567a21fe349SBrian Behlendorf  * This is a validation function for reconstruction.  It randomly selects
1568a21fe349SBrian Behlendorf  * a good combination, if one can be found, and then it intentionally
1569a21fe349SBrian Behlendorf  * damages all other segment copies by zeroing them.  This forces the
1570a21fe349SBrian Behlendorf  * reconstruction algorithm to locate the one remaining known good copy.
1571a21fe349SBrian Behlendorf  */
1572a21fe349SBrian Behlendorf static int
1573a21fe349SBrian Behlendorf vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
1574a21fe349SBrian Behlendorf {
1575a21fe349SBrian Behlendorf 	/* Presume all the copies are unique for initial selection. */
1576a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1577a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1578a21fe349SBrian Behlendorf 		is->is_unique_children = 0;
1579a21fe349SBrian Behlendorf 
1580a21fe349SBrian Behlendorf 		for (int i = 0; i < is->is_children; i++) {
1581a21fe349SBrian Behlendorf 			indirect_child_t *ic = &is->is_child[i];
1582a21fe349SBrian Behlendorf 			if (ic->ic_data != NULL) {
1583a21fe349SBrian Behlendorf 				is->is_unique_children++;
1584a21fe349SBrian Behlendorf 				list_insert_tail(&is->is_unique_child, ic);
1585a21fe349SBrian Behlendorf 			}
1586a21fe349SBrian Behlendorf 		}
1587a21fe349SBrian Behlendorf 	}
1588a21fe349SBrian Behlendorf 
1589a21fe349SBrian Behlendorf 	/*
1590a21fe349SBrian Behlendorf 	 * Set each is_good_child to a randomly-selected child which
1591a21fe349SBrian Behlendorf 	 * is known to contain validated data.
1592a21fe349SBrian Behlendorf 	 */
1593a21fe349SBrian Behlendorf 	int error = vdev_indirect_splits_enumerate_randomly(iv, zio);
1594a21fe349SBrian Behlendorf 	if (error)
1595a21fe349SBrian Behlendorf 		goto out;
1596a21fe349SBrian Behlendorf 
1597a21fe349SBrian Behlendorf 	/*
1598a21fe349SBrian Behlendorf 	 * Damage all but the known good copy by zeroing them.  This will
1599a21fe349SBrian Behlendorf 	 * result in two or fewer unique copies per indirect_split_t.
1600a21fe349SBrian Behlendorf 	 * Both may need to be checked in order to reconstruct the block.
1601a21fe349SBrian Behlendorf 	 * Set iv->iv_attempts_max such that all unique combinations will
1602a21fe349SBrian Behlendorf 	 * be enumerated, but limit the damage to at most 16 indirect splits.
1603a21fe349SBrian Behlendorf 	 */
1604a21fe349SBrian Behlendorf 	iv->iv_attempts_max = 1;
1605a21fe349SBrian Behlendorf 
1606a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1607a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1608a21fe349SBrian Behlendorf 		for (int c = 0; c < is->is_children; c++) {
1609a21fe349SBrian Behlendorf 			indirect_child_t *ic = &is->is_child[c];
1610a21fe349SBrian Behlendorf 
1611a21fe349SBrian Behlendorf 			if (ic == is->is_good_child)
1612a21fe349SBrian Behlendorf 				continue;
1613a21fe349SBrian Behlendorf 			if (ic->ic_data == NULL)
1614a21fe349SBrian Behlendorf 				continue;
1615a21fe349SBrian Behlendorf 
1616a21fe349SBrian Behlendorf 			abd_zero(ic->ic_data, ic->ic_data->abd_size);
1617a21fe349SBrian Behlendorf 		}
1618a21fe349SBrian Behlendorf 
1619a21fe349SBrian Behlendorf 		iv->iv_attempts_max *= 2;
1620a21fe349SBrian Behlendorf 		if (iv->iv_attempts_max > (1ULL << 16)) {
1621a21fe349SBrian Behlendorf 			iv->iv_attempts_max = UINT64_MAX;
1622a21fe349SBrian Behlendorf 			break;
1623a21fe349SBrian Behlendorf 		}
1624a21fe349SBrian Behlendorf 	}
1625a21fe349SBrian Behlendorf 
1626a21fe349SBrian Behlendorf out:
1627a21fe349SBrian Behlendorf 	/* Empty the unique children lists so they can be reconstructed. */
1628a21fe349SBrian Behlendorf 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1629a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1630a21fe349SBrian Behlendorf 		indirect_child_t *ic;
1631a21fe349SBrian Behlendorf 		while ((ic = list_head(&is->is_unique_child)) != NULL)
1632a21fe349SBrian Behlendorf 			list_remove(&is->is_unique_child, ic);
1633a21fe349SBrian Behlendorf 
1634a21fe349SBrian Behlendorf 		is->is_unique_children = 0;
1635a21fe349SBrian Behlendorf 	}
1636a21fe349SBrian Behlendorf 
1637a21fe349SBrian Behlendorf 	return (error);
1638a21fe349SBrian Behlendorf }
1639a21fe349SBrian Behlendorf 
16403a4b1be9SMatthew Ahrens /*
16413a4b1be9SMatthew Ahrens  * This function is called when we have read all copies of the data and need
16423a4b1be9SMatthew Ahrens  * to try to find a combination of copies that gives us the right checksum.
16433a4b1be9SMatthew Ahrens  *
16443a4b1be9SMatthew Ahrens  * If we pointed to any mirror vdevs, this effectively does the job of the
16453a4b1be9SMatthew Ahrens  * mirror.  The mirror vdev code can't do its own job because we don't know
1646a21fe349SBrian Behlendorf  * the checksum of each split segment individually.
16473a4b1be9SMatthew Ahrens  *
1648a21fe349SBrian Behlendorf  * We have to try every unique combination of copies of split segments, until
1649a21fe349SBrian Behlendorf  * we find one that checksums correctly.  Duplicate segment copies are first
1650a21fe349SBrian Behlendorf  * identified and later skipped during reconstruction.  This optimization
1651a21fe349SBrian Behlendorf  * reduces the search space and ensures that of the remaining combinations
1652a21fe349SBrian Behlendorf  * at most one is correct.
1653a21fe349SBrian Behlendorf  *
1654a21fe349SBrian Behlendorf  * When the total number of combinations is small they can all be checked.
1655a21fe349SBrian Behlendorf  * For example, if we have 3 segments in the split, and each points to a
1656a21fe349SBrian Behlendorf  * 2-way mirror with unique copies, we will have the following pieces of data:
16573a4b1be9SMatthew Ahrens  *
16583a4b1be9SMatthew Ahrens  *       |     mirror child
16593a4b1be9SMatthew Ahrens  * split |     [0]        [1]
16603a4b1be9SMatthew Ahrens  * ======|=====================
16613a4b1be9SMatthew Ahrens  *   A   |  data_A_0   data_A_1
16623a4b1be9SMatthew Ahrens  *   B   |  data_B_0   data_B_1
16633a4b1be9SMatthew Ahrens  *   C   |  data_C_0   data_C_1
16643a4b1be9SMatthew Ahrens  *
16653a4b1be9SMatthew Ahrens  * We will try the following (mirror children)^(number of splits) (2^3=8)
16663a4b1be9SMatthew Ahrens  * combinations, which is similar to bitwise-little-endian counting in
16673a4b1be9SMatthew Ahrens  * binary.  In general each "digit" corresponds to a split segment, and the
16683a4b1be9SMatthew Ahrens  * base of each digit is is_children, which can be different for each
16693a4b1be9SMatthew Ahrens  * digit.
16703a4b1be9SMatthew Ahrens  *
16713a4b1be9SMatthew Ahrens  * "low bit"        "high bit"
16723a4b1be9SMatthew Ahrens  *        v                 v
16733a4b1be9SMatthew Ahrens  * data_A_0 data_B_0 data_C_0
16743a4b1be9SMatthew Ahrens  * data_A_1 data_B_0 data_C_0
16753a4b1be9SMatthew Ahrens  * data_A_0 data_B_1 data_C_0
16763a4b1be9SMatthew Ahrens  * data_A_1 data_B_1 data_C_0
16773a4b1be9SMatthew Ahrens  * data_A_0 data_B_0 data_C_1
16783a4b1be9SMatthew Ahrens  * data_A_1 data_B_0 data_C_1
16793a4b1be9SMatthew Ahrens  * data_A_0 data_B_1 data_C_1
16803a4b1be9SMatthew Ahrens  * data_A_1 data_B_1 data_C_1
16813a4b1be9SMatthew Ahrens  *
16823a4b1be9SMatthew Ahrens  * Note that the split segments may be on the same or different top-level
1683a21fe349SBrian Behlendorf  * vdevs. In either case, we may need to try lots of combinations (see
1684a21fe349SBrian Behlendorf  * zfs_reconstruct_indirect_combinations_max).  This ensures that if a mirror
1685a21fe349SBrian Behlendorf  * has small silent errors on all of its children, we can still reconstruct
1686a21fe349SBrian Behlendorf  * the correct data, as long as those errors are at sufficiently-separated
16873a4b1be9SMatthew Ahrens  * offsets (specifically, separated by the largest block size - default of
16883a4b1be9SMatthew Ahrens  * 128KB, but up to 16MB).
16893a4b1be9SMatthew Ahrens  */
16903a4b1be9SMatthew Ahrens static void
16913a4b1be9SMatthew Ahrens vdev_indirect_reconstruct_io_done(zio_t *zio)
16923a4b1be9SMatthew Ahrens {
16933a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
1694a21fe349SBrian Behlendorf 	boolean_t known_good = B_FALSE;
1695a21fe349SBrian Behlendorf 	int error;
1696a21fe349SBrian Behlendorf 
1697a21fe349SBrian Behlendorf 	iv->iv_unique_combinations = 1;
1698a21fe349SBrian Behlendorf 	iv->iv_attempts_max = UINT64_MAX;
1699a21fe349SBrian Behlendorf 
1700a21fe349SBrian Behlendorf 	if (zfs_reconstruct_indirect_combinations_max > 0)
1701a21fe349SBrian Behlendorf 		iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
1702a21fe349SBrian Behlendorf 
1703a21fe349SBrian Behlendorf 	/*
1704a21fe349SBrian Behlendorf 	 * If nonzero, every 1/x blocks will be damaged, in order to validate
1705a21fe349SBrian Behlendorf 	 * reconstruction when there are split segments with damaged copies.
1706a21fe349SBrian Behlendorf 	 * Known_good will be TRUE when reconstruction is known to be possible.
1707a21fe349SBrian Behlendorf 	 */
1708a21fe349SBrian Behlendorf 	if (zfs_reconstruct_indirect_damage_fraction != 0 &&
1709a21fe349SBrian Behlendorf 	    spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
1710a21fe349SBrian Behlendorf 		known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
17113a4b1be9SMatthew Ahrens 
1712a21fe349SBrian Behlendorf 	/*
1713a21fe349SBrian Behlendorf 	 * Determine the unique children for a split segment and add them
1714a21fe349SBrian Behlendorf 	 * to the is_unique_child list.  By restricting reconstruction
1715a21fe349SBrian Behlendorf 	 * to these children, only unique combinations will be considered.
1716a21fe349SBrian Behlendorf 	 * This can vastly reduce the search space when there are a large
1717a21fe349SBrian Behlendorf 	 * number of indirect splits.
1718a21fe349SBrian Behlendorf 	 */
17193a4b1be9SMatthew Ahrens 	for (indirect_split_t *is = list_head(&iv->iv_splits);
1720a21fe349SBrian Behlendorf 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
1721a21fe349SBrian Behlendorf 		is->is_unique_children = 0;
17223a4b1be9SMatthew Ahrens 
1723a21fe349SBrian Behlendorf 		for (int i = 0; i < is->is_children; i++) {
1724a21fe349SBrian Behlendorf 			indirect_child_t *ic_i = &is->is_child[i];
17253a4b1be9SMatthew Ahrens 
1726a21fe349SBrian Behlendorf 			if (ic_i->ic_data == NULL ||
1727a21fe349SBrian Behlendorf 			    ic_i->ic_duplicate != NULL)
1728a21fe349SBrian Behlendorf 				continue;
17293a4b1be9SMatthew Ahrens 
1730a21fe349SBrian Behlendorf 			for (int j = i + 1; j < is->is_children; j++) {
1731a21fe349SBrian Behlendorf 				indirect_child_t *ic_j = &is->is_child[j];
17323a4b1be9SMatthew Ahrens 
1733a21fe349SBrian Behlendorf 				if (ic_j->ic_data == NULL ||
1734a21fe349SBrian Behlendorf 				    ic_j->ic_duplicate != NULL)
1735a21fe349SBrian Behlendorf 					continue;
17363a4b1be9SMatthew Ahrens 
1737a21fe349SBrian Behlendorf 				if (abd_cmp(ic_i->ic_data, ic_j->ic_data,
1738a21fe349SBrian Behlendorf 				    is->is_size) == 0) {
1739a21fe349SBrian Behlendorf 					ic_j->ic_duplicate = ic_i;
17403a4b1be9SMatthew Ahrens 				}
17413a4b1be9SMatthew Ahrens 			}
1742a21fe349SBrian Behlendorf 
1743a21fe349SBrian Behlendorf 			is->is_unique_children++;
1744a21fe349SBrian Behlendorf 			list_insert_tail(&is->is_unique_child, ic_i);
17453a4b1be9SMatthew Ahrens 		}
1746a21fe349SBrian Behlendorf 
1747a21fe349SBrian Behlendorf 		/* Reconstruction is impossible, no valid children */
1748a21fe349SBrian Behlendorf 		EQUIV(list_is_empty(&is->is_unique_child),
1749a21fe349SBrian Behlendorf 		    is->is_unique_children == 0);
1750a21fe349SBrian Behlendorf 		if (list_is_empty(&is->is_unique_child)) {
1751a21fe349SBrian Behlendorf 			zio->io_error = EIO;
17523a4b1be9SMatthew Ahrens 			vdev_indirect_all_checksum_errors(zio);
17533a4b1be9SMatthew Ahrens 			zio_checksum_verified(zio);
17543a4b1be9SMatthew Ahrens 			return;
17553a4b1be9SMatthew Ahrens 		}
1756a21fe349SBrian Behlendorf 
1757a21fe349SBrian Behlendorf 		iv->iv_unique_combinations *= is->is_unique_children;
1758a21fe349SBrian Behlendorf 	}
1759a21fe349SBrian Behlendorf 
1760a21fe349SBrian Behlendorf 	if (iv->iv_unique_combinations <= iv->iv_attempts_max)
1761a21fe349SBrian Behlendorf 		error = vdev_indirect_splits_enumerate_all(iv, zio);
1762a21fe349SBrian Behlendorf 	else
1763a21fe349SBrian Behlendorf 		error = vdev_indirect_splits_enumerate_randomly(iv, zio);
1764a21fe349SBrian Behlendorf 
1765a21fe349SBrian Behlendorf 	if (error != 0) {
1766a21fe349SBrian Behlendorf 		/* All attempted combinations failed. */
1767a21fe349SBrian Behlendorf 		ASSERT3B(known_good, ==, B_FALSE);
1768a21fe349SBrian Behlendorf 		zio->io_error = error;
1769a21fe349SBrian Behlendorf 		vdev_indirect_all_checksum_errors(zio);
1770a21fe349SBrian Behlendorf 	} else {
1771a21fe349SBrian Behlendorf 		/*
1772a21fe349SBrian Behlendorf 		 * The checksum has been successfully validated.  Issue
1773a21fe349SBrian Behlendorf 		 * repair I/Os to any copies of splits which don't match
1774a21fe349SBrian Behlendorf 		 * the validated version.
1775a21fe349SBrian Behlendorf 		 */
1776a21fe349SBrian Behlendorf 		ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
1777a21fe349SBrian Behlendorf 		vdev_indirect_repair(zio);
1778a21fe349SBrian Behlendorf 		zio_checksum_verified(zio);
17793a4b1be9SMatthew Ahrens 	}
17803a4b1be9SMatthew Ahrens }
17813a4b1be9SMatthew Ahrens 
17823a4b1be9SMatthew Ahrens static void
17833a4b1be9SMatthew Ahrens vdev_indirect_io_done(zio_t *zio)
17843a4b1be9SMatthew Ahrens {
17853a4b1be9SMatthew Ahrens 	indirect_vsd_t *iv = zio->io_vsd;
17863a4b1be9SMatthew Ahrens 
17873a4b1be9SMatthew Ahrens 	if (iv->iv_reconstruct) {
17883a4b1be9SMatthew Ahrens 		/*
17893a4b1be9SMatthew Ahrens 		 * We have read all copies of the data (e.g. from mirrors),
17903a4b1be9SMatthew Ahrens 		 * either because this was a scrub/resilver, or because the
17913a4b1be9SMatthew Ahrens 		 * one-copy read didn't checksum correctly.
17923a4b1be9SMatthew Ahrens 		 */
17933a4b1be9SMatthew Ahrens 		vdev_indirect_reconstruct_io_done(zio);
17943a4b1be9SMatthew Ahrens 		return;
17953a4b1be9SMatthew Ahrens 	}
17963a4b1be9SMatthew Ahrens 
17973a4b1be9SMatthew Ahrens 	if (!iv->iv_split_block) {
17983a4b1be9SMatthew Ahrens 		/*
17993a4b1be9SMatthew Ahrens 		 * This was not a split block, so we passed the BP down,
18003a4b1be9SMatthew Ahrens 		 * and the checksum was handled by the (one) child zio.
18013a4b1be9SMatthew Ahrens 		 */
18023a4b1be9SMatthew Ahrens 		return;
18033a4b1be9SMatthew Ahrens 	}
18043a4b1be9SMatthew Ahrens 
18053a4b1be9SMatthew Ahrens 	zio_bad_cksum_t zbc;
18063a4b1be9SMatthew Ahrens 	int ret = zio_checksum_error(zio, &zbc);
18073a4b1be9SMatthew Ahrens 	if (ret == 0) {
18083a4b1be9SMatthew Ahrens 		zio_checksum_verified(zio);
18093a4b1be9SMatthew Ahrens 		return;
18103a4b1be9SMatthew Ahrens 	}
18113a4b1be9SMatthew Ahrens 
18123a4b1be9SMatthew Ahrens 	/*
18133a4b1be9SMatthew Ahrens 	 * The checksum didn't match.  Read all copies of all splits, and
18143a4b1be9SMatthew Ahrens 	 * then we will try to reconstruct.  The next time
18153a4b1be9SMatthew Ahrens 	 * vdev_indirect_io_done() is called, iv_reconstruct will be set.
18163a4b1be9SMatthew Ahrens 	 */
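	/*
	 * Editorial example of the flow: a one-copy read of a split block
	 * fails its checksum here, vdev_indirect_read_all() issues reads
	 * for every remaining copy of every split, and once those child
	 * reads complete the redone zio re-enters vdev_indirect_io_done()
	 * with iv_reconstruct set, taking the reconstruction path above.
	 */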
18173a4b1be9SMatthew Ahrens 	vdev_indirect_read_all(zio);
18183a4b1be9SMatthew Ahrens 
18193a4b1be9SMatthew Ahrens 	zio_vdev_io_redone(zio);
18203a4b1be9SMatthew Ahrens }
18213a4b1be9SMatthew Ahrens 
18225cabbc6bSPrashanth Sreenivasa vdev_ops_t vdev_indirect_ops = {
1823a3874b8bSToomas Soome 	.vdev_op_open = vdev_indirect_open,
1824a3874b8bSToomas Soome 	.vdev_op_close = vdev_indirect_close,
1825a3874b8bSToomas Soome 	.vdev_op_asize = vdev_default_asize,
1826a3874b8bSToomas Soome 	.vdev_op_io_start = vdev_indirect_io_start,
1827a3874b8bSToomas Soome 	.vdev_op_io_done = vdev_indirect_io_done,
1828a3874b8bSToomas Soome 	.vdev_op_state_change = NULL,
1829a3874b8bSToomas Soome 	.vdev_op_need_resilver = NULL,
1830a3874b8bSToomas Soome 	.vdev_op_hold = NULL,
1831a3874b8bSToomas Soome 	.vdev_op_rele = NULL,
1832a3874b8bSToomas Soome 	.vdev_op_remap = vdev_indirect_remap,
1833a3874b8bSToomas Soome 	.vdev_op_xlate = NULL,
1834ac04831dSMike Gerdts 	.vdev_op_dumpio = NULL,
1835a3874b8bSToomas Soome 	.vdev_op_type = VDEV_TYPE_INDIRECT,	/* name of this vdev type */
1836a3874b8bSToomas Soome 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
18375cabbc6bSPrashanth Sreenivasa };
1838