15cabbc6bSPrashanth Sreenivasa /* 25cabbc6bSPrashanth Sreenivasa * CDDL HEADER START 35cabbc6bSPrashanth Sreenivasa * 45cabbc6bSPrashanth Sreenivasa * This file and its contents are supplied under the terms of the 55cabbc6bSPrashanth Sreenivasa * Common Development and Distribution License ("CDDL"), version 1.0. 65cabbc6bSPrashanth Sreenivasa * You may only use this file in accordance with the terms of version 75cabbc6bSPrashanth Sreenivasa * 1.0 of the CDDL. 85cabbc6bSPrashanth Sreenivasa * 95cabbc6bSPrashanth Sreenivasa * A full copy of the text of the CDDL should have accompanied this 105cabbc6bSPrashanth Sreenivasa * source. A copy of the CDDL is also available via the Internet at 115cabbc6bSPrashanth Sreenivasa * http://www.illumos.org/license/CDDL. 125cabbc6bSPrashanth Sreenivasa * 135cabbc6bSPrashanth Sreenivasa * CDDL HEADER END 145cabbc6bSPrashanth Sreenivasa */ 155cabbc6bSPrashanth Sreenivasa 165cabbc6bSPrashanth Sreenivasa /* 17814dcd43SSerapheim Dimitropoulos * Copyright (c) 2014, 2019 by Delphix. All rights reserved. 18*ac04831dSMike Gerdts * Copyright 2019 Joyent, Inc. 
195cabbc6bSPrashanth Sreenivasa */ 205cabbc6bSPrashanth Sreenivasa 215cabbc6bSPrashanth Sreenivasa #include <sys/zfs_context.h> 225cabbc6bSPrashanth Sreenivasa #include <sys/spa.h> 235cabbc6bSPrashanth Sreenivasa #include <sys/spa_impl.h> 245cabbc6bSPrashanth Sreenivasa #include <sys/vdev_impl.h> 255cabbc6bSPrashanth Sreenivasa #include <sys/fs/zfs.h> 265cabbc6bSPrashanth Sreenivasa #include <sys/zio.h> 273a4b1be9SMatthew Ahrens #include <sys/zio_checksum.h> 285cabbc6bSPrashanth Sreenivasa #include <sys/metaslab.h> 295cabbc6bSPrashanth Sreenivasa #include <sys/refcount.h> 305cabbc6bSPrashanth Sreenivasa #include <sys/dmu.h> 315cabbc6bSPrashanth Sreenivasa #include <sys/vdev_indirect_mapping.h> 325cabbc6bSPrashanth Sreenivasa #include <sys/dmu_tx.h> 335cabbc6bSPrashanth Sreenivasa #include <sys/dsl_synctask.h> 345cabbc6bSPrashanth Sreenivasa #include <sys/zap.h> 35667ec66fSSerapheim Dimitropoulos #include <sys/abd.h> 36667ec66fSSerapheim Dimitropoulos #include <sys/zthr.h> 375cabbc6bSPrashanth Sreenivasa 385cabbc6bSPrashanth Sreenivasa /* 395cabbc6bSPrashanth Sreenivasa * An indirect vdev corresponds to a vdev that has been removed. Since 405cabbc6bSPrashanth Sreenivasa * we cannot rewrite block pointers of snapshots, etc., we keep a 415cabbc6bSPrashanth Sreenivasa * mapping from old location on the removed device to the new location 425cabbc6bSPrashanth Sreenivasa * on another device in the pool and use this mapping whenever we need 435cabbc6bSPrashanth Sreenivasa * to access the DVA. Unfortunately, this mapping did not respect 445cabbc6bSPrashanth Sreenivasa * logical block boundaries when it was first created, and so a DVA on 455cabbc6bSPrashanth Sreenivasa * this indirect vdev may be "split" into multiple sections that each 465cabbc6bSPrashanth Sreenivasa * map to a different location. As a consequence, not all DVAs can be 475cabbc6bSPrashanth Sreenivasa * translated to an equivalent new DVA. 
Instead we must provide a 485cabbc6bSPrashanth Sreenivasa * "vdev_remap" operation that executes a callback on each contiguous 495cabbc6bSPrashanth Sreenivasa * segment of the new location. This function is used in multiple ways: 505cabbc6bSPrashanth Sreenivasa * 513a4b1be9SMatthew Ahrens * - i/os to this vdev use the callback to determine where the 523a4b1be9SMatthew Ahrens * data is now located, and issue child i/os for each segment's new 533a4b1be9SMatthew Ahrens * location. 545cabbc6bSPrashanth Sreenivasa * 553a4b1be9SMatthew Ahrens * - frees and claims to this vdev use the callback to free or claim 565cabbc6bSPrashanth Sreenivasa * each mapped segment. (Note that we don't actually need to claim 575cabbc6bSPrashanth Sreenivasa * log blocks on indirect vdevs, because we don't allocate to 585cabbc6bSPrashanth Sreenivasa * removing vdevs. However, zdb uses zio_claim() for its leak 595cabbc6bSPrashanth Sreenivasa * detection.) 605cabbc6bSPrashanth Sreenivasa */ 615cabbc6bSPrashanth Sreenivasa 625cabbc6bSPrashanth Sreenivasa /* 635cabbc6bSPrashanth Sreenivasa * "Big theory statement" for how we mark blocks obsolete. 645cabbc6bSPrashanth Sreenivasa * 655cabbc6bSPrashanth Sreenivasa * When a block on an indirect vdev is freed or remapped, a section of 665cabbc6bSPrashanth Sreenivasa * that vdev's mapping may no longer be referenced (aka "obsolete"). We 675cabbc6bSPrashanth Sreenivasa * keep track of how much of each mapping entry is obsolete. When 685cabbc6bSPrashanth Sreenivasa * an entry becomes completely obsolete, we can remove it, thus reducing 695cabbc6bSPrashanth Sreenivasa * the memory used by the mapping. 
The complete picture of obsolescence 705cabbc6bSPrashanth Sreenivasa * is given by the following data structures, described below: 715cabbc6bSPrashanth Sreenivasa * - the entry-specific obsolete count 725cabbc6bSPrashanth Sreenivasa * - the vdev-specific obsolete spacemap 735cabbc6bSPrashanth Sreenivasa * - the pool-specific obsolete bpobj 745cabbc6bSPrashanth Sreenivasa * 755cabbc6bSPrashanth Sreenivasa * == On disk data structures used == 765cabbc6bSPrashanth Sreenivasa * 775cabbc6bSPrashanth Sreenivasa * We track the obsolete space for the pool using several objects. Each 785cabbc6bSPrashanth Sreenivasa * of these objects is created on demand and freed when no longer 795cabbc6bSPrashanth Sreenivasa * needed, and is assumed to be empty if it does not exist. 805cabbc6bSPrashanth Sreenivasa * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. 815cabbc6bSPrashanth Sreenivasa * 825cabbc6bSPrashanth Sreenivasa * - Each vic_mapping_object (associated with an indirect vdev) can 835cabbc6bSPrashanth Sreenivasa * have a vimp_counts_object. This is an array of uint32_t's 845cabbc6bSPrashanth Sreenivasa * with the same number of entries as the vic_mapping_object. When 855cabbc6bSPrashanth Sreenivasa * the mapping is condensed, entries from the vic_obsolete_sm_object 865cabbc6bSPrashanth Sreenivasa * (see below) are folded into the counts. Therefore, each 875cabbc6bSPrashanth Sreenivasa * obsolete_counts entry tells us the number of bytes in the 885cabbc6bSPrashanth Sreenivasa * corresponding mapping entry that were not referenced when the 895cabbc6bSPrashanth Sreenivasa * mapping was last condensed. 905cabbc6bSPrashanth Sreenivasa * 915cabbc6bSPrashanth Sreenivasa * - Each indirect or removing vdev can have a vic_obsolete_sm_object. 
925cabbc6bSPrashanth Sreenivasa * This is a space map containing an alloc entry for every DVA that 935cabbc6bSPrashanth Sreenivasa * has been obsoleted since the last time this indirect vdev was 945cabbc6bSPrashanth Sreenivasa * condensed. We use this object in order to improve performance 955cabbc6bSPrashanth Sreenivasa * when marking a DVA as obsolete. Instead of modifying an arbitrary 965cabbc6bSPrashanth Sreenivasa * offset of the vimp_counts_object, we only need to append an entry 975cabbc6bSPrashanth Sreenivasa * to the end of this object. When a DVA becomes obsolete, it is 985cabbc6bSPrashanth Sreenivasa * added to the obsolete space map. This happens when the DVA is 995cabbc6bSPrashanth Sreenivasa * freed, remapped and not referenced by a snapshot, or the last 1005cabbc6bSPrashanth Sreenivasa * snapshot referencing it is destroyed. 1015cabbc6bSPrashanth Sreenivasa * 1025cabbc6bSPrashanth Sreenivasa * - Each dataset can have a ds_remap_deadlist object. This is a 1035cabbc6bSPrashanth Sreenivasa * deadlist object containing all blocks that were remapped in this 1045cabbc6bSPrashanth Sreenivasa * dataset but referenced in a previous snapshot. Blocks can *only* 1055cabbc6bSPrashanth Sreenivasa * appear on this list if they were remapped (dsl_dataset_block_remapped); 1065cabbc6bSPrashanth Sreenivasa * blocks that were killed in a head dataset are put on the normal 1075cabbc6bSPrashanth Sreenivasa * ds_deadlist and marked obsolete when they are freed. 1085cabbc6bSPrashanth Sreenivasa * 1095cabbc6bSPrashanth Sreenivasa * - The pool can have a dp_obsolete_bpobj. This is a list of blocks 1105cabbc6bSPrashanth Sreenivasa * in the pool that need to be marked obsolete. When a snapshot is 1115cabbc6bSPrashanth Sreenivasa * destroyed, we move some of the ds_remap_deadlist to the obsolete 1125cabbc6bSPrashanth Sreenivasa * bpobj (see dsl_destroy_snapshot_handle_remaps()). 
We then 1135cabbc6bSPrashanth Sreenivasa * asynchronously process the obsolete bpobj, moving its entries to 1145cabbc6bSPrashanth Sreenivasa * the specific vdevs' obsolete space maps. 1155cabbc6bSPrashanth Sreenivasa * 1165cabbc6bSPrashanth Sreenivasa * == Summary of how we mark blocks as obsolete == 1175cabbc6bSPrashanth Sreenivasa * 1185cabbc6bSPrashanth Sreenivasa * - When freeing a block: if any DVA is on an indirect vdev, append to 1195cabbc6bSPrashanth Sreenivasa * vic_obsolete_sm_object. 1205cabbc6bSPrashanth Sreenivasa * - When remapping a block, add dva to ds_remap_deadlist (if prev snap 1215cabbc6bSPrashanth Sreenivasa * references; otherwise append to vic_obsolete_sm_object). 1225cabbc6bSPrashanth Sreenivasa * - When freeing a snapshot: move parts of ds_remap_deadlist to 1235cabbc6bSPrashanth Sreenivasa * dp_obsolete_bpobj (same algorithm as ds_deadlist). 1245cabbc6bSPrashanth Sreenivasa * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to 1255cabbc6bSPrashanth Sreenivasa * individual vdev's vic_obsolete_sm_object. 1265cabbc6bSPrashanth Sreenivasa */ 1275cabbc6bSPrashanth Sreenivasa 1285cabbc6bSPrashanth Sreenivasa /* 1295cabbc6bSPrashanth Sreenivasa * "Big theory statement" for how we condense indirect vdevs. 1305cabbc6bSPrashanth Sreenivasa * 1315cabbc6bSPrashanth Sreenivasa * Condensing an indirect vdev's mapping is the process of determining 1325cabbc6bSPrashanth Sreenivasa * the precise counts of obsolete space for each mapping entry (by 1335cabbc6bSPrashanth Sreenivasa * integrating the obsolete spacemap into the obsolete counts) and 1345cabbc6bSPrashanth Sreenivasa * writing out a new mapping that contains only referenced entries. 1355cabbc6bSPrashanth Sreenivasa * 1365cabbc6bSPrashanth Sreenivasa * We condense a vdev when we expect the mapping to shrink (see 1375cabbc6bSPrashanth Sreenivasa * vdev_indirect_should_condense()), but only perform one condense at a 1385cabbc6bSPrashanth Sreenivasa * time to limit the memory usage. 
In addition, we use a separate 1395cabbc6bSPrashanth Sreenivasa * open-context thread (spa_condense_indirect_thread) to incrementally 1405cabbc6bSPrashanth Sreenivasa * create the new mapping object in a way that minimizes the impact on 1415cabbc6bSPrashanth Sreenivasa * the rest of the system. 1425cabbc6bSPrashanth Sreenivasa * 1435cabbc6bSPrashanth Sreenivasa * == Generating a new mapping == 1445cabbc6bSPrashanth Sreenivasa * 1455cabbc6bSPrashanth Sreenivasa * To generate a new mapping, we follow these steps: 1465cabbc6bSPrashanth Sreenivasa * 1475cabbc6bSPrashanth Sreenivasa * 1. Save the old obsolete space map and create a new mapping object 1485cabbc6bSPrashanth Sreenivasa * (see spa_condense_indirect_start_sync()). This initializes the 1495cabbc6bSPrashanth Sreenivasa * spa_condensing_indirect_phys with the "previous obsolete space map", 1505cabbc6bSPrashanth Sreenivasa * which is now read only. Newly obsolete DVAs will be added to a 1515cabbc6bSPrashanth Sreenivasa * new (initially empty) obsolete space map, and will not be 1525cabbc6bSPrashanth Sreenivasa * considered as part of this condense operation. 1535cabbc6bSPrashanth Sreenivasa * 1545cabbc6bSPrashanth Sreenivasa * 2. Construct in memory the precise counts of obsolete space for each 1555cabbc6bSPrashanth Sreenivasa * mapping entry, by incorporating the obsolete space map into the 1565cabbc6bSPrashanth Sreenivasa * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) 1575cabbc6bSPrashanth Sreenivasa * 1585cabbc6bSPrashanth Sreenivasa * 3. Iterate through each mapping entry, writing to the new mapping any 1595cabbc6bSPrashanth Sreenivasa * entries that are not completely obsolete (i.e. which don't have 1605cabbc6bSPrashanth Sreenivasa * obsolete count == mapping length). (See 1615cabbc6bSPrashanth Sreenivasa * spa_condense_indirect_generate_new_mapping().) 1625cabbc6bSPrashanth Sreenivasa * 1635cabbc6bSPrashanth Sreenivasa * 4. 
Destroy the old mapping object and switch over to the new one 1645cabbc6bSPrashanth Sreenivasa * (spa_condense_indirect_complete_sync). 1655cabbc6bSPrashanth Sreenivasa * 1665cabbc6bSPrashanth Sreenivasa * == Restarting from failure == 1675cabbc6bSPrashanth Sreenivasa * 1685cabbc6bSPrashanth Sreenivasa * To restart the condense when we import/open the pool, we must start 1695cabbc6bSPrashanth Sreenivasa * at the 2nd step above: reconstruct the precise counts in memory, 1705cabbc6bSPrashanth Sreenivasa * based on the space map + counts. Then in the 3rd step, we start 1715cabbc6bSPrashanth Sreenivasa * iterating where we left off: at vimp_max_offset of the new mapping 1725cabbc6bSPrashanth Sreenivasa * object. 1735cabbc6bSPrashanth Sreenivasa */ 1745cabbc6bSPrashanth Sreenivasa 1755cabbc6bSPrashanth Sreenivasa boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE; 1765cabbc6bSPrashanth Sreenivasa 1775cabbc6bSPrashanth Sreenivasa /* 1785cabbc6bSPrashanth Sreenivasa * Condense if at least this percent of the bytes in the mapping is 1795cabbc6bSPrashanth Sreenivasa * obsolete. With the default of 25%, the amount of space mapped 1805cabbc6bSPrashanth Sreenivasa * will be reduced to 1% of its original size after at most 16 1815cabbc6bSPrashanth Sreenivasa * condenses. Higher values will condense less often (causing less 1825cabbc6bSPrashanth Sreenivasa * i/o); lower values will reduce the mapping size more quickly. 1835cabbc6bSPrashanth Sreenivasa */ 1845cabbc6bSPrashanth Sreenivasa int zfs_indirect_condense_obsolete_pct = 25; 1855cabbc6bSPrashanth Sreenivasa 1865cabbc6bSPrashanth Sreenivasa /* 1875cabbc6bSPrashanth Sreenivasa * Condense if the obsolete space map takes up more than this amount of 1885cabbc6bSPrashanth Sreenivasa * space on disk (logically). 
This limits the amount of disk space 1895cabbc6bSPrashanth Sreenivasa * consumed by the obsolete space map; the default of 1GB is small enough 1905cabbc6bSPrashanth Sreenivasa * that we typically don't mind "wasting" it. 1915cabbc6bSPrashanth Sreenivasa */ 1925cabbc6bSPrashanth Sreenivasa uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; 1935cabbc6bSPrashanth Sreenivasa 1945cabbc6bSPrashanth Sreenivasa /* 1955cabbc6bSPrashanth Sreenivasa * Don't bother condensing if the mapping uses less than this amount of 1965cabbc6bSPrashanth Sreenivasa * memory. The default of 128KB is considered a "trivial" amount of 1975cabbc6bSPrashanth Sreenivasa * memory and not worth reducing. 1985cabbc6bSPrashanth Sreenivasa */ 1995cabbc6bSPrashanth Sreenivasa uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; 2005cabbc6bSPrashanth Sreenivasa 2015cabbc6bSPrashanth Sreenivasa /* 2025cabbc6bSPrashanth Sreenivasa * This is used by the test suite so that it can ensure that certain 2035cabbc6bSPrashanth Sreenivasa * actions happen while in the middle of a condense (which might otherwise 2045cabbc6bSPrashanth Sreenivasa * complete too quickly). If used to reduce the performance impact of 2055cabbc6bSPrashanth Sreenivasa * condensing in production, a maximum value of 1 should be sufficient. 2065cabbc6bSPrashanth Sreenivasa */ 2075cabbc6bSPrashanth Sreenivasa int zfs_condense_indirect_commit_entry_delay_ticks = 0; 2085cabbc6bSPrashanth Sreenivasa 2093a4b1be9SMatthew Ahrens /* 210a21fe349SBrian Behlendorf * If an indirect split block contains more than this many possible unique 211a21fe349SBrian Behlendorf * combinations when being reconstructed, consider it too computationally 212a21fe349SBrian Behlendorf * expensive to check them all. Instead, try at most 100 randomly-selected 213a21fe349SBrian Behlendorf * combinations each time the block is accessed. 
This allows all segment 214a21fe349SBrian Behlendorf * copies to participate fairly in the reconstruction when all combinations 215a21fe349SBrian Behlendorf * cannot be checked and prevents repeated use of one bad copy. 216a21fe349SBrian Behlendorf */ 217a21fe349SBrian Behlendorf int zfs_reconstruct_indirect_combinations_max = 256; 218a21fe349SBrian Behlendorf 219a21fe349SBrian Behlendorf 220a21fe349SBrian Behlendorf /* 221a21fe349SBrian Behlendorf * Enable to simulate damaged segments and validate reconstruction. 222a21fe349SBrian Behlendorf * Used by ztest 2233a4b1be9SMatthew Ahrens */ 224a21fe349SBrian Behlendorf unsigned long zfs_reconstruct_indirect_damage_fraction = 0; 2253a4b1be9SMatthew Ahrens 2263a4b1be9SMatthew Ahrens /* 2273a4b1be9SMatthew Ahrens * The indirect_child_t represents the vdev that we will read from, when we 2283a4b1be9SMatthew Ahrens * need to read all copies of the data (e.g. for scrub or reconstruction). 2293a4b1be9SMatthew Ahrens * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), 2303a4b1be9SMatthew Ahrens * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, 2313a4b1be9SMatthew Ahrens * ic_vdev is a child of the mirror. 2323a4b1be9SMatthew Ahrens */ 2333a4b1be9SMatthew Ahrens typedef struct indirect_child { 2343a4b1be9SMatthew Ahrens abd_t *ic_data; 2353a4b1be9SMatthew Ahrens vdev_t *ic_vdev; 236a21fe349SBrian Behlendorf 237a21fe349SBrian Behlendorf /* 238a21fe349SBrian Behlendorf * ic_duplicate is NULL when the ic_data contents are unique, when it 239a21fe349SBrian Behlendorf * is determined to be a duplicate it references the primary child. 
240a21fe349SBrian Behlendorf */ 241a21fe349SBrian Behlendorf struct indirect_child *ic_duplicate; 242a21fe349SBrian Behlendorf list_node_t ic_node; /* node on is_unique_child */ 2433a4b1be9SMatthew Ahrens } indirect_child_t; 2443a4b1be9SMatthew Ahrens 2453a4b1be9SMatthew Ahrens /* 2463a4b1be9SMatthew Ahrens * The indirect_split_t represents one mapped segment of an i/o to the 2473a4b1be9SMatthew Ahrens * indirect vdev. For non-split (contiguously-mapped) blocks, there will be 2483a4b1be9SMatthew Ahrens * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. 2493a4b1be9SMatthew Ahrens * For split blocks, there will be several of these. 2503a4b1be9SMatthew Ahrens */ 2513a4b1be9SMatthew Ahrens typedef struct indirect_split { 2523a4b1be9SMatthew Ahrens list_node_t is_node; /* link on iv_splits */ 2533a4b1be9SMatthew Ahrens 2543a4b1be9SMatthew Ahrens /* 2553a4b1be9SMatthew Ahrens * is_split_offset is the offset into the i/o. 2563a4b1be9SMatthew Ahrens * This is the sum of the previous splits' is_size's. 2573a4b1be9SMatthew Ahrens */ 2583a4b1be9SMatthew Ahrens uint64_t is_split_offset; 2593a4b1be9SMatthew Ahrens 2603a4b1be9SMatthew Ahrens vdev_t *is_vdev; /* top-level vdev */ 2613a4b1be9SMatthew Ahrens uint64_t is_target_offset; /* offset on is_vdev */ 2623a4b1be9SMatthew Ahrens uint64_t is_size; 2633a4b1be9SMatthew Ahrens int is_children; /* number of entries in is_child[] */ 264a21fe349SBrian Behlendorf int is_unique_children; /* number of entries in is_unique_child */ 265a21fe349SBrian Behlendorf list_t is_unique_child; 2663a4b1be9SMatthew Ahrens 2673a4b1be9SMatthew Ahrens /* 2683a4b1be9SMatthew Ahrens * is_good_child is the child that we are currently using to 2693a4b1be9SMatthew Ahrens * attempt reconstruction. 
2703a4b1be9SMatthew Ahrens */ 271a21fe349SBrian Behlendorf indirect_child_t *is_good_child; 2723a4b1be9SMatthew Ahrens 2733a4b1be9SMatthew Ahrens indirect_child_t is_child[1]; /* variable-length */ 2743a4b1be9SMatthew Ahrens } indirect_split_t; 2753a4b1be9SMatthew Ahrens 2763a4b1be9SMatthew Ahrens /* 2773a4b1be9SMatthew Ahrens * The indirect_vsd_t is associated with each i/o to the indirect vdev. 2783a4b1be9SMatthew Ahrens * It is the "Vdev-Specific Data" in the zio_t's io_vsd. 2793a4b1be9SMatthew Ahrens */ 2803a4b1be9SMatthew Ahrens typedef struct indirect_vsd { 2813a4b1be9SMatthew Ahrens boolean_t iv_split_block; 2823a4b1be9SMatthew Ahrens boolean_t iv_reconstruct; 283a21fe349SBrian Behlendorf uint64_t iv_unique_combinations; 284a21fe349SBrian Behlendorf uint64_t iv_attempts; 285a21fe349SBrian Behlendorf uint64_t iv_attempts_max; 2863a4b1be9SMatthew Ahrens 2873a4b1be9SMatthew Ahrens list_t iv_splits; /* list of indirect_split_t's */ 2883a4b1be9SMatthew Ahrens } indirect_vsd_t; 2893a4b1be9SMatthew Ahrens 2903a4b1be9SMatthew Ahrens static void 2913a4b1be9SMatthew Ahrens vdev_indirect_map_free(zio_t *zio) 2923a4b1be9SMatthew Ahrens { 2933a4b1be9SMatthew Ahrens indirect_vsd_t *iv = zio->io_vsd; 2943a4b1be9SMatthew Ahrens 2953a4b1be9SMatthew Ahrens indirect_split_t *is; 2963a4b1be9SMatthew Ahrens while ((is = list_head(&iv->iv_splits)) != NULL) { 2973a4b1be9SMatthew Ahrens for (int c = 0; c < is->is_children; c++) { 2983a4b1be9SMatthew Ahrens indirect_child_t *ic = &is->is_child[c]; 2993a4b1be9SMatthew Ahrens if (ic->ic_data != NULL) 3003a4b1be9SMatthew Ahrens abd_free(ic->ic_data); 3013a4b1be9SMatthew Ahrens } 3023a4b1be9SMatthew Ahrens list_remove(&iv->iv_splits, is); 303a21fe349SBrian Behlendorf 304a21fe349SBrian Behlendorf indirect_child_t *ic; 305a21fe349SBrian Behlendorf while ((ic = list_head(&is->is_unique_child)) != NULL) 306a21fe349SBrian Behlendorf list_remove(&is->is_unique_child, ic); 307a21fe349SBrian Behlendorf 308a21fe349SBrian Behlendorf 
list_destroy(&is->is_unique_child); 309a21fe349SBrian Behlendorf 3103a4b1be9SMatthew Ahrens kmem_free(is, 3113a4b1be9SMatthew Ahrens offsetof(indirect_split_t, is_child[is->is_children])); 3123a4b1be9SMatthew Ahrens } 3133a4b1be9SMatthew Ahrens kmem_free(iv, sizeof (*iv)); 3143a4b1be9SMatthew Ahrens } 3153a4b1be9SMatthew Ahrens 3163a4b1be9SMatthew Ahrens static const zio_vsd_ops_t vdev_indirect_vsd_ops = { 3173a4b1be9SMatthew Ahrens vdev_indirect_map_free, 3183a4b1be9SMatthew Ahrens zio_vsd_default_cksum_report 3193a4b1be9SMatthew Ahrens }; 3205cabbc6bSPrashanth Sreenivasa /* 32186714001SSerapheim Dimitropoulos * Mark the given offset and size as being obsolete. 3225cabbc6bSPrashanth Sreenivasa */ 3235cabbc6bSPrashanth Sreenivasa void 32486714001SSerapheim Dimitropoulos vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) 3255cabbc6bSPrashanth Sreenivasa { 3265cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 32786714001SSerapheim Dimitropoulos 3285cabbc6bSPrashanth Sreenivasa ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); 3295cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); 3305cabbc6bSPrashanth Sreenivasa ASSERT(size > 0); 3315cabbc6bSPrashanth Sreenivasa VERIFY(vdev_indirect_mapping_entry_for_offset( 3325cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_mapping, offset) != NULL); 3335cabbc6bSPrashanth Sreenivasa 3345cabbc6bSPrashanth Sreenivasa if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 3355cabbc6bSPrashanth Sreenivasa mutex_enter(&vd->vdev_obsolete_lock); 3365cabbc6bSPrashanth Sreenivasa range_tree_add(vd->vdev_obsolete_segments, offset, size); 3375cabbc6bSPrashanth Sreenivasa mutex_exit(&vd->vdev_obsolete_lock); 33886714001SSerapheim Dimitropoulos vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); 3395cabbc6bSPrashanth Sreenivasa } 3405cabbc6bSPrashanth Sreenivasa } 3415cabbc6bSPrashanth Sreenivasa 3425cabbc6bSPrashanth Sreenivasa /* 3435cabbc6bSPrashanth 
Sreenivasa * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This 3445cabbc6bSPrashanth Sreenivasa * wrapper is provided because the DMU does not know about vdev_t's and 3455cabbc6bSPrashanth Sreenivasa * cannot directly call vdev_indirect_mark_obsolete. 3465cabbc6bSPrashanth Sreenivasa */ 3475cabbc6bSPrashanth Sreenivasa void 3485cabbc6bSPrashanth Sreenivasa spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, 3495cabbc6bSPrashanth Sreenivasa uint64_t size, dmu_tx_t *tx) 3505cabbc6bSPrashanth Sreenivasa { 3515cabbc6bSPrashanth Sreenivasa vdev_t *vd = vdev_lookup_top(spa, vdev_id); 3525cabbc6bSPrashanth Sreenivasa ASSERT(dmu_tx_is_syncing(tx)); 3535cabbc6bSPrashanth Sreenivasa 3545cabbc6bSPrashanth Sreenivasa /* The DMU can only remap indirect vdevs. */ 3555cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 35686714001SSerapheim Dimitropoulos vdev_indirect_mark_obsolete(vd, offset, size); 3575cabbc6bSPrashanth Sreenivasa } 3585cabbc6bSPrashanth Sreenivasa 3595cabbc6bSPrashanth Sreenivasa static spa_condensing_indirect_t * 3605cabbc6bSPrashanth Sreenivasa spa_condensing_indirect_create(spa_t *spa) 3615cabbc6bSPrashanth Sreenivasa { 3625cabbc6bSPrashanth Sreenivasa spa_condensing_indirect_phys_t *scip = 3635cabbc6bSPrashanth Sreenivasa &spa->spa_condensing_indirect_phys; 3645cabbc6bSPrashanth Sreenivasa spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); 3655cabbc6bSPrashanth Sreenivasa objset_t *mos = spa->spa_meta_objset; 3665cabbc6bSPrashanth Sreenivasa 3675cabbc6bSPrashanth Sreenivasa for (int i = 0; i < TXG_SIZE; i++) { 3685cabbc6bSPrashanth Sreenivasa list_create(&sci->sci_new_mapping_entries[i], 3695cabbc6bSPrashanth Sreenivasa sizeof (vdev_indirect_mapping_entry_t), 3705cabbc6bSPrashanth Sreenivasa offsetof(vdev_indirect_mapping_entry_t, vime_node)); 3715cabbc6bSPrashanth Sreenivasa } 3725cabbc6bSPrashanth Sreenivasa 3735cabbc6bSPrashanth Sreenivasa sci->sci_new_mapping 
= 3745cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); 3755cabbc6bSPrashanth Sreenivasa 3765cabbc6bSPrashanth Sreenivasa return (sci); 3775cabbc6bSPrashanth Sreenivasa } 3785cabbc6bSPrashanth Sreenivasa 3795cabbc6bSPrashanth Sreenivasa static void 3805cabbc6bSPrashanth Sreenivasa spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) 3815cabbc6bSPrashanth Sreenivasa { 3825cabbc6bSPrashanth Sreenivasa for (int i = 0; i < TXG_SIZE; i++) 3835cabbc6bSPrashanth Sreenivasa list_destroy(&sci->sci_new_mapping_entries[i]); 3845cabbc6bSPrashanth Sreenivasa 3855cabbc6bSPrashanth Sreenivasa if (sci->sci_new_mapping != NULL) 3865cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_close(sci->sci_new_mapping); 3875cabbc6bSPrashanth Sreenivasa 3885cabbc6bSPrashanth Sreenivasa kmem_free(sci, sizeof (*sci)); 3895cabbc6bSPrashanth Sreenivasa } 3905cabbc6bSPrashanth Sreenivasa 3915cabbc6bSPrashanth Sreenivasa boolean_t 3925cabbc6bSPrashanth Sreenivasa vdev_indirect_should_condense(vdev_t *vd) 3935cabbc6bSPrashanth Sreenivasa { 3945cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 3955cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 3965cabbc6bSPrashanth Sreenivasa 3975cabbc6bSPrashanth Sreenivasa ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); 3985cabbc6bSPrashanth Sreenivasa 3995cabbc6bSPrashanth Sreenivasa if (!zfs_condense_indirect_vdevs_enable) 4005cabbc6bSPrashanth Sreenivasa return (B_FALSE); 4015cabbc6bSPrashanth Sreenivasa 4025cabbc6bSPrashanth Sreenivasa /* 4035cabbc6bSPrashanth Sreenivasa * We can only condense one indirect vdev at a time. 
4045cabbc6bSPrashanth Sreenivasa */ 4055cabbc6bSPrashanth Sreenivasa if (spa->spa_condensing_indirect != NULL) 4065cabbc6bSPrashanth Sreenivasa return (B_FALSE); 4075cabbc6bSPrashanth Sreenivasa 4085cabbc6bSPrashanth Sreenivasa if (spa_shutting_down(spa)) 4095cabbc6bSPrashanth Sreenivasa return (B_FALSE); 4105cabbc6bSPrashanth Sreenivasa 4115cabbc6bSPrashanth Sreenivasa /* 4125cabbc6bSPrashanth Sreenivasa * The mapping object size must not change while we are 4135cabbc6bSPrashanth Sreenivasa * condensing, so we can only condense indirect vdevs 4145cabbc6bSPrashanth Sreenivasa * (not vdevs that are still in the middle of being removed). 4155cabbc6bSPrashanth Sreenivasa */ 4165cabbc6bSPrashanth Sreenivasa if (vd->vdev_ops != &vdev_indirect_ops) 4175cabbc6bSPrashanth Sreenivasa return (B_FALSE); 4185cabbc6bSPrashanth Sreenivasa 4195cabbc6bSPrashanth Sreenivasa /* 4205cabbc6bSPrashanth Sreenivasa * If nothing new has been marked obsolete, there is no 4215cabbc6bSPrashanth Sreenivasa * point in condensing. 
4225cabbc6bSPrashanth Sreenivasa */ 4235cabbc6bSPrashanth Sreenivasa if (vd->vdev_obsolete_sm == NULL) { 4245cabbc6bSPrashanth Sreenivasa ASSERT0(vdev_obsolete_sm_object(vd)); 4255cabbc6bSPrashanth Sreenivasa return (B_FALSE); 4265cabbc6bSPrashanth Sreenivasa } 4275cabbc6bSPrashanth Sreenivasa 4285cabbc6bSPrashanth Sreenivasa ASSERT(vd->vdev_obsolete_sm != NULL); 4295cabbc6bSPrashanth Sreenivasa 4305cabbc6bSPrashanth Sreenivasa ASSERT3U(vdev_obsolete_sm_object(vd), ==, 4315cabbc6bSPrashanth Sreenivasa space_map_object(vd->vdev_obsolete_sm)); 4325cabbc6bSPrashanth Sreenivasa 4335cabbc6bSPrashanth Sreenivasa uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); 4345cabbc6bSPrashanth Sreenivasa uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); 4355cabbc6bSPrashanth Sreenivasa uint64_t mapping_size = vdev_indirect_mapping_size(vim); 4365cabbc6bSPrashanth Sreenivasa uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); 4375cabbc6bSPrashanth Sreenivasa 4385cabbc6bSPrashanth Sreenivasa ASSERT3U(bytes_obsolete, <=, bytes_mapped); 4395cabbc6bSPrashanth Sreenivasa 4405cabbc6bSPrashanth Sreenivasa /* 4415cabbc6bSPrashanth Sreenivasa * If a high percentage of the bytes that are mapped have become 4425cabbc6bSPrashanth Sreenivasa * obsolete, condense (unless the mapping is already small enough). 4435cabbc6bSPrashanth Sreenivasa * This has a good chance of reducing the amount of memory used 4445cabbc6bSPrashanth Sreenivasa * by the mapping. 
4455cabbc6bSPrashanth Sreenivasa */ 4465cabbc6bSPrashanth Sreenivasa if (bytes_obsolete * 100 / bytes_mapped >= 4475cabbc6bSPrashanth Sreenivasa zfs_indirect_condense_obsolete_pct && 4485cabbc6bSPrashanth Sreenivasa mapping_size > zfs_condense_min_mapping_bytes) { 4495cabbc6bSPrashanth Sreenivasa zfs_dbgmsg("should condense vdev %llu because obsolete " 4505cabbc6bSPrashanth Sreenivasa "spacemap covers %d%% of %lluMB mapping", 4515cabbc6bSPrashanth Sreenivasa (u_longlong_t)vd->vdev_id, 4525cabbc6bSPrashanth Sreenivasa (int)(bytes_obsolete * 100 / bytes_mapped), 4535cabbc6bSPrashanth Sreenivasa (u_longlong_t)bytes_mapped / 1024 / 1024); 4545cabbc6bSPrashanth Sreenivasa return (B_TRUE); 4555cabbc6bSPrashanth Sreenivasa } 4565cabbc6bSPrashanth Sreenivasa 4575cabbc6bSPrashanth Sreenivasa /* 4585cabbc6bSPrashanth Sreenivasa * If the obsolete space map takes up too much space on disk, 4595cabbc6bSPrashanth Sreenivasa * condense in order to free up this disk space. 4605cabbc6bSPrashanth Sreenivasa */ 4615cabbc6bSPrashanth Sreenivasa if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { 4625cabbc6bSPrashanth Sreenivasa zfs_dbgmsg("should condense vdev %llu because obsolete sm " 4635cabbc6bSPrashanth Sreenivasa "length %lluMB >= max size %lluMB", 4645cabbc6bSPrashanth Sreenivasa (u_longlong_t)vd->vdev_id, 4655cabbc6bSPrashanth Sreenivasa (u_longlong_t)obsolete_sm_size / 1024 / 1024, 4665cabbc6bSPrashanth Sreenivasa (u_longlong_t)zfs_condense_max_obsolete_bytes / 4675cabbc6bSPrashanth Sreenivasa 1024 / 1024); 4685cabbc6bSPrashanth Sreenivasa return (B_TRUE); 4695cabbc6bSPrashanth Sreenivasa } 4705cabbc6bSPrashanth Sreenivasa 4715cabbc6bSPrashanth Sreenivasa return (B_FALSE); 4725cabbc6bSPrashanth Sreenivasa } 4735cabbc6bSPrashanth Sreenivasa 4745cabbc6bSPrashanth Sreenivasa /* 4755cabbc6bSPrashanth Sreenivasa * This sync task completes (finishes) a condense, deleting the old 4765cabbc6bSPrashanth Sreenivasa * mapping and replacing it with the new one. 
4775cabbc6bSPrashanth Sreenivasa */ 4785cabbc6bSPrashanth Sreenivasa static void 4795cabbc6bSPrashanth Sreenivasa spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) 4805cabbc6bSPrashanth Sreenivasa { 4815cabbc6bSPrashanth Sreenivasa spa_condensing_indirect_t *sci = arg; 4825cabbc6bSPrashanth Sreenivasa spa_t *spa = dmu_tx_pool(tx)->dp_spa; 4835cabbc6bSPrashanth Sreenivasa spa_condensing_indirect_phys_t *scip = 4845cabbc6bSPrashanth Sreenivasa &spa->spa_condensing_indirect_phys; 4855cabbc6bSPrashanth Sreenivasa vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); 4865cabbc6bSPrashanth Sreenivasa vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 4875cabbc6bSPrashanth Sreenivasa objset_t *mos = spa->spa_meta_objset; 4885cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; 4895cabbc6bSPrashanth Sreenivasa uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); 4905cabbc6bSPrashanth Sreenivasa uint64_t new_count = 4915cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_num_entries(sci->sci_new_mapping); 4925cabbc6bSPrashanth Sreenivasa 4935cabbc6bSPrashanth Sreenivasa ASSERT(dmu_tx_is_syncing(tx)); 4945cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 4955cabbc6bSPrashanth Sreenivasa ASSERT3P(sci, ==, spa->spa_condensing_indirect); 4965cabbc6bSPrashanth Sreenivasa for (int i = 0; i < TXG_SIZE; i++) { 4975cabbc6bSPrashanth Sreenivasa ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); 4985cabbc6bSPrashanth Sreenivasa } 4995cabbc6bSPrashanth Sreenivasa ASSERT(vic->vic_mapping_object != 0); 5005cabbc6bSPrashanth Sreenivasa ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); 5015cabbc6bSPrashanth Sreenivasa ASSERT(scip->scip_next_mapping_object != 0); 5025cabbc6bSPrashanth Sreenivasa ASSERT(scip->scip_prev_obsolete_sm_object != 0); 5035cabbc6bSPrashanth Sreenivasa 5045cabbc6bSPrashanth Sreenivasa /* 5055cabbc6bSPrashanth Sreenivasa * Reset vdev_indirect_mapping to refer to the new 
object. 5065cabbc6bSPrashanth Sreenivasa */ 5075cabbc6bSPrashanth Sreenivasa rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); 5085cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 5095cabbc6bSPrashanth Sreenivasa vd->vdev_indirect_mapping = sci->sci_new_mapping; 5105cabbc6bSPrashanth Sreenivasa rw_exit(&vd->vdev_indirect_rwlock); 5115cabbc6bSPrashanth Sreenivasa 5125cabbc6bSPrashanth Sreenivasa sci->sci_new_mapping = NULL; 5135cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); 5145cabbc6bSPrashanth Sreenivasa vic->vic_mapping_object = scip->scip_next_mapping_object; 5155cabbc6bSPrashanth Sreenivasa scip->scip_next_mapping_object = 0; 5165cabbc6bSPrashanth Sreenivasa 5175cabbc6bSPrashanth Sreenivasa space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); 5185cabbc6bSPrashanth Sreenivasa spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 5195cabbc6bSPrashanth Sreenivasa scip->scip_prev_obsolete_sm_object = 0; 5205cabbc6bSPrashanth Sreenivasa 5215cabbc6bSPrashanth Sreenivasa scip->scip_vdev = 0; 5225cabbc6bSPrashanth Sreenivasa 5235cabbc6bSPrashanth Sreenivasa VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 5245cabbc6bSPrashanth Sreenivasa DMU_POOL_CONDENSING_INDIRECT, tx)); 5255cabbc6bSPrashanth Sreenivasa spa_condensing_indirect_destroy(spa->spa_condensing_indirect); 5265cabbc6bSPrashanth Sreenivasa spa->spa_condensing_indirect = NULL; 5275cabbc6bSPrashanth Sreenivasa 5285cabbc6bSPrashanth Sreenivasa zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " 5295cabbc6bSPrashanth Sreenivasa "new mapping object %llu has %llu entries " 5305cabbc6bSPrashanth Sreenivasa "(was %llu entries)", 5315cabbc6bSPrashanth Sreenivasa vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, 5325cabbc6bSPrashanth Sreenivasa new_count, old_count); 5335cabbc6bSPrashanth Sreenivasa 5345cabbc6bSPrashanth Sreenivasa vdev_config_dirty(spa->spa_root_vdev); 5355cabbc6bSPrashanth Sreenivasa } 

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	/*
	 * Flush everything queued for this txg into the new mapping
	 * object; _add_entries() is expected to consume the list.
	 */
	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping. The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	/* Fully-obsolete entries are never committed (see caller). */
	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	/* Open context: wait for a txg rather than fail the assignment. */
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

/*
 * Walk the old mapping from start_index and commit to the new mapping
 * every entry that is not yet fully obsolete; fully-obsolete entries
 * (obsolete count == entry size) are dropped.  The zthr cancellation
 * flag is checked each iteration so the condense can pause and be
 * resumed later from where it left off.
 */
static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
}

/*
 * zthr "checker" callback: run the condense thread only while a
 * condense is in progress (spa_condensing_indirect is set).
 */
/* ARGSUSED */
static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;

	return (spa->spa_condensing_indirect != NULL);
}

/*
 * zthr "body" callback: perform (or resume) the condense of the vdev
 * recorded in spa_condensing_indirect_phys, then complete it with a
 * sync task unless the zthr was cancelled.
 */
/* ARGSUSED */
static void
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	/*
	 * Build the per-entry obsolete counts: start from the counts
	 * stored with the old mapping, then fold in the obsolete space
	 * map that was captured when the condense started.
	 */
	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping.  Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. _entry_for_offset()
		 * returns a pointer into the vim_entries array. If
		 * max_offset is greater than any of the mappings
		 * contained in the table NULL will be returned and
		 * that indicates we've exhausted our iteration of the
		 * old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts,
	    start_index, zthr);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * If the zthr has received a cancellation signal while running
	 * in generate_new_mapping() or at any point after that, then bail
	 * early. We don't want to complete the condense if the spa is
	 * shutting down.
	 */
	if (zthr_iscancelled(zthr))
		return;

	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
	    spa_condense_indirect_complete_sync, sci, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

/*
 * Sync task to begin the condensing process.
7515cabbc6bSPrashanth Sreenivasa */ 7525cabbc6bSPrashanth Sreenivasa void 7535cabbc6bSPrashanth Sreenivasa spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) 7545cabbc6bSPrashanth Sreenivasa { 7555cabbc6bSPrashanth Sreenivasa spa_t *spa = vd->vdev_spa; 7565cabbc6bSPrashanth Sreenivasa spa_condensing_indirect_phys_t *scip = 7575cabbc6bSPrashanth Sreenivasa &spa->spa_condensing_indirect_phys; 7585cabbc6bSPrashanth Sreenivasa 7595cabbc6bSPrashanth Sreenivasa ASSERT0(scip->scip_next_mapping_object); 7605cabbc6bSPrashanth Sreenivasa ASSERT0(scip->scip_prev_obsolete_sm_object); 7615cabbc6bSPrashanth Sreenivasa ASSERT0(scip->scip_vdev); 7625cabbc6bSPrashanth Sreenivasa ASSERT(dmu_tx_is_syncing(tx)); 7635cabbc6bSPrashanth Sreenivasa ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 7645cabbc6bSPrashanth Sreenivasa ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); 7655cabbc6bSPrashanth Sreenivasa ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); 7665cabbc6bSPrashanth Sreenivasa 7675cabbc6bSPrashanth Sreenivasa uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); 7685cabbc6bSPrashanth Sreenivasa ASSERT(obsolete_sm_obj != 0); 7695cabbc6bSPrashanth Sreenivasa 7705cabbc6bSPrashanth Sreenivasa scip->scip_vdev = vd->vdev_id; 7715cabbc6bSPrashanth Sreenivasa scip->scip_next_mapping_object = 7725cabbc6bSPrashanth Sreenivasa vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); 7735cabbc6bSPrashanth Sreenivasa 7745cabbc6bSPrashanth Sreenivasa scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; 7755cabbc6bSPrashanth Sreenivasa 7765cabbc6bSPrashanth Sreenivasa /* 7775cabbc6bSPrashanth Sreenivasa * We don't need to allocate a new space map object, since 7785cabbc6bSPrashanth Sreenivasa * vdev_indirect_sync_obsolete will allocate one when needed. 
7795cabbc6bSPrashanth Sreenivasa */ 7805cabbc6bSPrashanth Sreenivasa space_map_close(vd->vdev_obsolete_sm); 7815cabbc6bSPrashanth Sreenivasa vd->vdev_obsolete_sm = NULL; 7825cabbc6bSPrashanth Sreenivasa VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, 7835cabbc6bSPrashanth Sreenivasa VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); 7845cabbc6bSPrashanth Sreenivasa 7855cabbc6bSPrashanth Sreenivasa VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, 7865cabbc6bSPrashanth Sreenivasa DMU_POOL_DIRECTORY_OBJECT, 7875cabbc6bSPrashanth Sreenivasa DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), 7885cabbc6bSPrashanth Sreenivasa sizeof (*scip) / sizeof (uint64_t), scip, tx)); 7895cabbc6bSPrashanth Sreenivasa 7905cabbc6bSPrashanth Sreenivasa ASSERT3P(spa->spa_condensing_indirect, ==, NULL); 7915cabbc6bSPrashanth Sreenivasa spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); 7925cabbc6bSPrashanth Sreenivasa 7935cabbc6bSPrashanth Sreenivasa zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " 7945cabbc6bSPrashanth Sreenivasa "posm=%llu nm=%llu", 7955cabbc6bSPrashanth Sreenivasa vd->vdev_id, dmu_tx_get_txg(tx), 7965cabbc6bSPrashanth Sreenivasa (u_longlong_t)scip->scip_prev_obsolete_sm_object, 7975cabbc6bSPrashanth Sreenivasa (u_longlong_t)scip->scip_next_mapping_object); 7985cabbc6bSPrashanth Sreenivasa 799667ec66fSSerapheim Dimitropoulos zthr_wakeup(spa->spa_condense_zthr); 8005cabbc6bSPrashanth Sreenivasa } 8015cabbc6bSPrashanth Sreenivasa 8025cabbc6bSPrashanth Sreenivasa /* 8035cabbc6bSPrashanth Sreenivasa * Sync to the given vdev's obsolete space map any segments that are no longer 8045cabbc6bSPrashanth Sreenivasa * referenced as of the given txg. 8055cabbc6bSPrashanth Sreenivasa * 8065cabbc6bSPrashanth Sreenivasa * If the obsolete space map doesn't exist yet, create and open it. 
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

	ASSERT3U(vic->vic_mapping_object, !=, 0);
	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

	if (vdev_obsolete_sm_object(vd) == 0) {
		/*
		 * First obsolete segment for this vdev: allocate the
		 * space map, record it in the vdev's top-level ZAP, and
		 * bump the feature refcount.
		 */
		uint64_t obsolete_sm_object =
		    space_map_alloc(spa->spa_meta_objset,
		    zfs_vdev_standard_sm_blksz, tx);

		ASSERT(vd->vdev_top_zap != 0);
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);

		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
		    spa->spa_meta_objset, obsolete_sm_object,
		    0, vd->vdev_asize, 0));
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);
	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	/* Append the pending segments, then drain the in-core tree. */
	space_map_write(vd->vdev_obsolete_sm,
	    vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

/*
 * Load any in-progress condense state from the MOS (e.g. at pool
 * import).  ENOENT simply means no condense was in progress; any other
 * zap_lookup() error is returned to the caller.
 */
int
spa_condense_init(spa_t *spa)
{
	int error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	    &spa->spa_condensing_indirect_phys);
	if (error == 0) {
		/* Only writable pools can continue the condense. */
		if (spa_writeable(spa)) {
			spa->spa_condensing_indirect =
			    spa_condensing_indirect_create(spa);
		}
		return (0);
	} else if (error == ENOENT) {
		return (0);
	} else {
		return (error);
	}
}

/*
 * Tear down the in-core condensing state, if any.
 */
void
spa_condense_fini(spa_t *spa)
{
	if (spa->spa_condensing_indirect != NULL) {
		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
		spa->spa_condensing_indirect = NULL;
	}
}

/*
 * Create the condense zthr; its checker/body callbacks decide when it
 * actually does work.
 */
void
spa_start_indirect_condensing_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
	spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
	    spa_condense_indirect_thread, spa);
}

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.
 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
 * exist yet.
 *
 * NOTE(review): the object number is a uint64_t but is returned through
 * an int, which would truncate large object numbers — confirm whether
 * callers only compare against 0 or whether this should return uint64_t.
 */
int
vdev_obsolete_sm_object(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (0);
	}

	uint64_t sm_obj = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);

	/* A missing entry just means no obsolete space map yet. */
	ASSERT(err == 0 || err == ENOENT);

	return (sm_obj);
}

/*
 * Returns B_TRUE iff the vdev's top-level ZAP records that its obsolete
 * counts are precise (VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE set and
 * nonzero).
 */
boolean_t
vdev_obsolete_counts_are_precise(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (B_FALSE);
	}

	uint64_t val = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);

	ASSERT(err == 0 || err == ENOENT);

	return (val != 0);
}

/* Indirect vdevs hold no open device state, so close is a no-op. */
/* ARGSUSED */
static void
vdev_indirect_close(vdev_t *vd)
{
}

/*
 * "Open" an indirect vdev: report sizes derived from the recorded
 * asize plus label space; there is no physical device to open.
 */
/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	*psize = *max_psize = vd->vdev_asize +
	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	*ashift = vd->vdev_ashift;
	return (0);
}

/*
 * One contiguous piece of a remapped extent; vdev_indirect_remap()
 * keeps these on a stack to handle mappings that split an extent.
 */
typedef struct remap_segment {
	vdev_t *rs_vd;			/* vdev this segment lives on */
	uint64_t rs_offset;		/* offset of segment on rs_vd */
	uint64_t rs_asize;		/* allocated size of segment */
	uint64_t rs_split_offset;	/* offset within original extent */
	list_node_t rs_node;		/* linkage on the remap stack */
} remap_segment_t;

remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	/* Caller frees; KM_SLEEP so allocation cannot fail. */
	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
	rs->rs_vd = vd;
	rs->rs_offset = offset;
	rs->rs_asize = asize;
	rs->rs_split_offset = split_offset;
	return (rs);
}

/*
 * Given an indirect vdev and an extent on that vdev, it duplicates the
 * physical entries of the indirect mapping that correspond to the extent
 * to a new array and returns a pointer to it. In addition, copied_entries
 * is populated with the number of mapping entries that were duplicated.
 *
 * Note that the function assumes that the caller holds vdev_indirect_rwlock.
 * This ensures that the mapping won't change due to condensing as we
 * copy over its contents.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t entries = 0;

	/* See block comment above: the mapping must not change under us. */
	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));

	vdev_indirect_mapping_entry_phys_t *first_mapping =
	    vdev_indirect_mapping_entry_for_offset(vim, offset);
	ASSERT3P(first_mapping, !=, NULL);

	/*
	 * Walk consecutive vim_entries until the requested extent is
	 * fully covered, counting how many entries are involved.
	 */
	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
	while (asize > 0) {
		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);

		/* offset must fall within the current entry's source range. */
		ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
		ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);

		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
		uint64_t inner_size = MIN(asize, size - inner_offset);

		offset += inner_size;
		asize -= inner_size;
		entries++;
		m++;
	}

	/* Copy the counted entries; caller frees (see block comment). */
	size_t copy_length = entries * sizeof (*first_mapping);
	duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
	bcopy(first_mapping, duplicate_mappings, copy_length);
	*copied_entries = entries;

	return (duplicate_mappings);
}

/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback. On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs. For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
10225cabbc6bSPrashanth Sreenivasa * 10235cabbc6bSPrashanth Sreenivasa * Because there is a possibility that a DVA section in the indirect device 10245cabbc6bSPrashanth Sreenivasa * has been split into multiple sections in our mapping, we keep track 10255cabbc6bSPrashanth Sreenivasa * of the relevant contiguous segments of the new location (remap_segment_t) 10265cabbc6bSPrashanth Sreenivasa * in a stack. This way we can call the callback for each of the new sections 10275cabbc6bSPrashanth Sreenivasa * created by a single section of the indirect device. Note though, that in 10285cabbc6bSPrashanth Sreenivasa * this scenario the callbacks in each split block won't occur in-order in 10295cabbc6bSPrashanth Sreenivasa * terms of offset, so callers should not make any assumptions about that. 10305cabbc6bSPrashanth Sreenivasa * 10315cabbc6bSPrashanth Sreenivasa * For callbacks that don't handle split blocks and immediately return when 10325cabbc6bSPrashanth Sreenivasa * they encounter them (as is the case for remap_blkptr_cb), the caller can 10335cabbc6bSPrashanth Sreenivasa * assume that its callback will be applied from the first indirect vdev 10345cabbc6bSPrashanth Sreenivasa * encountered to the last one and then the concrete vdev, in that order. 
 */
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
{
	list_t stack;
	spa_t *spa = vd->vdev_spa;

	/* Worklist of segments still to be translated (LIFO). */
	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
	    rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;
		uint64_t num_entries = 0;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
		ASSERT(rs->rs_asize > 0);

		/*
		 * Note: As this function can be called from open context
		 * (e.g. zio_read()), we need the following rwlock to
		 * prevent the mapping from being changed by condensing.
		 *
		 * So we grab the lock and we make a copy of the entries
		 * that are relevant to the extent that we are working on.
		 * Once that is done, we drop the lock and iterate over
		 * our copy of the mapping. Once we are done with the
		 * remap segment and we free it, we also free our copy
		 * of the indirect mapping entries that are relevant to it.
		 *
		 * This way we don't need to wait until the function is
		 * finished with a segment, to condense it. In addition, we
		 * don't need a recursive rwlock for the case that a call to
		 * vdev_indirect_remap() needs to call itself (through the
		 * codepath of its callback) for the same vdev in the middle
		 * of its execution.
		 */
		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
		ASSERT3P(vim, !=, NULL);

		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
		    rs->rs_offset, rs->rs_asize, &num_entries);
		ASSERT3P(mapping, !=, NULL);
		ASSERT3U(num_entries, >, 0);
		rw_exit(&v->vdev_indirect_rwlock);

		for (uint64_t i = 0; i < num_entries; i++) {
			/*
			 * Note: the vdev_indirect_mapping can not change
			 * while we are running.  It only changes while the
			 * removal is in progress, and then only from syncing
			 * context. While a removal is in progress, this
			 * function is only called for frees, which also only
			 * happen from syncing context.
			 */
			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];

			ASSERT3P(m, !=, NULL);
			ASSERT3U(rs->rs_asize, >, 0);

			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);

			ASSERT3U(rs->rs_offset, >=,
			    DVA_MAPPING_GET_SRC_OFFSET(m));
			ASSERT3U(rs->rs_offset, <,
			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
			/* A mapping entry never points back at its own vdev. */
			ASSERT3U(dst_vdev, !=, v->vdev_id);

			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(m);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);

			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
			ASSERT3P(dst_v, !=, NULL);

			/*
			 * If the destination is itself indirect, push the
			 * translated segment so it gets remapped again on a
			 * later iteration of the outer loop.
			 */
			if (dst_v->vdev_ops == &vdev_indirect_ops) {
				list_insert_head(&stack,
				    rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset));

			}

			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
				/*
				 * Note: This clause exists solely for
				 * testing purposes. We use it to ensure that
				 * split blocks work and that the callbacks
				 * using them yield the same result if issued
				 * in reverse order.
				 */
				uint64_t inner_half = inner_size / 2;

				func(rs->rs_split_offset + inner_half, dst_v,
				    dst_offset + inner_offset + inner_half,
				    inner_half, arg);

				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_half, arg);
			} else {
				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_size, arg);
			}

			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
		}
		/* The copied entries must have covered the whole segment. */
		VERIFY0(rs->rs_asize);

		kmem_free(mapping, num_entries * sizeof (*mapping));
		kmem_free(rs, sizeof (remap_segment_t));
	}
	list_destroy(&stack);
}

/*
 * Done callback for child i/os of an indirect vdev: fold the child's
 * error into the parent (keeping the "worst" of the two) and release
 * the borrowed abd that was handed to the child.
 */
static void
vdev_indirect_child_io_done(zio_t *zio)
{
	zio_t *pio = zio->io_private;

	mutex_enter(&pio->io_lock);
	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
	mutex_exit(&pio->io_lock);

	abd_put(zio->io_abd);
}

/*
 * This is a callback for vdev_indirect_remap() which allocates an
 * indirect_split_t for each split segment and adds it to iv_splits.
 */
static void
vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	zio_t *zio = arg;
	indirect_vsd_t *iv = zio->io_vsd;

	ASSERT3P(vd, !=, NULL);

	/*
	 * Only record segments that landed on a concrete vdev;
	 * vdev_indirect_remap() will continue following indirect ones.
	 */
	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	/* Mirrors contribute one indirect_child_t per mirror child. */
	int n = 1;
	if (vd->vdev_ops == &vdev_mirror_ops)
		n = vd->vdev_children;

	/* is_child[] is a flexible trailing array, hence offsetof sizing. */
	indirect_split_t *is =
	    kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);

	is->is_children = n;
	is->is_size = size;
	is->is_split_offset = split_offset;
	is->is_target_offset = offset;
	is->is_vdev = vd;
	list_create(&is->is_unique_child, sizeof (indirect_child_t),
	    offsetof(indirect_child_t, ic_node));

	/*
	 * Note that we only consider multiple copies of the data for
	 * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
	 * though they use the same ops as mirror, because there's only one
	 * "good" copy under the replacing/spare.
	 */
	if (vd->vdev_ops == &vdev_mirror_ops) {
		for (int i = 0; i < n; i++) {
			is->is_child[i].ic_vdev = vd->vdev_child[i];
			list_link_init(&is->is_child[i].ic_node);
		}
	} else {
		is->is_child[0].ic_vdev = vd;
	}

	list_insert_tail(&iv->iv_splits, is);
}

/*
 * Done callback for per-copy split reads: on error, free and NULL out
 * ic_data so reconstruction knows this copy is unavailable.
 */
static void
vdev_indirect_read_split_done(zio_t *zio)
{
	indirect_child_t *ic = zio->io_private;

	if (zio->io_error != 0) {
		/*
		 * Clear ic_data to indicate that we do not have data for this
		 * child.
		 */
		abd_free(ic->ic_data);
		ic->ic_data = NULL;
	}
}

/*
 * Issue reads for all copies (mirror children) of all splits.
 */
static void
vdev_indirect_read_all(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is)) {
		for (int i = 0; i < is->is_children; i++) {
			indirect_child_t *ic = &is->is_child[i];

			if (!vdev_readable(ic->ic_vdev))
				continue;

			/*
			 * Note, we may read from a child whose DTL
			 * indicates that the data may not be present here.
			 * While this might result in a few i/os that will
			 * likely return incorrect data, it simplifies the
			 * code since we can treat scrub and resilver
			 * identically.  (The incorrect data will be
			 * detected and ignored when we verify the
			 * checksum.)
			 */

			/* Each copy gets its own buffer of the segment size. */
			ic->ic_data = abd_alloc_sametype(zio->io_abd,
			    is->is_size);
			ic->ic_duplicate = NULL;

			zio_nowait(zio_vdev_child_io(zio, NULL,
			    ic->ic_vdev, is->is_target_offset, ic->ic_data,
			    is->is_size, zio->io_type, zio->io_priority, 0,
			    vdev_indirect_read_split_done, ic));
		}
	}
	/* Having all copies in hand, io_done must run reconstruction. */
	iv->iv_reconstruct = B_TRUE;
}

/*
 * I/O start handler for indirect vdevs.  Builds the list of split
 * segments for this zio via vdev_indirect_remap(), then either passes
 * the i/o straight through (common, non-split case) or issues one child
 * i/o per split segment -- reading every copy for scrub/resilver so the
 * block can be reconstructed and verified in vdev_indirect_io_done().
 */
static void
vdev_indirect_io_start(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
	list_create(&iv->iv_splits,
	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));

	zio->io_vsd = iv;
	zio->io_vsd_ops = &vdev_indirect_vsd_ops;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
	if (zio->io_type != ZIO_TYPE_READ) {
		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
		/*
		 * Note: this code can handle other kinds of writes,
		 * but we don't expect them.
		 */
		ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
		    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
	}

	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
	    vdev_indirect_gather_splits, zio);

	indirect_split_t *first = list_head(&iv->iv_splits);
	if (first->is_size == zio->io_size) {
		/*
		 * This is not a split block; we are pointing to the entire
		 * data, which will checksum the same as the original data.
		 * Pass the BP down so that the child i/o can verify the
		 * checksum, and try a different location if available
		 * (e.g. on a mirror).
		 *
		 * While this special case could be handled the same as the
		 * general (split block) case, doing it this way ensures
		 * that the vast majority of blocks on indirect vdevs
		 * (which are not split) are handled identically to blocks
		 * on non-indirect vdevs.  This allows us to be less strict
		 * about performance in the general (but rare) case.
		 */
		ASSERT0(first->is_split_offset);
		ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    first->is_vdev, first->is_target_offset,
		    abd_get_offset(zio->io_abd, 0),
		    zio->io_size, zio->io_type, zio->io_priority, 0,
		    vdev_indirect_child_io_done, zio));
	} else {
		iv->iv_split_block = B_TRUE;
		if (zio->io_type == ZIO_TYPE_READ &&
		    zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
			/*
			 * Read all copies.  Note that for simplicity,
			 * we don't bother consulting the DTL in the
			 * resilver case.
			 */
			vdev_indirect_read_all(zio);
		} else {
			/*
			 * If this is a read zio, we read one copy of each
			 * split segment, from the top-level vdev.  Since
			 * we don't know the checksum of each split
			 * individually, the child zio can't ensure that
			 * we get the right data. E.g. if it's a mirror,
			 * it will just read from a random (healthy) leaf
			 * vdev. We have to verify the checksum in
			 * vdev_indirect_io_done().
			 *
			 * For write zios, the vdev code will ensure we write
			 * to all children.
			 */
			for (indirect_split_t *is = list_head(&iv->iv_splits);
			    is != NULL; is = list_next(&iv->iv_splits, is)) {
				zio_nowait(zio_vdev_child_io(zio, NULL,
				    is->is_vdev, is->is_target_offset,
				    abd_get_offset(zio->io_abd,
				    is->is_split_offset),
				    is->is_size, zio->io_type,
				    zio->io_priority, 0,
				    vdev_indirect_child_io_done, zio));
			}
		}
	}

	zio_execute(zio);
}

/*
 * Report a checksum error for a child.
 */
static void
vdev_indirect_checksum_error(zio_t *zio,
    indirect_split_t *is, indirect_child_t *ic)
{
	vdev_t *vd = ic->ic_vdev;

	/* Speculative (prefetch) i/os don't generate ereports or stats. */
	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
		return;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_checksum_errors++;
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * Borrow linear views of both the bad copy and the known-good
	 * copy so the ereport can include the data contents.
	 */
	zio_bad_cksum_t zbc = { 0 };
	void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
	abd_t *good_abd = is->is_good_child->ic_data;
	void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
	zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio,
	    is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
	abd_return_buf(ic->ic_data, bad_buf, is->is_size);
	abd_return_buf(good_abd, good_buf, is->is_size);
}

/*
 * Issue repair i/os for any incorrect copies.  We do this by comparing
 * each split segment's correct data (is_good_child's ic_data) with each
 * other copy of the data.  If they differ, then we overwrite the bad data
 * with the good copy.  Note that we do this without regard for the DTL's,
 * which simplifies this code and also issues the optimal number of writes
 * (based on which copies actually read bad data, as opposed to which we
 * think might be wrong).  For the same reason, we always use
 * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
14003a4b1be9SMatthew Ahrens */ 14013a4b1be9SMatthew Ahrens static void 14023a4b1be9SMatthew Ahrens vdev_indirect_repair(zio_t *zio) 14033a4b1be9SMatthew Ahrens { 14043a4b1be9SMatthew Ahrens indirect_vsd_t *iv = zio->io_vsd; 14053a4b1be9SMatthew Ahrens 14063a4b1be9SMatthew Ahrens enum zio_flag flags = ZIO_FLAG_IO_REPAIR; 14073a4b1be9SMatthew Ahrens 14083a4b1be9SMatthew Ahrens if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) 14093a4b1be9SMatthew Ahrens flags |= ZIO_FLAG_SELF_HEAL; 14103a4b1be9SMatthew Ahrens 14113a4b1be9SMatthew Ahrens if (!spa_writeable(zio->io_spa)) 14123a4b1be9SMatthew Ahrens return; 14133a4b1be9SMatthew Ahrens 14143a4b1be9SMatthew Ahrens for (indirect_split_t *is = list_head(&iv->iv_splits); 14153a4b1be9SMatthew Ahrens is != NULL; is = list_next(&iv->iv_splits, is)) { 14163a4b1be9SMatthew Ahrens for (int c = 0; c < is->is_children; c++) { 14173a4b1be9SMatthew Ahrens indirect_child_t *ic = &is->is_child[c]; 1418a21fe349SBrian Behlendorf if (ic == is->is_good_child) 14193a4b1be9SMatthew Ahrens continue; 14203a4b1be9SMatthew Ahrens if (ic->ic_data == NULL) 14213a4b1be9SMatthew Ahrens continue; 1422a21fe349SBrian Behlendorf if (ic->ic_duplicate == is->is_good_child) 14233a4b1be9SMatthew Ahrens continue; 14243a4b1be9SMatthew Ahrens 14253a4b1be9SMatthew Ahrens zio_nowait(zio_vdev_child_io(zio, NULL, 14263a4b1be9SMatthew Ahrens ic->ic_vdev, is->is_target_offset, 1427a21fe349SBrian Behlendorf is->is_good_child->ic_data, is->is_size, 14283a4b1be9SMatthew Ahrens ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 14293a4b1be9SMatthew Ahrens ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, 14303a4b1be9SMatthew Ahrens NULL, NULL)); 14313a4b1be9SMatthew Ahrens 14323a4b1be9SMatthew Ahrens vdev_indirect_checksum_error(zio, is, ic); 14333a4b1be9SMatthew Ahrens } 14343a4b1be9SMatthew Ahrens } 14353a4b1be9SMatthew Ahrens } 14363a4b1be9SMatthew Ahrens 14373a4b1be9SMatthew Ahrens /* 14383a4b1be9SMatthew Ahrens * Report checksum errors on all children that we read from. 
14393a4b1be9SMatthew Ahrens */ 14403a4b1be9SMatthew Ahrens static void 14413a4b1be9SMatthew Ahrens vdev_indirect_all_checksum_errors(zio_t *zio) 14423a4b1be9SMatthew Ahrens { 14433a4b1be9SMatthew Ahrens indirect_vsd_t *iv = zio->io_vsd; 14443a4b1be9SMatthew Ahrens 14453a4b1be9SMatthew Ahrens if (zio->io_flags & ZIO_FLAG_SPECULATIVE) 14463a4b1be9SMatthew Ahrens return; 14473a4b1be9SMatthew Ahrens 14483a4b1be9SMatthew Ahrens for (indirect_split_t *is = list_head(&iv->iv_splits); 14493a4b1be9SMatthew Ahrens is != NULL; is = list_next(&iv->iv_splits, is)) { 14503a4b1be9SMatthew Ahrens for (int c = 0; c < is->is_children; c++) { 14513a4b1be9SMatthew Ahrens indirect_child_t *ic = &is->is_child[c]; 14523a4b1be9SMatthew Ahrens 14533a4b1be9SMatthew Ahrens if (ic->ic_data == NULL) 14543a4b1be9SMatthew Ahrens continue; 14553a4b1be9SMatthew Ahrens 14563a4b1be9SMatthew Ahrens vdev_t *vd = ic->ic_vdev; 14573a4b1be9SMatthew Ahrens 14583a4b1be9SMatthew Ahrens mutex_enter(&vd->vdev_stat_lock); 14593a4b1be9SMatthew Ahrens vd->vdev_stat.vs_checksum_errors++; 14603a4b1be9SMatthew Ahrens mutex_exit(&vd->vdev_stat_lock); 14613a4b1be9SMatthew Ahrens 1462eb633035STom Caputi zfs_ereport_post_checksum(zio->io_spa, vd, 1463eb633035STom Caputi &zio->io_bookmark, zio, is->is_target_offset, 1464eb633035STom Caputi is->is_size, NULL, NULL, NULL); 14653a4b1be9SMatthew Ahrens } 14663a4b1be9SMatthew Ahrens } 14673a4b1be9SMatthew Ahrens } 14683a4b1be9SMatthew Ahrens 1469a21fe349SBrian Behlendorf /* 1470a21fe349SBrian Behlendorf * Copy data from all the splits to a main zio then validate the checksum. 1471a21fe349SBrian Behlendorf * If then checksum is successfully validated return success. 
 */
static int
vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
{
	zio_bad_cksum_t zbc;

	/*
	 * Assemble the candidate block from each split's currently
	 * selected copy (is_good_child), then checksum the whole block.
	 */
	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is)) {

		ASSERT3P(is->is_good_child->ic_data, !=, NULL);
		ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);

		abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
		    is->is_split_offset, 0, is->is_size);
	}

	/* Returns 0 iff the assembled block checksums correctly. */
	return (zio_checksum_error(zio, &zbc));
}

/*
 * There are relatively few possible combinations making it feasible to
 * deterministically check them all. We do this by setting the good_child
 * to the next unique split version. If we reach the end of the list then
 * "carry over" to the next unique split version (like counting in base
 * is_unique_children, but each digit can have a different base).
 */
static int
vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
{
	boolean_t more = B_TRUE;

	iv->iv_attempts = 0;

	/* Start every "digit" at its first unique copy. */
	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is))
		is->is_good_child = list_head(&is->is_unique_child);

	while (more == B_TRUE) {
		iv->iv_attempts++;
		more = B_FALSE;

		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
			return (0);

		/*
		 * Advance to the next combination: increment the first
		 * split that has a next unique copy, resetting ("carrying
		 * over") each split that has wrapped around.  When every
		 * split wraps, all combinations have been tried.
		 */
		for (indirect_split_t *is = list_head(&iv->iv_splits);
		    is != NULL; is = list_next(&iv->iv_splits, is)) {
			is->is_good_child = list_next(&is->is_unique_child,
			    is->is_good_child);
			if (is->is_good_child != NULL) {
				more = B_TRUE;
				break;
			}

			is->is_good_child = list_head(&is->is_unique_child);
		}
	}

	ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);

	return (SET_ERROR(ECKSUM));
}

/*
 * There are too many combinations to try all of them in a reasonable amount
 * of time. So try a fixed number of random combinations from the unique
 * split versions, after which we'll consider the block unrecoverable.
 */
static int
vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
{
	iv->iv_attempts = 0;

	while (iv->iv_attempts < iv->iv_attempts_max) {
		iv->iv_attempts++;

		/* Pick a uniformly random unique copy for each split. */
		for (indirect_split_t *is = list_head(&iv->iv_splits);
		    is != NULL; is = list_next(&iv->iv_splits, is)) {
			indirect_child_t *ic = list_head(&is->is_unique_child);
			int children = is->is_unique_children;

			for (int i = spa_get_random(children); i > 0; i--)
				ic = list_next(&is->is_unique_child, ic);

			ASSERT3P(ic, !=, NULL);
			is->is_good_child = ic;
		}

		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
			return (0);
	}

	return (SET_ERROR(ECKSUM));
}

/*
 * This is a validation function for reconstruction.
 * It randomly selects
 * a good combination, if one can be found, and then it intentionally
 * damages all other segment copies by zeroing them. This forces the
 * reconstruction algorithm to locate the one remaining known good copy.
 */
static int
vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
{
	/* Presume all the copies are unique for initial selection. */
	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is)) {
		is->is_unique_children = 0;

		for (int i = 0; i < is->is_children; i++) {
			indirect_child_t *ic = &is->is_child[i];
			/* Only children with data read can be candidates. */
			if (ic->ic_data != NULL) {
				is->is_unique_children++;
				list_insert_tail(&is->is_unique_child, ic);
			}
		}
	}

	/*
	 * Set each is_good_child to a randomly-selected child which
	 * is known to contain validated data.
	 */
	int error = vdev_indirect_splits_enumerate_randomly(iv, zio);
	if (error)
		goto out;

	/*
	 * Damage all but the known good copy by zeroing it. This will
	 * result in two or less unique copies per indirect_child_t.
	 * Both may need to be checked in order to reconstruct the block.
	 * Set iv->iv_attempts_max such that all unique combinations will
	 * be enumerated, but limit the damage to at most 16 indirect splits.
	 */
	iv->iv_attempts_max = 1;

	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is)) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];

			if (ic == is->is_good_child)
				continue;
			if (ic->ic_data == NULL)
				continue;

			abd_zero(ic->ic_data, ic->ic_data->abd_size);
		}

		/* Doubling per split: 2^splits combinations, capped at 2^16. */
		iv->iv_attempts_max *= 2;
		if (iv->iv_attempts_max > (1ULL << 16)) {
			iv->iv_attempts_max = UINT64_MAX;
			break;
		}
	}

out:
	/* Empty the unique children lists so they can be reconstructed. */
	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is)) {
		indirect_child_t *ic;
		while ((ic = list_head(&is->is_unique_child)) != NULL)
			list_remove(&is->is_unique_child, ic);

		is->is_unique_children = 0;
	}

	return (error);
}

/*
 * This function is called when we have read all copies of the data and need
 * to try to find a combination of copies that gives us the right checksum.
 *
 * If we pointed to any mirror vdevs, this effectively does the job of the
 * mirror. The mirror vdev code can't do its own job because we don't know
 * the checksum of each split segment individually.
 *
 * We have to try every unique combination of copies of split segments, until
 * we find one that checksums correctly. Duplicate segment copies are first
 * identified and later skipped during reconstruction. This optimization
 * reduces the search space and ensures that of the remaining combinations
 * at most one is correct.
 *
 * When the total number of combinations is small they can all be checked.
 * For example, if we have 3 segments in the split, and each points to a
 * 2-way mirror with unique copies, we will have the following pieces of data:
 *
 *       |     mirror child
 * split |     [0]        [1]
 * ======|=====================
 *   A   |  data_A_0   data_A_1
 *   B   |  data_B_0   data_B_1
 *   C   |  data_C_0   data_C_1
 *
 * We will try the following (mirror children)^(number of splits) (2^3=8)
 * combinations, which is similar to bitwise-little-endian counting in
 * binary. In general each "digit" corresponds to a split segment, and the
 * base of each digit is is_children, which can be different for each
 * digit.
 *
 *     "low bit"        "high bit"
 *         v                 v
 * data_A_0 data_B_0 data_C_0
 * data_A_1 data_B_0 data_C_0
 * data_A_0 data_B_1 data_C_0
 * data_A_1 data_B_1 data_C_0
 * data_A_0 data_B_0 data_C_1
 * data_A_1 data_B_0 data_C_1
 * data_A_0 data_B_1 data_C_1
 * data_A_1 data_B_1 data_C_1
 *
 * Note that the split segments may be on the same or different top-level
 * vdevs. In either case, we may need to try lots of combinations (see
 * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror
 * has small silent errors on all of its children, we can still reconstruct
 * the correct data, as long as those errors are at sufficiently-separated
 * offsets (specifically, separated by the largest block size - default of
 * 128KB, but up to 16MB).
 */
static void
vdev_indirect_reconstruct_io_done(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;
	boolean_t known_good = B_FALSE;
	int error;

	iv->iv_unique_combinations = 1;
	iv->iv_attempts_max = UINT64_MAX;

	if (zfs_reconstruct_indirect_combinations_max > 0)
		iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;

	/*
	 * If nonzero, every 1/x blocks will be damaged, in order to validate
	 * reconstruction when there are split segments with damaged copies.
	 * known_good will be TRUE when reconstruction is known to be possible.
	 */
	if (zfs_reconstruct_indirect_damage_fraction != 0 &&
	    spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
		known_good = (vdev_indirect_splits_damage(iv, zio) == 0);

	/*
	 * Determine the unique children for a split segment and add them
	 * to the is_unique_child list. By restricting reconstruction
	 * to these children, only unique combinations will be considered.
	 * This can vastly reduce the search space when there are a large
	 * number of indirect splits.
	 */
	for (indirect_split_t *is = list_head(&iv->iv_splits);
	    is != NULL; is = list_next(&iv->iv_splits, is)) {
		is->is_unique_children = 0;

		for (int i = 0; i < is->is_children; i++) {
			indirect_child_t *ic_i = &is->is_child[i];

			/* Skip children with no data or already deduped. */
			if (ic_i->ic_data == NULL ||
			    ic_i->ic_duplicate != NULL)
				continue;

			for (int j = i + 1; j < is->is_children; j++) {
				indirect_child_t *ic_j = &is->is_child[j];

				if (ic_j->ic_data == NULL ||
				    ic_j->ic_duplicate != NULL)
					continue;

				/* Identical data: point at the canonical copy. */
				if (abd_cmp(ic_i->ic_data, ic_j->ic_data,
				    is->is_size) == 0) {
					ic_j->ic_duplicate = ic_i;
				}
			}

			is->is_unique_children++;
			list_insert_tail(&is->is_unique_child, ic_i);
		}

		/* Reconstruction is impossible, no valid children */
		EQUIV(list_is_empty(&is->is_unique_child),
		    is->is_unique_children == 0);
		if (list_is_empty(&is->is_unique_child)) {
			zio->io_error = EIO;
			vdev_indirect_all_checksum_errors(zio);
			zio_checksum_verified(zio);
			return;
		}

		iv->iv_unique_combinations *= is->is_unique_children;
	}

	if (iv->iv_unique_combinations <= iv->iv_attempts_max)
		error = vdev_indirect_splits_enumerate_all(iv, zio);
	else
		error = vdev_indirect_splits_enumerate_randomly(iv, zio);

	if (error != 0) {
		/* All attempted combinations failed. */
		ASSERT3B(known_good, ==, B_FALSE);
		zio->io_error = error;
		vdev_indirect_all_checksum_errors(zio);
	} else {
		/*
		 * The checksum has been successfully validated. Issue
		 * repair I/Os to any copies of splits which don't match
		 * the validated version.
		 */
		ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
		vdev_indirect_repair(zio);
		zio_checksum_verified(zio);
	}
}

static void
vdev_indirect_io_done(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;

	if (iv->iv_reconstruct) {
		/*
		 * We have read all copies of the data (e.g. from mirrors),
		 * either because this was a scrub/resilver, or because the
		 * one-copy read didn't checksum correctly.
		 */
		vdev_indirect_reconstruct_io_done(zio);
		return;
	}

	if (!iv->iv_split_block) {
		/*
		 * This was not a split block, so we passed the BP down,
		 * and the checksum was handled by the (one) child zio.
		 */
		return;
	}

	zio_bad_cksum_t zbc;
	int ret = zio_checksum_error(zio, &zbc);
	if (ret == 0) {
		zio_checksum_verified(zio);
		return;
	}

	/*
	 * The checksum didn't match. Read all copies of all splits, and
	 * then we will try to reconstruct. The next time
	 * vdev_indirect_io_done() is called, iv_reconstruct will be set.
	 */
	vdev_indirect_read_all(zio);

	zio_vdev_io_redone(zio);
}

vdev_ops_t vdev_indirect_ops = {
	.vdev_op_open = vdev_indirect_open,
	.vdev_op_close = vdev_indirect_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_io_start = vdev_indirect_io_start,
	.vdev_op_io_done = vdev_indirect_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = vdev_indirect_remap,
	.vdev_op_xlate = NULL,
	.vdev_op_dumpio = NULL,
	.vdev_op_type = VDEV_TYPE_INDIRECT,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* leaf vdev */
};