/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2019 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed.  Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA.  Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location.  As a consequence, not all DVAs can be
 * translated to an equivalent new DVA.  Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location.  This function is used in multiple ways:
 *
 *  - i/os to this vdev use the callback to determine where the
 *    data is now located, and issue child i/os for each segment's new
 *    location.
 *
 *  - frees and claims to this vdev use the callback to free or claim
 *    each mapped segment.  (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
 *    in the pool that need to be marked obsolete.  When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *   references; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */
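/*
 * Illustrative walk-through of the above: suppose a block on indirect
 * vdev V is remapped while a snapshot still references it.  The block
 * goes on its dataset's ds_remap_deadlist.  When that snapshot is
 * destroyed, the block moves to dp_obsolete_bpobj.  Syncing context
 * later processes the bpobj and appends the block's range to V's
 * vic_obsolete_sm_object.  The next condense of V folds that space map
 * entry into the corresponding vimp_counts_object entry; once an
 * entry's obsolete count equals its mapped length, the entry is
 * omitted from the new mapping.
 */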
/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts.  Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */

boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete.  With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses.  Higher values will condense less often (causing less
 * i/o); lower values will reduce the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;
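/*
 * Worked arithmetic for the claim above: a condense triggered at the
 * 25% threshold removes at least a quarter of the mapped bytes, so
 * each condense leaves at most 75% of the mapping, and
 * 0.75^16 ~= 0.01, i.e. about 1% of the original size after at most
 * 16 condenses.
 */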
/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically).  This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory.  The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly).  If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;

/*
 * If an indirect split block contains more than this many possible unique
 * combinations when being reconstructed, consider it too computationally
 * expensive to check them all.  Instead, try at most this many
 * randomly-selected combinations each time the block is accessed.  This
 * allows all segment copies to participate fairly in the reconstruction
 * when all combinations cannot be checked and prevents repeated use of
 * one bad copy.
 */
int zfs_reconstruct_indirect_combinations_max = 256;
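/*
 * For example (illustrative numbers): a block split into three
 * segments, each with two copies (e.g. on a two-way mirror), has
 * 2^3 = 8 unique combinations, all of which can be checked.  A block
 * split into ten segments with three copies each has 3^10 = 59049
 * combinations, which exceeds the default cap of 256, so at most 256
 * randomly-selected combinations are attempted per access.
 */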
/*
 * Enable to simulate damaged segments and validate reconstruction.
 * Used by ztest.
 */
unsigned long zfs_reconstruct_indirect_damage_fraction = 0;

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	abd_t *ic_data;
	vdev_t *ic_vdev;

	/*
	 * ic_duplicate is NULL when the ic_data contents are unique; when
	 * the contents are determined to be a duplicate, it references the
	 * primary child.
	 */
	struct indirect_child *ic_duplicate;
	list_node_t ic_node; /* node on is_unique_child */
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev.  For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size;
	int is_children; /* number of entries in is_child[] */
	int is_unique_children; /* number of entries in is_unique_child */
	list_t is_unique_child;

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	indirect_child_t *is_good_child;

	indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;
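/*
 * For example (illustrative sizes): a 32KB read whose DVA was split
 * across two mapping segments is represented by two indirect_split_t's
 * on iv_splits: the first with is_split_offset == 0 and
 * is_size == 20KB, the second with is_split_offset == 20KB (the sum of
 * the previous splits' is_size's) and is_size == 12KB.
 */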
/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;
	uint64_t iv_unique_combinations;
	uint64_t iv_attempts;
	uint64_t iv_attempts_max;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;

static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;

	indirect_split_t *is;
	while ((is = list_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			if (ic->ic_data != NULL)
				abd_free(ic->ic_data);
		}
		list_remove(&iv->iv_splits, is);

		indirect_child_t *ic;
		while ((ic = list_head(&is->is_unique_child)) != NULL)
			list_remove(&is->is_unique_child, ic);

		list_destroy(&is->is_unique_child);

		kmem_free(is,
		    offsetof(indirect_split_t, is_child[is->is_children]));
	}
	kmem_free(iv, sizeof (*iv));
}

static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
	vdev_indirect_map_free,
	zio_vsd_default_cksum_report
};

/*
 * Mark the given offset and size as being obsolete.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
	}
}
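/*
 * For example (illustrative offsets): freeing an 8KB block at offset
 * 0x1000 on an indirect vdev adds the range [0x1000, 0x3000) to
 * vdev_obsolete_segments and dirties the vdev; the accumulated ranges
 * are then appended to the vdev's obsolete space map in syncing
 * context (when SPA_FEATURE_OBSOLETE_COUNTS is enabled).
 */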
/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx.  This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size);
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}
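/*
 * For example (illustrative numbers): with 4GB mapped and 1.2GB of it
 * obsolete, 30% >= zfs_indirect_condense_obsolete_pct (25%), so the
 * vdev is condensed as long as the mapping itself exceeds
 * zfs_condense_min_mapping_bytes.  Independently, an obsolete space
 * map that has grown to zfs_condense_max_obsolete_bytes (1GB by
 * default) triggers a condense on its own.
 */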
/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping.  The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
}

/* ARGSUSED */
static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;

	return (spa->spa_condensing_indirect != NULL);
}
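/*
 * The condense thread is a zthr: the _check function above tells the
 * zthr framework whether there is work to do (a condense is in
 * progress), and the thread body below does the work incrementally,
 * checking zthr_iscancelled() so that it can be paused and later
 * resumed from where it left off (see "Restarting from failure" in the
 * big theory statement above).
 */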
/* ARGSUSED */
static void
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);