1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5ecc2d604Sbonwick * Common Development and Distribution License (the "License"). 6ecc2d604Sbonwick * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22d6e555bdSGeorge Wilson * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 2401f55e48SGeorge Wilson */ 2501f55e48SGeorge Wilson 2601f55e48SGeorge Wilson /* 27814dcd43SSerapheim Dimitropoulos * Copyright (c) 2011, 2019 by Delphix. All rights reserved. 28fa9e4066Sahrens */ 29fa9e4066Sahrens 30fa9e4066Sahrens #ifndef _SYS_METASLAB_IMPL_H 31fa9e4066Sahrens #define _SYS_METASLAB_IMPL_H 32fa9e4066Sahrens 33fa9e4066Sahrens #include <sys/metaslab.h> 34fa9e4066Sahrens #include <sys/space_map.h> 350713e232SGeorge Wilson #include <sys/range_tree.h> 36fa9e4066Sahrens #include <sys/vdev.h> 37fa9e4066Sahrens #include <sys/txg.h> 38fa9e4066Sahrens #include <sys/avl.h> 39af1d63abSPaul Dagnelie #include <sys/multilist.h> 40fa9e4066Sahrens 41fa9e4066Sahrens #ifdef __cplusplus 42fa9e4066Sahrens extern "C" { 43fa9e4066Sahrens #endif 44fa9e4066Sahrens 458363e80aSGeorge Wilson /* 468363e80aSGeorge Wilson * Metaslab allocation tracing record. 478363e80aSGeorge Wilson */ 488363e80aSGeorge Wilson typedef struct metaslab_alloc_trace { 498363e80aSGeorge Wilson list_node_t mat_list_node; 508363e80aSGeorge Wilson metaslab_group_t *mat_mg; 518363e80aSGeorge Wilson metaslab_t *mat_msp; 528363e80aSGeorge Wilson uint64_t mat_size; 538363e80aSGeorge Wilson uint64_t mat_weight; 548363e80aSGeorge Wilson uint32_t mat_dva_id; 558363e80aSGeorge Wilson uint64_t mat_offset; 56f78cdc34SPaul Dagnelie int mat_allocator; 578363e80aSGeorge Wilson } metaslab_alloc_trace_t; 588363e80aSGeorge Wilson 598363e80aSGeorge Wilson /* 608363e80aSGeorge Wilson * Used by the metaslab allocation tracing facility to indicate 618363e80aSGeorge Wilson * error conditions. These errors are stored to the offset member 628363e80aSGeorge Wilson * of the metaslab_alloc_trace_t record and displayed by mdb. 638363e80aSGeorge Wilson */ 648363e80aSGeorge Wilson typedef enum trace_alloc_type { 658363e80aSGeorge Wilson TRACE_ALLOC_FAILURE = -1ULL, 668363e80aSGeorge Wilson TRACE_TOO_SMALL = -2ULL, 678363e80aSGeorge Wilson TRACE_FORCE_GANG = -3ULL, 688363e80aSGeorge Wilson TRACE_NOT_ALLOCATABLE = -4ULL, 698363e80aSGeorge Wilson TRACE_GROUP_FAILURE = -5ULL, 708363e80aSGeorge Wilson TRACE_ENOSPC = -6ULL, 718363e80aSGeorge Wilson TRACE_CONDENSING = -7ULL, 72094e47e9SGeorge Wilson TRACE_VDEV_ERROR = -8ULL, 73084fd14fSBrian Behlendorf TRACE_DISABLED = -9ULL, 748363e80aSGeorge Wilson } trace_alloc_type_t; 758363e80aSGeorge Wilson 768363e80aSGeorge Wilson #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 778363e80aSGeorge Wilson #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 78f78cdc34SPaul Dagnelie #define METASLAB_WEIGHT_CLAIM (1ULL << 61) 79f78cdc34SPaul Dagnelie #define METASLAB_WEIGHT_TYPE (1ULL << 60) 808363e80aSGeorge Wilson #define METASLAB_ACTIVE_MASK \ 81f78cdc34SPaul Dagnelie (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ 82f78cdc34SPaul Dagnelie METASLAB_WEIGHT_CLAIM) 838363e80aSGeorge Wilson 848363e80aSGeorge Wilson /* 858363e80aSGeorge Wilson * The metaslab weight is used to encode the amount of free space in a 868363e80aSGeorge Wilson * metaslab, such that the "best" metaslab appears first when sorting the 878363e80aSGeorge Wilson * metaslabs by weight. The weight (and therefore the "best" metaslab) can 888363e80aSGeorge Wilson * be determined in two different ways: by computing a weighted sum of all 898363e80aSGeorge Wilson * the free space in the metaslab (a space based weight) or by counting only 908363e80aSGeorge Wilson * the free segments of the largest size (a segment based weight). We prefer 918363e80aSGeorge Wilson * the segment based weight because it reflects how the free space is 928363e80aSGeorge Wilson * comprised, but we cannot always use it -- legacy pools do not have the 938363e80aSGeorge Wilson * space map histogram information necessary to determine the largest 948363e80aSGeorge Wilson * contiguous regions. Pools that have the space map histogram determine 958363e80aSGeorge Wilson * the segment weight by looking at each bucket in the histogram and 968363e80aSGeorge Wilson * determining the free space whose size in bytes is in the range: 978363e80aSGeorge Wilson * [2^i, 2^(i+1)) 988363e80aSGeorge Wilson * We then encode the largest index, i, that contains regions into the 998363e80aSGeorge Wilson * segment-weighted value. 1008363e80aSGeorge Wilson * 1018363e80aSGeorge Wilson * Space-based weight: 1028363e80aSGeorge Wilson * 1038363e80aSGeorge Wilson * 64 56 48 40 32 24 16 8 0 1048363e80aSGeorge Wilson * +-------+-------+-------+-------+-------+-------+-------+-------+ 105f78cdc34SPaul Dagnelie * |PSC1| weighted-free space | 1068363e80aSGeorge Wilson * +-------+-------+-------+-------+-------+-------+-------+-------+ 1078363e80aSGeorge Wilson * 1088363e80aSGeorge Wilson * PS - indicates primary and secondary activation 109f78cdc34SPaul Dagnelie * C - indicates activation for claimed block zio 1108363e80aSGeorge Wilson * space - the fragmentation-weighted space 1118363e80aSGeorge Wilson * 1128363e80aSGeorge Wilson * Segment-based weight: 1138363e80aSGeorge Wilson * 1148363e80aSGeorge Wilson * 64 56 48 40 32 24 16 8 0 1158363e80aSGeorge Wilson * +-------+-------+-------+-------+-------+-------+-------+-------+ 116f78cdc34SPaul Dagnelie * |PSC0| idx| count of segments in region | 1178363e80aSGeorge Wilson * +-------+-------+-------+-------+-------+-------+-------+-------+ 1188363e80aSGeorge Wilson * 1198363e80aSGeorge Wilson * PS - indicates primary and secondary activation 120f78cdc34SPaul Dagnelie * C - indicates activation for claimed block zio 1218363e80aSGeorge Wilson * idx - index for the highest bucket in the histogram 1228363e80aSGeorge Wilson * count - number of segments in the specified bucket 1238363e80aSGeorge Wilson */ 124f78cdc34SPaul Dagnelie #define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) 125f78cdc34SPaul Dagnelie #define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) 1268363e80aSGeorge Wilson 1278363e80aSGeorge Wilson #define WEIGHT_IS_SPACEBASED(weight) \ 128f78cdc34SPaul Dagnelie ((weight) == 0 || BF64_GET((weight), 60, 1)) 129f78cdc34SPaul Dagnelie #define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) 1308363e80aSGeorge Wilson 1318363e80aSGeorge Wilson /* 1328363e80aSGeorge Wilson * These macros are only applicable to segment-based weighting. 1338363e80aSGeorge Wilson */ 134f78cdc34SPaul Dagnelie #define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) 135f78cdc34SPaul Dagnelie #define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) 136f78cdc34SPaul Dagnelie #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) 137f78cdc34SPaul Dagnelie #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) 1388363e80aSGeorge Wilson 1392e4c9986SGeorge Wilson /* 1402e4c9986SGeorge Wilson * A metaslab class encompasses a category of allocatable top-level vdevs. 1412e4c9986SGeorge Wilson * Each top-level vdev is associated with a metaslab group which defines 1422e4c9986SGeorge Wilson * the allocatable region for that vdev. Examples of these categories include 1432e4c9986SGeorge Wilson * "normal" for data block allocations (i.e. main pool allocations) or "log" 1442e4c9986SGeorge Wilson * for allocations designated for intent log devices (i.e. slog devices). 1452e4c9986SGeorge Wilson * When a block allocation is requested from the SPA it is associated with a 1462e4c9986SGeorge Wilson * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging 1472e4c9986SGeorge Wilson * to the class can be used to satisfy that request. Allocations are done 1482e4c9986SGeorge Wilson * by traversing the metaslab groups that are linked off of the mc_rotor field. 1492e4c9986SGeorge Wilson * This rotor points to the next metaslab group where allocations will be 1502e4c9986SGeorge Wilson * attempted. Allocating a block is a 3 step process -- select the metaslab 1512e4c9986SGeorge Wilson * group, select the metaslab, and then allocate the block. The metaslab 1522e4c9986SGeorge Wilson * class defines the low-level block allocator that will be used as the 1532e4c9986SGeorge Wilson * final step in allocation. These allocators are pluggable allowing each class 1542e4c9986SGeorge Wilson * to use a block allocator that best suits that class. 1552e4c9986SGeorge Wilson */ 156fa9e4066Sahrens struct metaslab_class { 1570f7643c7SGeorge Wilson kmutex_t mc_lock; 15888ecc943SGeorge Wilson spa_t *mc_spa; 159fa9e4066Sahrens metaslab_group_t *mc_rotor; 1600713e232SGeorge Wilson metaslab_ops_t *mc_ops; 161b24ab676SJeff Bonwick uint64_t mc_aliquot; 1620f7643c7SGeorge Wilson 1630f7643c7SGeorge Wilson /* 1640f7643c7SGeorge Wilson * Track the number of metaslab groups that have been initialized 1650f7643c7SGeorge Wilson * and can accept allocations. An initialized metaslab group is 1660f7643c7SGeorge Wilson * one has been completely added to the config (i.e. we have 1670f7643c7SGeorge Wilson * updated the MOS config and the space has been added to the pool). 1680f7643c7SGeorge Wilson */ 1690f7643c7SGeorge Wilson uint64_t mc_groups; 1700f7643c7SGeorge Wilson 1710f7643c7SGeorge Wilson /* 1720f7643c7SGeorge Wilson * Toggle to enable/disable the allocation throttle. 1730f7643c7SGeorge Wilson */ 1740f7643c7SGeorge Wilson boolean_t mc_alloc_throttle_enabled; 1750f7643c7SGeorge Wilson 1760f7643c7SGeorge Wilson /* 1770f7643c7SGeorge Wilson * The allocation throttle works on a reservation system. Whenever 1780f7643c7SGeorge Wilson * an asynchronous zio wants to perform an allocation it must 1790f7643c7SGeorge Wilson * first reserve the number of blocks that it wants to allocate. 1800f7643c7SGeorge Wilson * If there aren't sufficient slots available for the pending zio 1810f7643c7SGeorge Wilson * then that I/O is throttled until more slots free up. The current 1820f7643c7SGeorge Wilson * number of reserved allocations is maintained by the mc_alloc_slots 1830f7643c7SGeorge Wilson * refcount. The mc_alloc_max_slots value determines the maximum 1840f7643c7SGeorge Wilson * number of allocations that the system allows. Gang blocks are 1850f7643c7SGeorge Wilson * allowed to reserve slots even if we've reached the maximum 1860f7643c7SGeorge Wilson * number of allocations allowed. 1870f7643c7SGeorge Wilson */ 188f78cdc34SPaul Dagnelie uint64_t *mc_alloc_max_slots; 189e914ace2STim Schumacher zfs_refcount_t *mc_alloc_slots; 1900f7643c7SGeorge Wilson 19122e30981SGeorge Wilson uint64_t mc_alloc_groups; /* # of allocatable groups */ 1920f7643c7SGeorge Wilson 193b24ab676SJeff Bonwick uint64_t mc_alloc; /* total allocated space */ 194b24ab676SJeff Bonwick uint64_t mc_deferred; /* total deferred frees */ 195b24ab676SJeff Bonwick uint64_t mc_space; /* total space (alloc + free) */ 196b24ab676SJeff Bonwick uint64_t mc_dspace; /* total deflated space */ 1972e4c9986SGeorge Wilson uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; 198af1d63abSPaul Dagnelie 199af1d63abSPaul Dagnelie /* 200af1d63abSPaul Dagnelie * List of all loaded metaslabs in the class, sorted in order of most 201af1d63abSPaul Dagnelie * recent use. 202af1d63abSPaul Dagnelie */ 203af1d63abSPaul Dagnelie multilist_t *mc_metaslab_txg_list; 204fa9e4066Sahrens }; 205fa9e4066Sahrens 2062e4c9986SGeorge Wilson /* 2072e4c9986SGeorge Wilson * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) 2082e4c9986SGeorge Wilson * of a top-level vdev. They are linked togther to form a circular linked 2092e4c9986SGeorge Wilson * list and can belong to only one metaslab class. Metaslab groups may become 2102e4c9986SGeorge Wilson * ineligible for allocations for a number of reasons such as limited free 2112e4c9986SGeorge Wilson * space, fragmentation, or going offline. When this happens the allocator will 2122e4c9986SGeorge Wilson * simply find the next metaslab group in the linked list and attempt 2132e4c9986SGeorge Wilson * to allocate from that group instead. 2142e4c9986SGeorge Wilson */ 215fa9e4066Sahrens struct metaslab_group { 216fa9e4066Sahrens kmutex_t mg_lock; 217f78cdc34SPaul Dagnelie metaslab_t **mg_primaries; 218f78cdc34SPaul Dagnelie metaslab_t **mg_secondaries; 219fa9e4066Sahrens avl_tree_t mg_metaslab_tree; 220fa9e4066Sahrens uint64_t mg_aliquot; 22122e30981SGeorge Wilson boolean_t mg_allocatable; /* can we allocate? */ 222f78cdc34SPaul Dagnelie uint64_t mg_ms_ready; 2230f7643c7SGeorge Wilson 2240f7643c7SGeorge Wilson /* 2250f7643c7SGeorge Wilson * A metaslab group is considered to be initialized only after 2260f7643c7SGeorge Wilson * we have updated the MOS config and added the space to the pool. 2270f7643c7SGeorge Wilson * We only allow allocation attempts to a metaslab group if it 2280f7643c7SGeorge Wilson * has been initialized. 2290f7643c7SGeorge Wilson */ 2300f7643c7SGeorge Wilson boolean_t mg_initialized; 2310f7643c7SGeorge Wilson 23222e30981SGeorge Wilson uint64_t mg_free_capacity; /* percentage free */ 233fa9e4066Sahrens int64_t mg_bias; 234a1521560SJeff Bonwick int64_t mg_activation_count; 235fa9e4066Sahrens metaslab_class_t *mg_class; 236fa9e4066Sahrens vdev_t *mg_vd; 2370713e232SGeorge Wilson taskq_t *mg_taskq; 238fa9e4066Sahrens metaslab_group_t *mg_prev; 239fa9e4066Sahrens metaslab_group_t *mg_next; 2400f7643c7SGeorge Wilson 2410f7643c7SGeorge Wilson /* 242f78cdc34SPaul Dagnelie * In order for the allocation throttle to function properly, we cannot 243f78cdc34SPaul Dagnelie * have too many IOs going to each disk by default; the throttle 244f78cdc34SPaul Dagnelie * operates by allocating more work to disks that finish quickly, so 245f78cdc34SPaul Dagnelie * allocating larger chunks to each disk reduces its effectiveness. 246f78cdc34SPaul Dagnelie * However, if the number of IOs going to each allocator is too small, 247f78cdc34SPaul Dagnelie * we will not perform proper aggregation at the vdev_queue layer, 248f78cdc34SPaul Dagnelie * also resulting in decreased performance. Therefore, we will use a 249f78cdc34SPaul Dagnelie * ramp-up strategy. 250f78cdc34SPaul Dagnelie * 251f78cdc34SPaul Dagnelie * Each allocator in each metaslab group has a current queue depth 252f78cdc34SPaul Dagnelie * (mg_alloc_queue_depth[allocator]) and a current max queue depth 253f78cdc34SPaul Dagnelie * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group 254f78cdc34SPaul Dagnelie * has an absolute max queue depth (mg_max_alloc_queue_depth). We 255f78cdc34SPaul Dagnelie * add IOs to an allocator until the mg_alloc_queue_depth for that 256f78cdc34SPaul Dagnelie * allocator hits the cur_max. Every time an IO completes for a given 257f78cdc34SPaul Dagnelie * allocator on a given metaslab group, we increment its cur_max until 258f78cdc34SPaul Dagnelie * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to 259f78cdc34SPaul Dagnelie * help protect against disks that decrease in performance over time. 260f78cdc34SPaul Dagnelie * 261f78cdc34SPaul Dagnelie * It's possible for an allocator to handle more allocations than 262f78cdc34SPaul Dagnelie * its max. This can occur when gang blocks are required or when other 263f78cdc34SPaul Dagnelie * groups are unable to handle their share of allocations. 2640f7643c7SGeorge Wilson */ 2650f7643c7SGeorge Wilson uint64_t mg_max_alloc_queue_depth; 266f78cdc34SPaul Dagnelie uint64_t *mg_cur_max_alloc_queue_depth; 267e914ace2STim Schumacher zfs_refcount_t *mg_alloc_queue_depth; 268f78cdc34SPaul Dagnelie int mg_allocators; 2690f7643c7SGeorge Wilson /* 2700f7643c7SGeorge Wilson * A metalab group that can no longer allocate the minimum block 2710f7643c7SGeorge Wilson * size will set mg_no_free_space. Once a metaslab group is out 2720f7643c7SGeorge Wilson * of space then its share of work must be distributed to other 2730f7643c7SGeorge Wilson * groups. 2740f7643c7SGeorge Wilson */ 2750f7643c7SGeorge Wilson boolean_t mg_no_free_space; 2760f7643c7SGeorge Wilson 2770f7643c7SGeorge Wilson uint64_t mg_allocations; 2780f7643c7SGeorge Wilson uint64_t mg_failed_allocations; 2792e4c9986SGeorge Wilson uint64_t mg_fragmentation; 2802e4c9986SGeorge Wilson uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; 281094e47e9SGeorge Wilson 282084fd14fSBrian Behlendorf int mg_ms_disabled; 283084fd14fSBrian Behlendorf boolean_t mg_disabled_updating; 284084fd14fSBrian Behlendorf kmutex_t mg_ms_disabled_lock; 285084fd14fSBrian Behlendorf kcondvar_t mg_ms_disabled_cv; 286fa9e4066Sahrens }; 287fa9e4066Sahrens 288fa9e4066Sahrens /* 2890713e232SGeorge Wilson * This value defines the number of elements in the ms_lbas array. The value 2902e4c9986SGeorge Wilson * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. 2912e4c9986SGeorge Wilson * This is the equivalent of highbit(UINT64_MAX). 2920713e232SGeorge Wilson */ 2930713e232SGeorge Wilson #define MAX_LBAS 64 2940713e232SGeorge Wilson 2950713e232SGeorge Wilson /* 2965f145778SMatthew Ahrens * Each metaslab maintains a set of in-core trees to track metaslab 29786714001SSerapheim Dimitropoulos * operations. The in-core free tree (ms_allocatable) contains the list of 2985f145778SMatthew Ahrens * free segments which are eligible for allocation. As blocks are 29986714001SSerapheim Dimitropoulos * allocated, the allocated segment are removed from the ms_allocatable and 30086714001SSerapheim Dimitropoulos * added to a per txg allocation tree (ms_allocating). As blocks are 30186714001SSerapheim Dimitropoulos * freed, they are added to the free tree (ms_freeing). These trees 3025cabbc6bSPrashanth Sreenivasa * allow us to process all allocations and frees in syncing context 3035cabbc6bSPrashanth Sreenivasa * where it is safe to update the on-disk space maps. An additional set 3045cabbc6bSPrashanth Sreenivasa * of in-core trees is maintained to track deferred frees 30586714001SSerapheim Dimitropoulos * (ms_defer). Once a block is freed it will move from the 30686714001SSerapheim Dimitropoulos * ms_freed to the ms_defer tree. A deferred free means that a block 3075f145778SMatthew Ahrens * has been freed but cannot be used by the pool until TXG_DEFER_SIZE 3085f145778SMatthew Ahrens * transactions groups later. For example, a block that is freed in txg 3095f145778SMatthew Ahrens * 50 will not be available for reallocation until txg 52 (50 + 3105f145778SMatthew Ahrens * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. 3115f145778SMatthew Ahrens * A pool could be safely rolled back TXG_DEFERS_SIZE transactions 3125f145778SMatthew Ahrens * groups and ensure that no block has been reallocated. 3130713e232SGeorge Wilson * 3140713e232SGeorge Wilson * The simplified transition diagram looks like this: 3150713e232SGeorge Wilson * 3160713e232SGeorge Wilson * 3170713e232SGeorge Wilson * ALLOCATE 3180713e232SGeorge Wilson * | 3190713e232SGeorge Wilson * V 32086714001SSerapheim Dimitropoulos * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) 3210713e232SGeorge Wilson * ^ 32286714001SSerapheim Dimitropoulos * | ms_freeing <--- FREE 32386714001SSerapheim Dimitropoulos * | | 32486714001SSerapheim Dimitropoulos * | v 32586714001SSerapheim Dimitropoulos * | ms_freed 32686714001SSerapheim Dimitropoulos * | | 32786714001SSerapheim Dimitropoulos * +-------- ms_defer[2] <-------+-------> (write to space map) 32816a4a807SGeorge Wilson * 3290713e232SGeorge Wilson * 3300713e232SGeorge Wilson * Each metaslab's space is tracked in a single space map in the MOS, 3315f145778SMatthew Ahrens * which is only updated in syncing context. Each time we sync a txg, 3325f145778SMatthew Ahrens * we append the allocs and frees from that txg to the space map. The 3335f145778SMatthew Ahrens * pool space is only updated once all metaslabs have finished syncing. 33416a4a807SGeorge Wilson * 3355f145778SMatthew Ahrens * To load the in-core free tree we read the space map from disk. This 3365f145778SMatthew Ahrens * object contains a series of alloc and free records that are combined 3375f145778SMatthew Ahrens * to make up the list of all free segments in this metaslab. These 33886714001SSerapheim Dimitropoulos * segments are represented in-core by the ms_allocatable and are stored 33986714001SSerapheim Dimitropoulos * in an AVL tree. 34016a4a807SGeorge Wilson * 3410713e232SGeorge Wilson * As the space map grows (as a result of the appends) it will 3425f145778SMatthew Ahrens * eventually become space-inefficient. When the metaslab's in-core 3435f145778SMatthew Ahrens * free tree is zfs_condense_pct/100 times the size of the minimal 3445f145778SMatthew Ahrens * on-disk representation, we rewrite it in its minimized form. If a 3455f145778SMatthew Ahrens * metaslab needs to condense then we must set the ms_condensing flag to 3465f145778SMatthew Ahrens * ensure that allocations are not performed on the metaslab that is 3475f145778SMatthew Ahrens * being written. 348fa9e4066Sahrens */ 349fa9e4066Sahrens struct metaslab { 350555d674dSSerapheim Dimitropoulos /* 351555d674dSSerapheim Dimitropoulos * This is the main lock of the metaslab and its purpose is to 352555d674dSSerapheim Dimitropoulos * coordinate our allocations and frees [e.g metaslab_block_alloc(), 353555d674dSSerapheim Dimitropoulos * metaslab_free_concrete(), ..etc] with our various syncing 354555d674dSSerapheim Dimitropoulos * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. 355555d674dSSerapheim Dimitropoulos * 356555d674dSSerapheim Dimitropoulos * The lock is also used during some miscellaneous operations like 357555d674dSSerapheim Dimitropoulos * using the metaslab's histogram for the metaslab group's histogram 358555d674dSSerapheim Dimitropoulos * aggregation, or marking the metaslab for initialization. 359555d674dSSerapheim Dimitropoulos */ 3600713e232SGeorge Wilson kmutex_t ms_lock; 361555d674dSSerapheim Dimitropoulos 362555d674dSSerapheim Dimitropoulos /* 363555d674dSSerapheim Dimitropoulos * Acquired together with the ms_lock whenever we expect to 364555d674dSSerapheim Dimitropoulos * write to metaslab data on-disk (i.e flushing entries to 365555d674dSSerapheim Dimitropoulos * the metaslab's space map). It helps coordinate readers of 366555d674dSSerapheim Dimitropoulos * the metaslab's space map [see spa_vdev_remove_thread()] 367814dcd43SSerapheim Dimitropoulos * with writers [see metaslab_sync() or metaslab_flush()]. 368555d674dSSerapheim Dimitropoulos * 369555d674dSSerapheim Dimitropoulos * Note that metaslab_load(), even though a reader, uses 370555d674dSSerapheim Dimitropoulos * a completely different mechanism to deal with the reading 371555d674dSSerapheim Dimitropoulos * of the metaslab's space map based on ms_synced_length. That 372555d674dSSerapheim Dimitropoulos * said, the function still uses the ms_sync_lock after it 373555d674dSSerapheim Dimitropoulos * has read the ms_sm [see relevant comment in metaslab_load() 374555d674dSSerapheim Dimitropoulos * as to why]. 375555d674dSSerapheim Dimitropoulos */ 3765cabbc6bSPrashanth Sreenivasa kmutex_t ms_sync_lock; 377555d674dSSerapheim Dimitropoulos 3780713e232SGeorge Wilson kcondvar_t ms_load_cv; 3790713e232SGeorge Wilson space_map_t *ms_sm; 3800713e232SGeorge Wilson uint64_t ms_id; 3810713e232SGeorge Wilson uint64_t ms_start; 3820713e232SGeorge Wilson uint64_t ms_size; 3832e4c9986SGeorge Wilson uint64_t ms_fragmentation; 3840713e232SGeorge Wilson 38586714001SSerapheim Dimitropoulos range_tree_t *ms_allocating[TXG_SIZE]; 38686714001SSerapheim Dimitropoulos range_tree_t *ms_allocatable; 387555d674dSSerapheim Dimitropoulos uint64_t ms_allocated_this_txg; 388af1d63abSPaul Dagnelie uint64_t ms_allocating_total; 3890713e232SGeorge Wilson 3905f145778SMatthew Ahrens /* 3915f145778SMatthew Ahrens * The following range trees are accessed only from syncing context. 3925f145778SMatthew Ahrens * ms_free*tree only have entries while syncing, and are empty 3935f145778SMatthew Ahrens * between syncs. 3945f145778SMatthew Ahrens */ 39586714001SSerapheim Dimitropoulos range_tree_t *ms_freeing; /* to free this syncing txg */ 39686714001SSerapheim Dimitropoulos range_tree_t *ms_freed; /* already freed this syncing txg */ 39786714001SSerapheim Dimitropoulos range_tree_t *ms_defer[TXG_DEFER_SIZE]; 39886714001SSerapheim Dimitropoulos range_tree_t *ms_checkpointing; /* to add to the checkpoint */ 3995f145778SMatthew Ahrens 400084fd14fSBrian Behlendorf /* 401084fd14fSBrian Behlendorf * The ms_trim tree is the set of allocatable segments which are 402084fd14fSBrian Behlendorf * eligible for trimming. (When the metaslab is loaded, it's a 403084fd14fSBrian Behlendorf * subset of ms_allocatable.) It's kept in-core as long as the 404084fd14fSBrian Behlendorf * autotrim property is set and is not vacated when the metaslab 405084fd14fSBrian Behlendorf * is unloaded. Its purpose is to aggregate freed ranges to 406084fd14fSBrian Behlendorf * facilitate efficient trimming. 407084fd14fSBrian Behlendorf */ 408084fd14fSBrian Behlendorf range_tree_t *ms_trim; 409084fd14fSBrian Behlendorf 4100713e232SGeorge Wilson boolean_t ms_condensing; /* condensing? */ 4112e4c9986SGeorge Wilson boolean_t ms_condense_wanted; 4128363e80aSGeorge Wilson 413084fd14fSBrian Behlendorf /* 414084fd14fSBrian Behlendorf * The number of consumers which have disabled the metaslab. 415084fd14fSBrian Behlendorf */ 416084fd14fSBrian Behlendorf uint64_t ms_disabled; 417094e47e9SGeorge Wilson 4188363e80aSGeorge Wilson /* 419a0b03b16SSerapheim Dimitropoulos * We must always hold the ms_lock when modifying ms_loaded 420a0b03b16SSerapheim Dimitropoulos * and ms_loading. 4218363e80aSGeorge Wilson */ 4220713e232SGeorge Wilson boolean_t ms_loaded; 4230713e232SGeorge Wilson boolean_t ms_loading; 424814dcd43SSerapheim Dimitropoulos kcondvar_t ms_flush_cv; 425814dcd43SSerapheim Dimitropoulos boolean_t ms_flushing; 4260713e232SGeorge Wilson 427555d674dSSerapheim Dimitropoulos /* 428555d674dSSerapheim Dimitropoulos * The following histograms count entries that are in the 429555d674dSSerapheim Dimitropoulos * metaslab's space map (and its histogram) but are not in 430555d674dSSerapheim Dimitropoulos * ms_allocatable yet, because they are in ms_freed, ms_freeing, 431555d674dSSerapheim Dimitropoulos * or ms_defer[]. 432555d674dSSerapheim Dimitropoulos * 433555d674dSSerapheim Dimitropoulos * When the metaslab is not loaded, its ms_weight needs to 434555d674dSSerapheim Dimitropoulos * reflect what is allocatable (i.e. what will be part of 435555d674dSSerapheim Dimitropoulos * ms_allocatable if it is loaded). The weight is computed from 436555d674dSSerapheim Dimitropoulos * the spacemap histogram, but that includes ranges that are 437555d674dSSerapheim Dimitropoulos * not yet allocatable (because they are in ms_freed, 438555d674dSSerapheim Dimitropoulos * ms_freeing, or ms_defer[]). Therefore, when calculating the 439555d674dSSerapheim Dimitropoulos * weight, we need to remove those ranges. 440555d674dSSerapheim Dimitropoulos * 441555d674dSSerapheim Dimitropoulos * The ranges in the ms_freed and ms_defer[] range trees are all 442555d674dSSerapheim Dimitropoulos * present in the spacemap. However, the spacemap may have 443555d674dSSerapheim Dimitropoulos * multiple entries to represent a contiguous range, because it 444555d674dSSerapheim Dimitropoulos * is written across multiple sync passes, but the changes of 445555d674dSSerapheim Dimitropoulos * all sync passes are consolidated into the range trees. 446555d674dSSerapheim Dimitropoulos * Adjacent ranges that are freed in different sync passes of 447555d674dSSerapheim Dimitropoulos * one txg will be represented separately (as 2 or more entries) 448555d674dSSerapheim Dimitropoulos * in the space map (and its histogram), but these adjacent 449555d674dSSerapheim Dimitropoulos * ranges will be consolidated (represented as one entry) in the 450555d674dSSerapheim Dimitropoulos * ms_freed/ms_defer[] range trees (and their histograms). 451555d674dSSerapheim Dimitropoulos * 452555d674dSSerapheim Dimitropoulos * When calculating the weight, we can not simply subtract the 453555d674dSSerapheim Dimitropoulos * range trees' histograms from the spacemap's histogram, 454555d674dSSerapheim Dimitropoulos * because the range trees' histograms may have entries in 455555d674dSSerapheim Dimitropoulos * higher buckets than the spacemap, due to consolidation. 456555d674dSSerapheim Dimitropoulos * Instead we must subtract the exact entries that were added to 457555d674dSSerapheim Dimitropoulos * the spacemap's histogram. ms_synchist and ms_deferhist[] 458555d674dSSerapheim Dimitropoulos * represent these exact entries, so we can subtract them from 459555d674dSSerapheim Dimitropoulos * the spacemap's histogram when calculating ms_weight. 460555d674dSSerapheim Dimitropoulos * 461555d674dSSerapheim Dimitropoulos * ms_synchist represents the same ranges as ms_freeing + 462555d674dSSerapheim Dimitropoulos * ms_freed, but without consolidation across sync passes. 463555d674dSSerapheim Dimitropoulos * 464555d674dSSerapheim Dimitropoulos * ms_deferhist[i] represents the same ranges as ms_defer[i], 465555d674dSSerapheim Dimitropoulos * but without consolidation across sync passes. 466555d674dSSerapheim Dimitropoulos */ 467555d674dSSerapheim Dimitropoulos uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; 468555d674dSSerapheim Dimitropoulos uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; 469555d674dSSerapheim Dimitropoulos 470555d674dSSerapheim Dimitropoulos /* 471555d674dSSerapheim Dimitropoulos * Tracks the exact amount of allocated space of this metaslab 472555d674dSSerapheim Dimitropoulos * (and specifically the metaslab's space map) up to the most 473555d674dSSerapheim Dimitropoulos * recently completed sync pass [see usage in metaslab_sync()]. 474555d674dSSerapheim Dimitropoulos */ 475555d674dSSerapheim Dimitropoulos uint64_t ms_allocated_space; 476468c413aSTim Haley int64_t ms_deferspace; /* sum of ms_defermap[] space */ 477ecc2d604Sbonwick uint64_t ms_weight; /* weight vs. others in group */ 4788363e80aSGeorge Wilson uint64_t ms_activation_weight; /* activation weight */ 4798363e80aSGeorge Wilson 4808363e80aSGeorge Wilson /* 4818363e80aSGeorge Wilson * Track of whenever a metaslab is selected for loading or allocation. 4828363e80aSGeorge Wilson * We use this value to determine how long the metaslab should 4838363e80aSGeorge Wilson * stay cached. 4848363e80aSGeorge Wilson */ 4858363e80aSGeorge Wilson uint64_t ms_selected_txg; 486af1d63abSPaul Dagnelie /* 487af1d63abSPaul Dagnelie * ms_load/unload_time can be used for performance monitoring 488af1d63abSPaul Dagnelie * (e.g. by dtrace or mdb). 489af1d63abSPaul Dagnelie */ 490af1d63abSPaul Dagnelie hrtime_t ms_load_time; /* time last loaded */ 491af1d63abSPaul Dagnelie hrtime_t ms_unload_time; /* time last unloaded */ 492af1d63abSPaul Dagnelie hrtime_t ms_selected_time; /* time last allocated from */ 4938363e80aSGeorge Wilson 4948363e80aSGeorge Wilson uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ 4958363e80aSGeorge Wilson uint64_t ms_max_size; /* maximum allocatable size */ 4960713e232SGeorge Wilson 497f78cdc34SPaul Dagnelie /* 498f78cdc34SPaul Dagnelie * -1 if it's not active in an allocator, otherwise set to the allocator 499f78cdc34SPaul Dagnelie * this metaslab is active for. 500f78cdc34SPaul Dagnelie */ 501f78cdc34SPaul Dagnelie int ms_allocator; 502f78cdc34SPaul Dagnelie boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ 503f78cdc34SPaul Dagnelie 5040713e232SGeorge Wilson /* 5050713e232SGeorge Wilson * The metaslab block allocators can optionally use a size-ordered 5060713e232SGeorge Wilson * range tree and/or an array of LBAs. Not all allocators use 50786714001SSerapheim Dimitropoulos * this functionality. The ms_allocatable_by_size should always 50886714001SSerapheim Dimitropoulos * contain the same number of segments as the ms_allocatable. The 50986714001SSerapheim Dimitropoulos * only difference is that the ms_allocatable_by_size is ordered by 51086714001SSerapheim Dimitropoulos * segment sizes. 5110713e232SGeorge Wilson */ 512*4d7988d6SPaul Dagnelie zfs_btree_t ms_allocatable_by_size; 513*4d7988d6SPaul Dagnelie zfs_btree_t ms_unflushed_frees_by_size; 5140713e232SGeorge Wilson uint64_t ms_lbas[MAX_LBAS]; 5150713e232SGeorge Wilson 516ecc2d604Sbonwick metaslab_group_t *ms_group; /* metaslab group */ 517ecc2d604Sbonwick avl_node_t ms_group_node; /* node in metaslab group tree */ 518ecc2d604Sbonwick txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ 519814dcd43SSerapheim Dimitropoulos avl_node_t ms_spa_txg_node; /* node in spa_metaslabs_by_txg */ 520af1d63abSPaul Dagnelie /* 521af1d63abSPaul Dagnelie * Node in metaslab class's selected txg list 522af1d63abSPaul Dagnelie */ 523af1d63abSPaul Dagnelie multilist_node_t ms_class_txg_node; 524814dcd43SSerapheim Dimitropoulos 525814dcd43SSerapheim Dimitropoulos /* 526814dcd43SSerapheim Dimitropoulos * Allocs and frees that are committed to the vdev log spacemap but 527814dcd43SSerapheim Dimitropoulos * not yet to this metaslab's spacemap. 528814dcd43SSerapheim Dimitropoulos */ 529814dcd43SSerapheim Dimitropoulos range_tree_t *ms_unflushed_allocs; 530814dcd43SSerapheim Dimitropoulos range_tree_t *ms_unflushed_frees; 531814dcd43SSerapheim Dimitropoulos 532814dcd43SSerapheim Dimitropoulos /* 533814dcd43SSerapheim Dimitropoulos * We have flushed entries up to but not including this TXG. In 534814dcd43SSerapheim Dimitropoulos * other words, all changes from this TXG and onward should not 535814dcd43SSerapheim Dimitropoulos * be in this metaslab's space map and must be read from the 536814dcd43SSerapheim Dimitropoulos * log space maps. 537814dcd43SSerapheim Dimitropoulos */ 538814dcd43SSerapheim Dimitropoulos uint64_t ms_unflushed_txg; 539f78cdc34SPaul Dagnelie 540555d674dSSerapheim Dimitropoulos /* updated every time we are done syncing the metaslab's space map */ 541555d674dSSerapheim Dimitropoulos uint64_t ms_synced_length; 542555d674dSSerapheim Dimitropoulos 543f78cdc34SPaul Dagnelie boolean_t ms_new; 544fa9e4066Sahrens }; 545fa9e4066Sahrens 546814dcd43SSerapheim Dimitropoulos typedef struct metaslab_unflushed_phys { 547814dcd43SSerapheim Dimitropoulos /* on-disk counterpart of ms_unflushed_txg */ 548814dcd43SSerapheim Dimitropoulos uint64_t msp_unflushed_txg; 549814dcd43SSerapheim Dimitropoulos } metaslab_unflushed_phys_t; 550814dcd43SSerapheim Dimitropoulos 551fa9e4066Sahrens #ifdef __cplusplus 552fa9e4066Sahrens } 553fa9e4066Sahrens #endif 554fa9e4066Sahrens 555fa9e4066Sahrens #endif /* _SYS_METASLAB_IMPL_H */ 556