xref: /illumos-gate/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h (revision 2e4c998613148111f2fc5371085331ffb39122ff)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ecc2d604Sbonwick  * Common Development and Distribution License (the "License").
6ecc2d604Sbonwick  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22d6e555bdSGeorge Wilson  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
2401f55e48SGeorge Wilson  */
2501f55e48SGeorge Wilson 
2601f55e48SGeorge Wilson /*
27b6240e83SGeorge Wilson  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
28fa9e4066Sahrens  */
29fa9e4066Sahrens 
30fa9e4066Sahrens #ifndef _SYS_METASLAB_IMPL_H
31fa9e4066Sahrens #define	_SYS_METASLAB_IMPL_H
32fa9e4066Sahrens 
33fa9e4066Sahrens #include <sys/metaslab.h>
34fa9e4066Sahrens #include <sys/space_map.h>
350713e232SGeorge Wilson #include <sys/range_tree.h>
36fa9e4066Sahrens #include <sys/vdev.h>
37fa9e4066Sahrens #include <sys/txg.h>
38fa9e4066Sahrens #include <sys/avl.h>
39fa9e4066Sahrens 
40fa9e4066Sahrens #ifdef	__cplusplus
41fa9e4066Sahrens extern "C" {
42fa9e4066Sahrens #endif
43fa9e4066Sahrens 
44*2e4c9986SGeorge Wilson /*
45*2e4c9986SGeorge Wilson  * A metaslab class encompasses a category of allocatable top-level vdevs.
46*2e4c9986SGeorge Wilson  * Each top-level vdev is associated with a metaslab group which defines
47*2e4c9986SGeorge Wilson  * the allocatable region for that vdev. Examples of these categories include
48*2e4c9986SGeorge Wilson  * "normal" for data block allocations (i.e. main pool allocations) or "log"
49*2e4c9986SGeorge Wilson  * for allocations designated for intent log devices (i.e. slog devices).
50*2e4c9986SGeorge Wilson  * When a block allocation is requested from the SPA it is associated with a
51*2e4c9986SGeorge Wilson  * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
52*2e4c9986SGeorge Wilson  * to the class can be used to satisfy that request. Allocations are done
53*2e4c9986SGeorge Wilson  * by traversing the metaslab groups that are linked off of the mc_rotor field.
54*2e4c9986SGeorge Wilson  * This rotor points to the next metaslab group where allocations will be
55*2e4c9986SGeorge Wilson  * attempted. Allocating a block is a 3 step process -- select the metaslab
56*2e4c9986SGeorge Wilson  * group, select the metaslab, and then allocate the block. The metaslab
57*2e4c9986SGeorge Wilson  * class defines the low-level block allocator that will be used as the
58*2e4c9986SGeorge Wilson  * final step in allocation. These allocators are pluggable allowing each class
59*2e4c9986SGeorge Wilson  * to use a block allocator that best suits that class.
60*2e4c9986SGeorge Wilson  */
61fa9e4066Sahrens struct metaslab_class {
6288ecc943SGeorge Wilson 	spa_t			*mc_spa;	/* pool this class belongs to (TODO confirm) */
63fa9e4066Sahrens 	metaslab_group_t	*mc_rotor;	/* next group to attempt allocation from */
640713e232SGeorge Wilson 	metaslab_ops_t		*mc_ops;	/* low-level block allocator for this class */
65b24ab676SJeff Bonwick 	uint64_t		mc_aliquot;
6622e30981SGeorge Wilson 	uint64_t		mc_alloc_groups; /* # of allocatable groups */
67b24ab676SJeff Bonwick 	uint64_t		mc_alloc;	/* total allocated space */
68b24ab676SJeff Bonwick 	uint64_t		mc_deferred;	/* total deferred frees */
69b24ab676SJeff Bonwick 	uint64_t		mc_space;	/* total space (alloc + free) */
70b24ab676SJeff Bonwick 	uint64_t		mc_dspace;	/* total deflated space */
71*2e4c9986SGeorge Wilson 	uint64_t		mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; /* presumably class-wide sum of mg_histogram -- confirm */
72fa9e4066Sahrens };
73fa9e4066Sahrens 
74*2e4c9986SGeorge Wilson /*
75*2e4c9986SGeorge Wilson  * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
76*2e4c9986SGeorge Wilson  * of a top-level vdev. They are linked together to form a circular linked
77*2e4c9986SGeorge Wilson  * list and can belong to only one metaslab class. Metaslab groups may become
78*2e4c9986SGeorge Wilson  * ineligible for allocations for a number of reasons such as limited free
79*2e4c9986SGeorge Wilson  * space, fragmentation, or going offline. When this happens the allocator will
80*2e4c9986SGeorge Wilson  * simply find the next metaslab group in the linked list and attempt
81*2e4c9986SGeorge Wilson  * to allocate from that group instead.
82*2e4c9986SGeorge Wilson  */
83fa9e4066Sahrens struct metaslab_group {
84fa9e4066Sahrens 	kmutex_t		mg_lock;
85fa9e4066Sahrens 	avl_tree_t		mg_metaslab_tree;	/* metaslabs, linked via ms_group_node */
86fa9e4066Sahrens 	uint64_t		mg_aliquot;
8722e30981SGeorge Wilson 	boolean_t		mg_allocatable;		/* can we allocate? */
8822e30981SGeorge Wilson 	uint64_t		mg_free_capacity;	/* percentage free */
89fa9e4066Sahrens 	int64_t			mg_bias;
90a1521560SJeff Bonwick 	int64_t			mg_activation_count;
91fa9e4066Sahrens 	metaslab_class_t	*mg_class;	/* the one class this group belongs to */
92fa9e4066Sahrens 	vdev_t			*mg_vd;		/* top-level vdev this group covers */
930713e232SGeorge Wilson 	taskq_t			*mg_taskq;
94fa9e4066Sahrens 	metaslab_group_t	*mg_prev;	/* previous group in circular list */
95fa9e4066Sahrens 	metaslab_group_t	*mg_next;	/* next group in circular list */
96*2e4c9986SGeorge Wilson 	uint64_t		mg_fragmentation;	/* presumably group-wide fragmentation metric -- confirm */
97*2e4c9986SGeorge Wilson 	uint64_t		mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; /* presumably sum of member metaslab histograms -- confirm */
98fa9e4066Sahrens };
99fa9e4066Sahrens 
100fa9e4066Sahrens /*
1010713e232SGeorge Wilson  * This value defines the number of elements in the ms_lbas array of
102*2e4c9986SGeorge Wilson  * struct metaslab. The value of 64 was chosen as it covers all power-of-2
103*2e4c9986SGeorge Wilson  * buckets up to UINT64_MAX. This is the equivalent of highbit(UINT64_MAX).
1040713e232SGeorge Wilson  */
1050713e232SGeorge Wilson #define	MAX_LBAS	64
1060713e232SGeorge Wilson 
1070713e232SGeorge Wilson /*
1080713e232SGeorge Wilson  * Each metaslab maintains a set of in-core trees to track metaslab operations.
1090713e232SGeorge Wilson  * The in-core free tree (ms_tree) contains the current list of free segments.
1100713e232SGeorge Wilson  * As blocks are allocated, the allocated segments are removed from the ms_tree
1110713e232SGeorge Wilson  * and added to a per txg allocation tree (ms_alloctree). As blocks are freed,
1120713e232SGeorge Wilson  * they are added to the per txg free tree (ms_freetree). These per txg
1130713e232SGeorge Wilson  * trees allow us to process all allocations and frees in syncing context
1140713e232SGeorge Wilson  * where it is safe to update the on-disk space maps. One additional in-core
1150713e232SGeorge Wilson  * tree is maintained to track deferred frees (ms_defertree). Once a block
1160713e232SGeorge Wilson  * is freed it will move from the ms_freetree to the ms_defertree. A deferred
1170713e232SGeorge Wilson  * free means that a block has been freed but cannot be used by the pool
1180713e232SGeorge Wilson  * until TXG_DEFER_SIZE transaction groups later. For example, a block
1190713e232SGeorge Wilson  * that is freed in txg 50 will not be available for reallocation until
1200713e232SGeorge Wilson  * txg 52 (50 + TXG_DEFER_SIZE).  This provides a safety net for uberblock
1210713e232SGeorge Wilson  * rollback. A pool could be safely rolled back TXG_DEFER_SIZE
1220713e232SGeorge Wilson  * transaction groups and ensure that no block has been reallocated.
1230713e232SGeorge Wilson  *
1240713e232SGeorge Wilson  * The simplified transition diagram looks like this:
1250713e232SGeorge Wilson  *
1260713e232SGeorge Wilson  *
1270713e232SGeorge Wilson  *      ALLOCATE
1280713e232SGeorge Wilson  *         |
1290713e232SGeorge Wilson  *         V
1300713e232SGeorge Wilson  *    free segment (ms_tree) --------> ms_alloctree ----> (write to space map)
1310713e232SGeorge Wilson  *         ^
1320713e232SGeorge Wilson  *         |
1330713e232SGeorge Wilson  *         |                           ms_freetree <--- FREE
1340713e232SGeorge Wilson  *         |                                 |
1350713e232SGeorge Wilson  *         |                                 |
1360713e232SGeorge Wilson  *         |                                 |
1370713e232SGeorge Wilson  *         +----------- ms_defertree <-------+---------> (write to space map)
13816a4a807SGeorge Wilson  *
1390713e232SGeorge Wilson  *
1400713e232SGeorge Wilson  * Each metaslab's space is tracked in a single space map in the MOS,
14116a4a807SGeorge Wilson  * which is only updated in syncing context. Each time we sync a txg,
1420713e232SGeorge Wilson  * we append the allocs and frees from that txg to the space map.
1430713e232SGeorge Wilson  * The pool space is only updated once all metaslabs have finished syncing.
14416a4a807SGeorge Wilson  *
1450713e232SGeorge Wilson  * To load the in-core free tree we read the space map from disk.
14616a4a807SGeorge Wilson  * This object contains a series of alloc and free records that are
14716a4a807SGeorge Wilson  * combined to make up the list of all free segments in this metaslab. These
1480713e232SGeorge Wilson  * segments are represented in-core by the ms_tree and are stored in an
14916a4a807SGeorge Wilson  * AVL tree.
15016a4a807SGeorge Wilson  *
1510713e232SGeorge Wilson  * As the space map grows (as a result of the appends) it will
1520713e232SGeorge Wilson  * eventually become space-inefficient. When the metaslab's in-core free tree
1530713e232SGeorge Wilson  * is zfs_condense_pct/100 times the size of the minimal on-disk
1540713e232SGeorge Wilson  * representation, we rewrite it in its minimized form. If a metaslab
1550713e232SGeorge Wilson  * needs to condense then we must set the ms_condensing flag to ensure
1560713e232SGeorge Wilson  * that allocations are not performed on the metaslab that is being written.
157fa9e4066Sahrens  */
158fa9e4066Sahrens struct metaslab {
1590713e232SGeorge Wilson 	kmutex_t	ms_lock;
1600713e232SGeorge Wilson 	kcondvar_t	ms_load_cv;	/* presumably signalled when loading completes -- confirm */
1610713e232SGeorge Wilson 	space_map_t	*ms_sm;		/* this metaslab's space map in the MOS */
1620713e232SGeorge Wilson 	metaslab_ops_t	*ms_ops;
1630713e232SGeorge Wilson 	uint64_t	ms_id;
1640713e232SGeorge Wilson 	uint64_t	ms_start;
1650713e232SGeorge Wilson 	uint64_t	ms_size;
166*2e4c9986SGeorge Wilson 	uint64_t	ms_fragmentation;
1670713e232SGeorge Wilson 
1680713e232SGeorge Wilson 	range_tree_t	*ms_alloctree[TXG_SIZE];	/* per-txg allocations */
1690713e232SGeorge Wilson 	range_tree_t	*ms_freetree[TXG_SIZE];	/* per-txg frees */
1700713e232SGeorge Wilson 	range_tree_t	*ms_defertree[TXG_DEFER_SIZE];	/* deferred frees */
1710713e232SGeorge Wilson 	range_tree_t	*ms_tree;	/* in-core free segments */
1720713e232SGeorge Wilson 
1730713e232SGeorge Wilson 	boolean_t	ms_condensing;	/* set while condensing; no allocations */
174*2e4c9986SGeorge Wilson 	boolean_t	ms_condense_wanted;
1750713e232SGeorge Wilson 	boolean_t	ms_loaded;
1760713e232SGeorge Wilson 	boolean_t	ms_loading;
1770713e232SGeorge Wilson 
178468c413aSTim Haley 	int64_t		ms_deferspace;	/* sum of ms_defertree[] space	*/
179ecc2d604Sbonwick 	uint64_t	ms_weight;	/* weight vs. others in group	*/
1800713e232SGeorge Wilson 	uint64_t	ms_access_txg;
1810713e232SGeorge Wilson 
1820713e232SGeorge Wilson 	/*
1830713e232SGeorge Wilson 	 * The metaslab block allocators can optionally use a size-ordered
1840713e232SGeorge Wilson 	 * range tree and/or an array of LBAs. Not all allocators use
1850713e232SGeorge Wilson 	 * this functionality. The ms_size_tree should always contain the
1860713e232SGeorge Wilson 	 * same number of segments as the ms_tree. The only difference
1870713e232SGeorge Wilson 	 * is that the ms_size_tree is ordered by segment sizes.
1880713e232SGeorge Wilson 	 */
1890713e232SGeorge Wilson 	avl_tree_t	ms_size_tree;	/* ms_tree segments ordered by size */
1900713e232SGeorge Wilson 	uint64_t	ms_lbas[MAX_LBAS];	/* optional per-allocator LBA array */
1910713e232SGeorge Wilson 
192ecc2d604Sbonwick 	metaslab_group_t *ms_group;	/* metaslab group		*/
193ecc2d604Sbonwick 	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
194ecc2d604Sbonwick 	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/
195fa9e4066Sahrens };
196fa9e4066Sahrens 
197fa9e4066Sahrens #ifdef	__cplusplus
198fa9e4066Sahrens }
199fa9e4066Sahrens #endif
200fa9e4066Sahrens 
201fa9e4066Sahrens #endif	/* _SYS_METASLAB_IMPL_H */
202