xref: /illumos-gate/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h (revision fa9e4066f08beec538e775443c5be79dd423fcab)
1*fa9e4066Sahrens /*
2*fa9e4066Sahrens  * CDDL HEADER START
3*fa9e4066Sahrens  *
4*fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5*fa9e4066Sahrens  * Common Development and Distribution License, Version 1.0 only
6*fa9e4066Sahrens  * (the "License").  You may not use this file except in compliance
7*fa9e4066Sahrens  * with the License.
8*fa9e4066Sahrens  *
9*fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
11*fa9e4066Sahrens  * See the License for the specific language governing permissions
12*fa9e4066Sahrens  * and limitations under the License.
13*fa9e4066Sahrens  *
14*fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15*fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17*fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18*fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19*fa9e4066Sahrens  *
20*fa9e4066Sahrens  * CDDL HEADER END
21*fa9e4066Sahrens  */
22*fa9e4066Sahrens /*
23*fa9e4066Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*fa9e4066Sahrens  * Use is subject to license terms.
25*fa9e4066Sahrens  */
26*fa9e4066Sahrens 
27*fa9e4066Sahrens #ifndef _SYS_METASLAB_IMPL_H
28*fa9e4066Sahrens #define	_SYS_METASLAB_IMPL_H
29*fa9e4066Sahrens 
30*fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
31*fa9e4066Sahrens 
32*fa9e4066Sahrens #include <sys/metaslab.h>
33*fa9e4066Sahrens #include <sys/space_map.h>
34*fa9e4066Sahrens #include <sys/vdev.h>
35*fa9e4066Sahrens #include <sys/txg.h>
36*fa9e4066Sahrens #include <sys/avl.h>
37*fa9e4066Sahrens 
38*fa9e4066Sahrens #ifdef	__cplusplus
39*fa9e4066Sahrens extern "C" {
40*fa9e4066Sahrens #endif
41*fa9e4066Sahrens 
42*fa9e4066Sahrens struct metaslab_class {	/* allocation class; each metaslab_group refers to one through mg_class */
43*fa9e4066Sahrens 	metaslab_group_t	*mc_rotor;	/* a group in this class; NOTE(review): name suggests a rotating cursor over mg_prev/mg_next -- confirm in metaslab.c */
44*fa9e4066Sahrens 	uint64_t		mc_allocated;	/* NOTE(review): presumably a running allocation total; units not visible here -- confirm */
45*fa9e4066Sahrens };
46*fa9e4066Sahrens 
47*fa9e4066Sahrens struct metaslab_group {	/* collection of metaslabs tied to one vdev (mg_vd) and one class (mg_class) */
48*fa9e4066Sahrens 	kmutex_t		mg_lock;	/* group lock; NOTE(review): presumably guards mg_metaslab_tree -- confirm */
49*fa9e4066Sahrens 	avl_tree_t		mg_metaslab_tree;	/* AVL tree of metaslabs, linked via ms_group_node */
50*fa9e4066Sahrens 	uint64_t		mg_aliquot;	/* NOTE(review): meaning not visible in this header -- see metaslab.c */
51*fa9e4066Sahrens 	int64_t			mg_bias;	/* signed value; NOTE(review): meaning not visible in this header -- see metaslab.c */
52*fa9e4066Sahrens 	metaslab_class_t	*mg_class;	/* metaslab class for this group */
53*fa9e4066Sahrens 	vdev_t			*mg_vd;	/* vdev for this group */
54*fa9e4066Sahrens 	metaslab_group_t	*mg_prev;	/* previous group in the list of groups */
55*fa9e4066Sahrens 	metaslab_group_t	*mg_next;	/* next group in the list of groups */
56*fa9e4066Sahrens };
57*fa9e4066Sahrens 
58*fa9e4066Sahrens /*
59*fa9e4066Sahrens  * Each metaslab's free block list is kept in its own DMU object in the
60*fa9e4066Sahrens  * metaslab freelist dataset.  To minimize space consumption, the list
61*fa9e4066Sahrens  * is circular.
62*fa9e4066Sahrens  *
63*fa9e4066Sahrens  * Allocations and frees can happen in multiple transaction groups at
64*fa9e4066Sahrens  * the same time, which makes it a bit challenging to keep the metaslab
65*fa9e4066Sahrens  * consistent.  For example, we cannot allow frees from different
66*fa9e4066Sahrens  * transaction groups to be interleaved in the metaslab's free block list.
67*fa9e4066Sahrens  *
68*fa9e4066Sahrens  * We address this in several ways:
69*fa9e4066Sahrens  *
70*fa9e4066Sahrens  *	We don't allow allocations from the same metaslab in concurrent
71*fa9e4066Sahrens  *	transaction groups.  metaslab_alloc() enforces this by checking
72*fa9e4066Sahrens  *	the ms_last_alloc field, which specifies the last txg in which
73*fa9e4066Sahrens  *	the metaslab was used for allocations.
74*fa9e4066Sahrens  *
75*fa9e4066Sahrens  *	We can't segregate frees this way because we can't choose which
76*fa9e4066Sahrens  *	DVAs someone wants to free.  So we keep separate in-core freelists
77*fa9e4066Sahrens  *	for each active transaction group.  This in-core data is only
78*fa9e4066Sahrens  *	written to the metaslab's on-disk freelist in metaslab_sync(),
79*fa9e4066Sahrens  *	which solves the interleave problem: we only append frees from
80*fa9e4066Sahrens  *	the syncing txg to the on-disk freelist, so the appends all occur
81*fa9e4066Sahrens  *	in txg order.
82*fa9e4066Sahrens  *
83*fa9e4066Sahrens  *	We cannot allow a block which was freed in a given txg to be
84*fa9e4066Sahrens  *	allocated again until that txg has closed; otherwise, if we
85*fa9e4066Sahrens  *	failed to sync that txg and had to roll back to txg - 1,
86*fa9e4066Sahrens  *	changes in txg + 1 could have overwritten the data.  Therefore,
87*fa9e4066Sahrens  *	we partition the free blocks into "available" and "limbo" states.
88*fa9e4066Sahrens  *	A block is available if the txg in which it was freed has closed;
89*fa9e4066Sahrens  *	until then, the block is in limbo.  Each time metaslab_sync() runs,
90*fa9e4066Sahrens  *	it first adds any limbo blocks to the avail list, clears the limbo
91*fa9e4066Sahrens  *	list, and starts writing the new limbo blocks (i.e. the ones that
92*fa9e4066Sahrens  *	were freed in the syncing txg).
93*fa9e4066Sahrens  */
94*fa9e4066Sahrens 
95*fa9e4066Sahrens struct metaslab {	/* in-core state of one metaslab; per-txg arrays have TXG_SIZE slots */
96*fa9e4066Sahrens 	kmutex_t	ms_lock;	/* metaslab lock		*/
97*fa9e4066Sahrens 	space_map_obj_t	*ms_smo;	/* space map object		*/
98*fa9e4066Sahrens 	uint64_t	ms_last_alloc;	/* last txg in which this metaslab was used for allocations */
99*fa9e4066Sahrens 	uint64_t	ms_usable_end;	/* end of free_obj at last sync	*/
100*fa9e4066Sahrens 	uint64_t	ms_usable_space; /* usable space at last sync	*/
101*fa9e4066Sahrens 	metaslab_group_t *ms_group;	/* metaslab group this metaslab belongs to */
102*fa9e4066Sahrens 	avl_node_t	ms_group_node;	/* node in metaslab group tree (mg_metaslab_tree) */
103*fa9e4066Sahrens 	uint64_t	ms_weight;	/* weight vs. others in group	*/
104*fa9e4066Sahrens 	uint8_t		ms_dirty[TXG_SIZE];	/* per-txg dirty flags (MSD_* values) */
105*fa9e4066Sahrens 	space_map_t	ms_allocmap[TXG_SIZE];  /* space allocated, one map per txg slot */
106*fa9e4066Sahrens 	space_map_t	ms_freemap[TXG_SIZE];	/* space freed, one map per txg slot */
107*fa9e4066Sahrens 	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/
108*fa9e4066Sahrens 	space_map_t	ms_map;		/* in-core free space map	*/
109*fa9e4066Sahrens 	uint8_t		ms_map_incore;  /* nonzero when ms_map contents are valid */
110*fa9e4066Sahrens 	uint64_t	ms_map_cursor[SPA_ASIZEBITS]; /* XXX -- PPD; SPA_ASIZEBITS entries, purpose not documented in this header */
111*fa9e4066Sahrens };
112*fa9e4066Sahrens 
113*fa9e4066Sahrens /*
114*fa9e4066Sahrens  * ms_dirty[] flags (single-bit values; each ms_dirty[] entry is one uint8_t)
115*fa9e4066Sahrens  */
116*fa9e4066Sahrens #define	MSD_ALLOC	0x01	/* allocated from in this txg		*/
117*fa9e4066Sahrens #define	MSD_FREE	0x02	/* freed to in this txg			*/
118*fa9e4066Sahrens #define	MSD_ADD		0x04	/* added to the pool in this txg	*/
119*fa9e4066Sahrens #define	MSD_CONDENSE	0x08	/* condensed in this txg		*/
120*fa9e4066Sahrens 
121*fa9e4066Sahrens #ifdef	__cplusplus
122*fa9e4066Sahrens }
123*fa9e4066Sahrens #endif
124*fa9e4066Sahrens 
125*fa9e4066Sahrens #endif	/* _SYS_METASLAB_IMPL_H */
126