spa.h revision fa9e4066f08beec538e775443c5be79dd423fcab
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#ifndef _SYS_SPA_H
28#define	_SYS_SPA_H
29
30#pragma ident	"%Z%%M%	%I%	%E% SMI"
31
32#include <sys/avl.h>
33#include <sys/zfs_context.h>
34#include <sys/nvpair.h>
35#include <sys/sysmacros.h>
36#include <sys/types.h>
37#include <sys/fs/zfs.h>
38
39#ifdef	__cplusplus
40extern "C" {
41#endif
42
/*
 * Forward references that lots of things need.  The structures are
 * left incomplete here; this header only handles them by pointer.
 */
struct dsl_pool;
typedef struct traverse_handle traverse_handle_t;
typedef struct zilog zilog_t;
typedef struct metaslab metaslab_t;
typedef struct vdev vdev_t;
typedef struct spa spa_t;
52
/*
 * General-purpose 32-bit and 64-bit bitfield encodings.
 *
 * A field is identified by the position of its lowest bit (low) and its
 * width in bits (len).  The mask is built as 1 << len, so len must be
 * smaller than the width of the word (32 or 64).
 *
 * DECODE extracts the field from x.  ENCODE truncates x to len bits and
 * moves it into position, ready to be combined with a word.
 */
#define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
#define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))

#define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
#define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)

/*
 * Store val into the field of x, in place.  The current field is XORed
 * with val, and the encoded difference is XORed back into x, which flips
 * exactly the bits that differ.  x must be a modifiable lvalue and is
 * evaluated more than once.  Every argument is parenthesized so that
 * compound expressions such as (a | b) are stored as written.
 */
#define	BF32_SET(x, low, len, val)	\
	((x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len))
#define	BF64_SET(x, low, len, val)	\
	((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len))

/*
 * Shifted and biased variants: the value seen by the caller is
 * (field + bias) << shift, and the value stored is (val >> shift) - bias.
 */
#define	BF32_GET_SB(x, low, len, shift, bias)	\
	((BF32_GET(x, low, len) + (bias)) << (shift))
#define	BF64_GET_SB(x, low, len, shift, bias)	\
	((BF64_GET(x, low, len) + (bias)) << (shift))

#define	BF32_SET_SB(x, low, len, shift, bias, val)	\
	BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
#define	BF64_SET_SB(x, low, len, shift, bias, val)	\
	BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
78
/*
 * Nine block sizes are currently supported, from 512 bytes to 128K.
 * Larger blocks would be possible, but the benefits are near-zero and
 * the cost of COWing a giant block to modify one byte would become
 * excessive.
 */
#define	SPA_MINBLOCKSHIFT	9	/* log2 of the smallest block	*/
#define	SPA_MAXBLOCKSHIFT	17	/* log2 of the largest block	*/
#define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
#define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)

/* One size per power of two in [SPA_MINBLOCKSHIFT, SPA_MAXBLOCKSHIFT]. */
#define	SPA_BLOCKSIZES		(1 + SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT)
90
/*
 * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
 * The ASIZE encoding should be at least 64 times larger (6 more bits)
 * to support up to 4-way RAID-Z mirror mode with worst-case gang block
 * overhead, three DVAs per bp, plus one more bit in case we do anything
 * else that expands the ASIZE.  All three sizes are kept in units of
 * 512 bytes.
 */
#define	SPA_LSIZEBITS		16	/* LSIZE up to 32M (2^16 * 512)	*/
#define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
#define	SPA_ASIZEBITS		24	/* ASIZE up to 8G (2^24 * 512)	*/
101
/*
 * All SPA data is represented by 128-bit data virtual addresses (DVAs),
 * held as two 64-bit words.  The members of the dva_t should be
 * considered opaque outside the SPA.
 */
struct dva {
	uint64_t	dva_word[2];
};
typedef struct dva dva_t;

/*
 * Each block has a 256-bit checksum, held as four 64-bit words --
 * strong enough for cryptographic hashes.
 */
struct zio_cksum {
	uint64_t	zc_word[4];
};
typedef struct zio_cksum zio_cksum_t;
116
117/*
118 * Each block is described by its DVAs, time of birth, checksum, etc.
119 * The word-by-word, bit-by-bit layout of the blkptr is as follows:
120 *
121 *	64	56	48	40	32	24	16	8	0
122 *	+-------+-------+-------+-------+-------+-------+-------+-------+
123 * 0	|		vdev1		| GRID  |	  ASIZE		|
124 *	+-------+-------+-------+-------+-------+-------+-------+-------+
125 * 1	|G|			 offset1				|
126 *	+-------+-------+-------+-------+-------+-------+-------+-------+
127 * 2	|		vdev2		| GRID  |	  ASIZE		|
128 *	+-------+-------+-------+-------+-------+-------+-------+-------+
129 * 3	|G|			 offset2				|
130 *	+-------+-------+-------+-------+-------+-------+-------+-------+
131 * 4	|		vdev3		| GRID  |	  ASIZE		|
132 *	+-------+-------+-------+-------+-------+-------+-------+-------+
133 * 5	|G|			 offset3				|
134 *	+-------+-------+-------+-------+-------+-------+-------+-------+
135 * 6	|E| lvl | type	| cksum | comp	|     PSIZE	|     LSIZE	|
136 *	+-------+-------+-------+-------+-------+-------+-------+-------+
137 * 7	|			padding					|
138 *	+-------+-------+-------+-------+-------+-------+-------+-------+
139 * 8	|			padding					|
140 *	+-------+-------+-------+-------+-------+-------+-------+-------+
141 * 9	|			padding					|
142 *	+-------+-------+-------+-------+-------+-------+-------+-------+
143 * a	|			birth txg				|
144 *	+-------+-------+-------+-------+-------+-------+-------+-------+
145 * b	|			fill count				|
146 *	+-------+-------+-------+-------+-------+-------+-------+-------+
147 * c	|			checksum[0]				|
148 *	+-------+-------+-------+-------+-------+-------+-------+-------+
149 * d	|			checksum[1]				|
150 *	+-------+-------+-------+-------+-------+-------+-------+-------+
151 * e	|			checksum[2]				|
152 *	+-------+-------+-------+-------+-------+-------+-------+-------+
153 * f	|			checksum[3]				|
154 *	+-------+-------+-------+-------+-------+-------+-------+-------+
155 *
156 * Legend:
157 *
158 * vdev		virtual device ID
159 * offset	offset into virtual device
160 * LSIZE	logical size
161 * PSIZE	physical size (after compression)
162 * ASIZE	allocated size (including RAID-Z parity and gang block headers)
163 * GRID		RAID-Z layout information (reserved for future use)
164 * cksum	checksum function
165 * comp		compression function
166 * G		gang block indicator
167 * E		endianness
168 * type		DMU object type
169 * lvl		level of indirection
170 * birth txg	transaction group in which the block was born
171 * fill count	number of non-zero blocks under this bp
172 * checksum[4]	256-bit checksum of the data this bp describes
173 */
174typedef struct blkptr {
175	dva_t		blk_dva[3];	/* 128-bit Data Virtual Address	*/
176	uint64_t	blk_prop;	/* size, compression, type, etc	*/
177	uint64_t	blk_pad[3];	/* Extra space for the future	*/
178	uint64_t	blk_birth;	/* transaction group at birth	*/
179	uint64_t	blk_fill;	/* fill count			*/
180	zio_cksum_t	blk_cksum;	/* 256-bit checksum		*/
181} blkptr_t;
182
183#define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
184#define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
185
/*
 * Macros to get and set fields in a bp or DVA.
 *
 * DVA layout: word 0 holds ASIZE (bits 0-23), GRID (bits 24-31) and the
 * vdev ID (bits 32-63); word 1 holds the offset (bits 0-62) and the gang
 * bit (bit 63).  ASIZE and offset are stored shifted right by
 * SPA_MINBLOCKSHIFT.  Setter values are parenthesized so that compound
 * expressions are stored as written.  The dva argument is evaluated
 * more than once by the setters.
 */
#define	DVA_GET_ASIZE(dva)	\
	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
#define	DVA_SET_ASIZE(dva, x)	\
	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
	SPA_MINBLOCKSHIFT, 0, (x))

#define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
#define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, (x))

#define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, 32)
#define	DVA_SET_VDEV(dva, x)	BF64_SET((dva)->dva_word[0], 32, 32, (x))

#define	DVA_GET_OFFSET(dva)	\
	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
#define	DVA_SET_OFFSET(dva, x)	\
	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, (x))

#define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
#define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, (x))
207
/*
 * Accessors for the packed property word (blk_prop) of a block pointer:
 * LSIZE in bits 0-15, PSIZE in bits 16-31, compression in bits 32-39,
 * checksum in bits 40-47, type in bits 48-55, level in bits 56-60 and
 * byteorder in bit 63.  LSIZE and PSIZE are stored shifted right by
 * SPA_MINBLOCKSHIFT, minus one.  Setter values are parenthesized so that
 * compound expressions are stored as written.
 */

/* The logical size of a hole is reported as 0. */
#define	BP_GET_LSIZE(bp)	\
	(BP_IS_HOLE(bp) ? 0 : \
	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
#define	BP_SET_LSIZE(bp, x)	\
	BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, \
	SPA_MINBLOCKSHIFT, 1, (x))

#define	BP_GET_PSIZE(bp)	\
	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	BP_SET_PSIZE(bp, x)	\
	BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, \
	SPA_MINBLOCKSHIFT, 1, (x))

#define	BP_GET_COMPRESS(bp)	BF64_GET((bp)->blk_prop, 32, 8)
#define	BP_SET_COMPRESS(bp, x)	BF64_SET((bp)->blk_prop, 32, 8, (x))

#define	BP_GET_CHECKSUM(bp)	BF64_GET((bp)->blk_prop, 40, 8)
#define	BP_SET_CHECKSUM(bp, x)	BF64_SET((bp)->blk_prop, 40, 8, (x))

#define	BP_GET_TYPE(bp)		BF64_GET((bp)->blk_prop, 48, 8)
#define	BP_SET_TYPE(bp, x)	BF64_SET((bp)->blk_prop, 48, 8, (x))

#define	BP_GET_LEVEL(bp)	BF64_GET((bp)->blk_prop, 56, 5)
#define	BP_SET_LEVEL(bp, x)	BF64_SET((bp)->blk_prop, 56, 5, (x))

/* Negating the single bit yields either 0 or all ones. */
#define	BP_GET_BYTEORDER(bp)	(0 - BF64_GET((bp)->blk_prop, 63, 1))
#define	BP_SET_BYTEORDER(bp, x)	BF64_SET((bp)->blk_prop, 63, 1, (x))

/* Total allocated size: the sum of ASIZE over all three DVAs. */
#define	BP_GET_ASIZE(bp)	\
	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
	DVA_GET_ASIZE(&(bp)->blk_dva[2]))
237
/* Two DVAs are equal when neither of their two words differs. */
#define	DVA_EQUAL(dva1, dva2)	\
	(!((dva1)->dva_word[1] != (dva2)->dva_word[1] || \
	(dva1)->dva_word[0] != (dva2)->dva_word[0]))

/* A DVA counts as valid when its allocated size is non-zero. */
#define	DVA_IS_VALID(dva)	(0 != DVA_GET_ASIZE(dva))
243
/*
 * Store the four 64-bit words w0..w3 into the checksum that zcp points
 * to.  The body is wrapped in do { } while (0) so that the macro expands
 * to a single statement and can be used as the body of an unbraced
 * if/else.  zcp is evaluated four times.
 */
#define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
do {						\
	(zcp)->zc_word[0] = (w0);		\
	(zcp)->zc_word[1] = (w1);		\
	(zcp)->zc_word[2] = (w2);		\
	(zcp)->zc_word[3] = (w3);		\
_NOTE(CONSTCOND) } while (0)
251
/* A block pointer whose birth txg is zero describes a hole. */
#define	BP_IS_HOLE(bp)		(!(bp)->blk_birth)

/* The first DVA of a block pointer serves as its identity. */
#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
255
/*
 * Clear every field of the block pointer bp: all three DVAs, the
 * property word, the padding, the birth txg, the fill count and the
 * checksum.  The body is wrapped in do { } while (0) so that the macro
 * expands to a single statement and can be used as the body of an
 * unbraced if/else.  bp is evaluated more than once.
 */
#define	BP_ZERO(bp)				\
do {						\
	(bp)->blk_dva[0].dva_word[0] = 0;	\
	(bp)->blk_dva[0].dva_word[1] = 0;	\
	(bp)->blk_dva[1].dva_word[0] = 0;	\
	(bp)->blk_dva[1].dva_word[1] = 0;	\
	(bp)->blk_dva[2].dva_word[0] = 0;	\
	(bp)->blk_dva[2].dva_word[1] = 0;	\
	(bp)->blk_prop = 0;			\
	(bp)->blk_pad[0] = 0;			\
	(bp)->blk_pad[1] = 0;			\
	(bp)->blk_pad[2] = 0;			\
	(bp)->blk_birth = 0;			\
	(bp)->blk_fill = 0;			\
	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
_NOTE(CONSTCOND) } while (0)
272
/*
 * Note: the byteorder is either 0 or -1, both of which are palindromes.
 * This simplifies the endianness handling a bit.  Big-endian hosts use
 * 0 and all other hosts use -1.
 */
#if defined(_BIG_ENDIAN)
#define	ZFS_HOST_BYTEORDER	(0ULL)
#else
#define	ZFS_HOST_BYTEORDER	(-1ULL)
#endif

/* A block needs swapping when its byteorder is not the host's. */
#define	BP_SHOULD_BYTESWAP(bp)	(ZFS_HOST_BYTEORDER != BP_GET_BYTEORDER(bp))
284
285#include <sys/dmu.h>
286
287/*
288 * Routines found in spa.c
289 */
290
291/* state manipulation functions */
292extern int spa_open(const char *pool, spa_t **, void *tag);
293extern int spa_get_stats(const char *pool, nvlist_t **config);
294extern int spa_create(const char *pool, nvlist_t *config, char *altroot);
295extern int spa_import(const char *pool, nvlist_t *config, char *altroot);
296extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
297extern int spa_destroy(char *pool);
298extern int spa_export(char *pool);
299
300/* device manipulation */
301extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
302extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot);
303extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot,
304    int replacing);
305extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid,
306    int replace_done);
307extern void spa_vdev_replace_done(spa_t *spa);
308
309/* scrubbing */
310extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
311extern void spa_scrub_suspend(spa_t *spa);
312extern void spa_scrub_resume(spa_t *spa);
313extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
314
315/* spa syncing */
316extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
317extern void spa_sync_allpools(void);
318
319/*
320 * SPA configuration functions in spa_config.c
321 */
322extern void spa_config_sync(void);
323extern void spa_config_load(void);
324extern nvlist_t *spa_all_configs(uint64_t *);
325extern void spa_config_set(spa_t *spa, nvlist_t *config);
326extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
327    int getstats);
328
329/*
330 * Miscellaneous SPA routines in spa_misc.c
331 */
332
333/* Namespace manipulation */
334extern spa_t *spa_lookup(const char *name);
335extern spa_t *spa_add(const char *name);
336extern void spa_remove(spa_t *spa);
337extern spa_t *spa_next(spa_t *prev);
338
339/* Refcount functions */
340extern void spa_open_ref(spa_t *spa, void *tag);
341extern void spa_close(spa_t *spa, void *tag);
342extern boolean_t spa_refcount_zero(spa_t *spa);
343
344/* Pool configuration lock */
345extern void spa_config_enter(spa_t *spa, krw_t rw);
346extern void spa_config_exit(spa_t *spa);
347extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
348
349/* Pool vdev add/remove lock */
350extern uint64_t spa_vdev_enter(spa_t *spa);
351extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
352
353/* Accessor functions */
354extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
355extern int spa_traverse_wanted(spa_t *spa);
356extern struct dsl_pool *spa_get_dsl(spa_t *spa);
357extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
358extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
359extern void spa_altroot(spa_t *, char *, size_t);
360extern int spa_sync_pass(spa_t *spa);
361extern char *spa_name(spa_t *spa);
362extern uint64_t spa_guid(spa_t *spa);
363extern uint64_t spa_last_synced_txg(spa_t *spa);
364extern uint64_t spa_first_txg(spa_t *spa);
365extern int spa_state(spa_t *spa);
366extern uint64_t spa_freeze_txg(spa_t *spa);
367struct metaslab_class;
368extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
369extern uint64_t spa_get_alloc(spa_t *spa);
370extern uint64_t spa_get_space(spa_t *spa);
371extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
372extern int spa_busy(void);
373
374/* Miscellaneous support routines */
375extern int spa_rename(const char *oldname, const char *newname);
376extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
377extern char *spa_strdup(const char *);
378extern void spa_strfree(char *);
379extern uint64_t spa_get_random(uint64_t range);
380extern void sprintf_blkptr(char *buf, blkptr_t *bp);
381extern void spa_freeze(spa_t *spa);
382extern void spa_evict_all(void);
383
384/* Initialization and termination */
385extern void spa_init(int flags);
386extern void spa_fini(void);
387
/*
 * dprintf_bp(bp, fmt, ...) expands to nothing unless ZFS_DEBUG is
 * defined.  With ZFS_DEBUG, and only when ZFS_DEBUG_DPRINTF is set in
 * zfs_flags, it formats bp with sprintf_blkptr() into a 200-byte buffer
 * on the stack and hands fmt, the caller's arguments and the formatted
 * bp to dprintf().  At least one argument must follow fmt, because
 * __VA_ARGS__ is followed by a comma in the expansion.
 */
#ifndef ZFS_DEBUG
#define	dprintf_bp(bp, fmt, ...)
#else
#define	dprintf_bp(bp, fmt, ...) do {				\
	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
		char __blkbuf[200];				\
		sprintf_blkptr(__blkbuf, (bp));			\
		dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);	\
	}							\
_NOTE(CONSTCOND) } while (0)
#endif

extern int spa_mode;			/* mode, e.g. FREAD | FWRITE */
401
402#ifdef	__cplusplus
403}
404#endif
405
406#endif	/* _SYS_SPA_H */
407