/* spa.h revision ad135b5d644628e791c3188a6ecbd9c257961ef8 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
260a44ef6jacobs
277c478bdstevel@tonic-gate#ifndef _SYS_SPA_H
28c3377eeJohn Levon#define	_SYS_SPA_H
297c478bdstevel@tonic-gate
307c478bdstevel@tonic-gate#include <sys/avl.h>
317c478bdstevel@tonic-gate#include <sys/zfs_context.h>
327c478bdstevel@tonic-gate#include <sys/nvpair.h>
337c478bdstevel@tonic-gate#include <sys/sysmacros.h>
347c478bdstevel@tonic-gate#include <sys/types.h>
357c478bdstevel@tonic-gate#include <sys/fs/zfs.h>
367c478bdstevel@tonic-gate
377c478bdstevel@tonic-gate#ifdef	__cplusplus
387c478bdstevel@tonic-gateextern "C" {
397c478bdstevel@tonic-gate#endif
407c478bdstevel@tonic-gate
/*
 * Forward references that lots of things need.
 *
 * Only the type names are introduced here; the structures themselves are
 * not defined in this header, so they can only be used through pointers.
 */
typedef struct spa spa_t;
typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;
typedef struct metaslab_group metaslab_group_t;
typedef struct metaslab_class metaslab_class_t;
typedef struct zio zio_t;
typedef struct zilog zilog_t;
typedef struct spa_aux_vdev spa_aux_vdev_t;
typedef struct ddt ddt_t;
typedef struct ddt_entry ddt_entry_t;
struct dsl_pool;
557c478bdstevel@tonic-gate
/*
 * General-purpose 32-bit and 64-bit bitfield encodings.
 *
 * A field is described by (low, len): `len' bits of the word starting at
 * bit position `low'.  Everything here is built on P2PHASE(), which is not
 * defined in this header (presumably supplied by <sys/sysmacros.h>); the
 * descriptions below assume P2PHASE(v, 1 << len) yields the low `len' bits
 * of v.
 *
 *	BFnn_DECODE(x, low, len)	value of the field held in x
 *	BFnn_ENCODE(x, low, len)	value x moved into field position
 *	BFnn_GET(x, low, len)		alias for BFnn_DECODE
 *	BFnn_SET(x, low, len, val)	store val into the field of lvalue x
 *	BFnn_GET_SB(..., shift, bias)	(field + bias) << shift
 *	BFnn_SET_SB(..., shift, bias, val)
 *					store (val >> shift) - bias
 *
 * BFnn_SET works by XORing x with the encoded difference between the
 * current field and val, so only the bits of that field can change.  It
 * evaluates x twice; x must be a side-effect-free lvalue.  All macro
 * arguments are parenthesized so that expression arguments (for example
 * a conditional expression passed as `low') are evaluated as a unit.
 */
#define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
#define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))

#define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
#define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)

#define	BF32_SET(x, low, len, val)	\
	((x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len))
#define	BF64_SET(x, low, len, val)	\
	((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len))

#define	BF32_GET_SB(x, low, len, shift, bias)	\
	((BF32_GET(x, low, len) + (bias)) << (shift))
#define	BF64_GET_SB(x, low, len, shift, bias)	\
	((BF64_GET(x, low, len) + (bias)) << (shift))

#define	BF32_SET_SB(x, low, len, shift, bias, val)	\
	BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
#define	BF64_SET_SB(x, low, len, shift, bias, val)	\
	BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
/*
 * We currently support nine block sizes, from 512 bytes to 128K.
 * We could go higher, but the benefits are near-zero and the cost
 * of COWing a giant block to modify one byte would become excessive.
 *
 * The sizes are powers of two; the *SHIFT values are their log2 and
 * SPA_BLOCKSIZES is the number of distinct shifts in that range.
 */
#define	SPA_MINBLOCKSHIFT	9
#define	SPA_MAXBLOCKSHIFT	17
#define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
#define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)

#define	SPA_BLOCKSIZES		(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
937c478bdstevel@tonic-gate
/*
 * Size of block to hold the configuration data (a packed nvlist): 16K.
 */
#define	SPA_CONFIG_BLOCKSIZE	(1ULL << 14)
987c478bdstevel@tonic-gate
/*
 * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
 * The ASIZE encoding should be at least 64 times larger (6 more bits)
 * to support up to 4-way RAID-Z mirror mode with worst-case gang block
 * overhead, three DVAs per bp, plus one more bit in case we do anything
 * else that expands the ASIZE.
 */
#define	SPA_LSIZEBITS		16	/* LSIZE up to 32M (2^16 * 512)	*/
#define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
#define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
1097c478bdstevel@tonic-gate
/*
 * All SPA data is represented by 128-bit data virtual addresses (DVAs).
 * The members of the dva_t should be considered opaque outside the SPA;
 * use the DVA_GET_ and DVA_SET_ macros to access individual fields.
 */
typedef struct dva {
	uint64_t	dva_word[2];
} dva_t;
1177c478bdstevel@tonic-gate
/*
 * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
 * It is stored as four 64-bit words.
 */
typedef struct zio_cksum {
	uint64_t	zc_word[4];
} zio_cksum_t;
1247c478bdstevel@tonic-gate
1257c478bdstevel@tonic-gate/*
1267c478bdstevel@tonic-gate * Each block is described by its DVAs, time of birth, checksum, etc.
1277c478bdstevel@tonic-gate * The word-by-word, bit-by-bit layout of the blkptr is as follows:
1287c478bdstevel@tonic-gate *
1297c478bdstevel@tonic-gate *	64	56	48	40	32	24	16	8	0
1307c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1317c478bdstevel@tonic-gate * 0	|		vdev1		| GRID  |	  ASIZE		|
1327c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1337c478bdstevel@tonic-gate * 1	|G|			 offset1				|
1347c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1357c478bdstevel@tonic-gate * 2	|		vdev2		| GRID  |	  ASIZE		|
1367c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1377c478bdstevel@tonic-gate * 3	|G|			 offset2				|
1387c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1397c478bdstevel@tonic-gate * 4	|		vdev3		| GRID  |	  ASIZE		|
1407c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1417c478bdstevel@tonic-gate * 5	|G|			 offset3				|
1427c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1437c478bdstevel@tonic-gate * 6	|BDX|lvl| type	| cksum | comp	|     PSIZE	|     LSIZE	|
1447c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1457c478bdstevel@tonic-gate * 7	|			padding					|
1467c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1477c478bdstevel@tonic-gate * 8	|			padding					|
1487c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1497c478bdstevel@tonic-gate * 9	|			physical birth txg			|
1507c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1517c478bdstevel@tonic-gate * a	|			logical birth txg			|
1527c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1537c478bdstevel@tonic-gate * b	|			fill count				|
1547c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1557c478bdstevel@tonic-gate * c	|			checksum[0]				|
1567c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1577c478bdstevel@tonic-gate * d	|			checksum[1]				|
1587c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1597c478bdstevel@tonic-gate * e	|			checksum[2]				|
1607c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1617c478bdstevel@tonic-gate * f	|			checksum[3]				|
1627c478bdstevel@tonic-gate *	+-------+-------+-------+-------+-------+-------+-------+-------+
1637c478bdstevel@tonic-gate *
1647c478bdstevel@tonic-gate * Legend:
1657c478bdstevel@tonic-gate *
1667c478bdstevel@tonic-gate * vdev		virtual device ID
1677c478bdstevel@tonic-gate * offset	offset into virtual device
1687c478bdstevel@tonic-gate * LSIZE	logical size
1697c478bdstevel@tonic-gate * PSIZE	physical size (after compression)
1707c478bdstevel@tonic-gate * ASIZE	allocated size (including RAID-Z parity and gang block headers)
1717c478bdstevel@tonic-gate * GRID		RAID-Z layout information (reserved for future use)
1727c478bdstevel@tonic-gate * cksum	checksum function
1737c478bdstevel@tonic-gate * comp		compression function
1747c478bdstevel@tonic-gate * G		gang block indicator
1757c478bdstevel@tonic-gate * B		byteorder (endianness)
1767c478bdstevel@tonic-gate * D		dedup
1777c478bdstevel@tonic-gate * X		unused
1787c478bdstevel@tonic-gate * lvl		level of indirection
1797c478bdstevel@tonic-gate * type		DMU object type
1807c478bdstevel@tonic-gate * phys birth	txg of block allocation; zero if same as logical birth txg
1817c478bdstevel@tonic-gate * log. birth	transaction group in which the block was logically born
1827c478bdstevel@tonic-gate * fill count	number of non-zero blocks under this bp
1837c478bdstevel@tonic-gate * checksum[4]	256-bit checksum of the data this bp describes
1847c478bdstevel@tonic-gate */
1857c478bdstevel@tonic-gate#define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
1867c478bdstevel@tonic-gate#define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
1877c478bdstevel@tonic-gate
1887c478bdstevel@tonic-gatetypedef struct blkptr {
1897c478bdstevel@tonic-gate	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
1907c478bdstevel@tonic-gate	uint64_t	blk_prop;	/* size, compression, type, etc	    */
1917c478bdstevel@tonic-gate	uint64_t	blk_pad[2];	/* Extra space for the future	    */
1927c478bdstevel@tonic-gate	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
1937c478bdstevel@tonic-gate	uint64_t	blk_birth;	/* transaction group at birth	    */
1947c478bdstevel@tonic-gate	uint64_t	blk_fill;	/* fill count			    */
1957c478bdstevel@tonic-gate	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
1967c478bdstevel@tonic-gate} blkptr_t;
1977c478bdstevel@tonic-gate
/*
 * Macros to get and set fields in a bp or DVA.
 *
 * DVA accessors.  `dva' is a pointer to a dva_t.  Word 0 holds ASIZE
 * (bits 0-23), GRID (bits 24-31) and the vdev ID (bits 32-63); word 1 holds
 * the offset (bits 0-62) and the gang bit (bit 63).  ASIZE and offset are
 * stored shifted right by SPA_MINBLOCKSHIFT and are returned/accepted in
 * unshifted form.
 */
#define	DVA_GET_ASIZE(dva)	\
	BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
#define	DVA_SET_ASIZE(dva, x)	\
	BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)

#define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
#define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)

#define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, 32)
#define	DVA_SET_VDEV(dva, x)	BF64_SET((dva)->dva_word[0], 32, 32, x)

#define	DVA_GET_OFFSET(dva)	\
	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
#define	DVA_SET_OFFSET(dva, x)	\
	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)

#define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
#define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
219
/*
 * Accessors for the fields packed into (bp)->blk_prop.  `bp' is a pointer
 * to a blkptr_t.
 *
 * LSIZE and PSIZE are stored as (size >> SPA_MINBLOCKSHIFT) - 1 in 16 bits
 * each; the getters undo that encoding.
 */
#define	BP_GET_LSIZE(bp)	\
	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
#define	BP_SET_LSIZE(bp, x)	\
	BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)

#define	BP_GET_PSIZE(bp)	\
	BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
#define	BP_SET_PSIZE(bp, x)	\
	BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)

#define	BP_GET_COMPRESS(bp)		BF64_GET((bp)->blk_prop, 32, 8)
#define	BP_SET_COMPRESS(bp, x)		BF64_SET((bp)->blk_prop, 32, 8, x)

#define	BP_GET_CHECKSUM(bp)		BF64_GET((bp)->blk_prop, 40, 8)
#define	BP_SET_CHECKSUM(bp, x)		BF64_SET((bp)->blk_prop, 40, 8, x)

#define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
#define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)

#define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
#define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)

#define	BP_GET_PROP_BIT_61(bp)		BF64_GET((bp)->blk_prop, 61, 1)
#define	BP_SET_PROP_BIT_61(bp, x)	BF64_SET((bp)->blk_prop, 61, 1, x)

#define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
#define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)

/*
 * The getter negates the single stored bit, so it yields 0 or an all-ones
 * value rather than 0 or 1.
 */
#define	BP_GET_BYTEORDER(bp)		(0 - BF64_GET((bp)->blk_prop, 63, 1))
#define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
250
/*
 * Physical birth txg of a bp.  blk_phys_birth is stored as zero when it
 * equals the logical birth, so fall back to blk_birth in that case.
 */
#define	BP_PHYSICAL_BIRTH(bp)		\
	((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)

/*
 * Record both birth txgs, storing 0 for the physical birth when it is the
 * same as the logical birth.  Expands to a brace-enclosed block; `logical'
 * and `physical' are each evaluated more than once.
 */
#define	BP_SET_BIRTH(bp, logical, physical)	\
{						\
	(bp)->blk_birth = (logical);		\
	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
}
259
/* Sum of the allocated sizes of all three DVAs of a bp. */
#define	BP_GET_ASIZE(bp)	\
	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
		DVA_GET_ASIZE(&(bp)->blk_dva[2]))

/*
 * PSIZE for blocks above level 0 or whose type is metadata (as decided by
 * DMU_OT_IS_METADATA, defined elsewhere); LSIZE for everything else.
 */
#define	BP_GET_UCSIZE(bp) \
	((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))

/* Number of DVAs in a bp that have a non-zero ASIZE (0 through 3). */
#define	BP_GET_NDVAS(bp)	\
	(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
	!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))

/* Number of DVAs in a bp that have the gang bit set (0 through 3). */
#define	BP_COUNT_GANG(bp)	\
	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
	DVA_GET_GANG(&(bp)->blk_dva[2]))
277
/* True when both words of two DVAs (given by pointer) are identical. */
#define	DVA_EQUAL(dva1, dva2)	\
	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
	(dva1)->dva_word[0] == (dva2)->dva_word[0])

/*
 * True when two bps have the same physical birth txg and the same three
 * DVAs.  No other field of the bp is compared.
 */
#define	BP_EQUAL(bp1, bp2)	\
	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))

/*
 * True when two checksums (passed by value, not by pointer) are equal.
 * The per-word differences are ORed together, so the result is zero only
 * if every word matches; there is no short-circuit evaluation.
 */
#define	ZIO_CHECKSUM_EQUAL(zc1, zc2) \
	(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
	((zc1).zc_word[1] - (zc2).zc_word[1]) | \
	((zc1).zc_word[2] - (zc2).zc_word[2]) | \
	((zc1).zc_word[3] - (zc2).zc_word[3])))

/* A DVA is considered valid when its ASIZE is non-zero. */
#define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
295
/*
 * Store the four words w0..w3 into the checksum pointed to by zcp.
 * Expands to a brace-enclosed block; zcp is evaluated four times.
 */
#define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
{						\
	(zcp)->zc_word[0] = w0;			\
	(zcp)->zc_word[1] = w1;			\
	(zcp)->zc_word[2] = w2;			\
	(zcp)->zc_word[3] = w3;			\
}
303
/*
 * BP_IDENTITY yields a pointer to the first DVA of the bp; BP_IS_GANG
 * reports that DVA's gang bit.  A bp whose logical birth txg is zero is
 * treated as a hole.
 */
#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
#define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
#define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)

/* BP_IS_RAIDZ(bp) assumes no block compression */
#define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
				BP_GET_PSIZE(bp))
311
/*
 * Clear every word of a bp: all three DVAs, the property word, both pad
 * words, both birth txgs, the fill count and the checksum.  Expands to a
 * brace-enclosed block and evaluates bp once per assignment.
 */
#define	BP_ZERO(bp)				\
{						\
	(bp)->blk_dva[0].dva_word[0] = 0;	\
	(bp)->blk_dva[0].dva_word[1] = 0;	\
	(bp)->blk_dva[1].dva_word[0] = 0;	\
	(bp)->blk_dva[1].dva_word[1] = 0;	\
	(bp)->blk_dva[2].dva_word[0] = 0;	\
	(bp)->blk_dva[2].dva_word[1] = 0;	\
	(bp)->blk_prop = 0;			\
	(bp)->blk_pad[0] = 0;			\
	(bp)->blk_pad[1] = 0;			\
	(bp)->blk_phys_birth = 0;		\
	(bp)->blk_birth = 0;			\
	(bp)->blk_fill = 0;			\
	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
}
328
/*
 * Note: the byteorder is either 0 or -1, both of which are palindromes.
 * This simplifies the endianness handling a bit.
 *
 * The host value is 0 when _BIG_ENDIAN is defined and -1 otherwise.
 */
#ifdef _BIG_ENDIAN
#define	ZFS_HOST_BYTEORDER	(0ULL)
#else
#define	ZFS_HOST_BYTEORDER	(-1ULL)
#endif

/* True when the bp's recorded byteorder differs from the host's. */
#define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)

/* Buffer size, in bytes, assumed by SPRINTF_BLKPTR and dprintf_bp. */
#define	BP_SPRINTF_LEN	320
342
/*
 * This macro allows code sharing between zfs, libzpool, and mdb.
 * 'func' is either snprintf() or mdb_snprintf().
 * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
 *
 * 'buf' must have room for BP_SPRINTF_LEN bytes; the macro ASSERTs that
 * the formatted length stayed below that.  'bp' may be NULL (prints
 * "<NULL>") or a hole (prints "<hole>").  'type', 'checksum' and
 * 'compress' are strings printed verbatim via %s.
 *
 * Expands to a brace-enclosed block that declares the locals copyname,
 * size, len, copies, d and dva.  The 'bp' and 'buf' arguments are
 * evaluated several times and are parenthesized at every use.
 */
#define	SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress)	\
{									\
	static const char *copyname[] =					\
	    { "zero", "single", "double", "triple" };			\
	int size = BP_SPRINTF_LEN;					\
	int len = 0;							\
	int copies = 0;							\
									\
	if ((bp) == NULL) {						\
		len = func((buf) + len, size - len, "<NULL>");		\
	} else if (BP_IS_HOLE(bp)) {					\
		len = func((buf) + len, size - len, "<hole>");		\
	} else {							\
		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
			const dva_t *dva = &(bp)->blk_dva[d];		\
			if (DVA_IS_VALID(dva))				\
				copies++;				\
			len += func((buf) + len, size - len,		\
			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
			    (u_longlong_t)DVA_GET_VDEV(dva),		\
			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
			    ws);					\
		}							\
		if (BP_IS_GANG(bp) &&					\
		    DVA_GET_ASIZE(&(bp)->blk_dva[2]) <=			\
		    DVA_GET_ASIZE(&(bp)->blk_dva[1]) / 2)		\
			copies--;					\
		len += func((buf) + len, size - len,			\
		    "[L%llu %s] %s %s %s %s %s %s%c"			\
		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
		    "cksum=%llx:%llx:%llx:%llx",			\
		    (u_longlong_t)BP_GET_LEVEL(bp),			\
		    type,						\
		    checksum,						\
		    compress,						\
		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
		    copyname[copies],					\
		    ws,							\
		    (u_longlong_t)BP_GET_LSIZE(bp),			\
		    (u_longlong_t)BP_GET_PSIZE(bp),			\
		    (u_longlong_t)(bp)->blk_birth,			\
		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
		    (u_longlong_t)(bp)->blk_fill,			\
		    ws,							\
		    (u_longlong_t)(bp)->blk_cksum.zc_word[0],		\
		    (u_longlong_t)(bp)->blk_cksum.zc_word[1],		\
		    (u_longlong_t)(bp)->blk_cksum.zc_word[2],		\
		    (u_longlong_t)(bp)->blk_cksum.zc_word[3]);		\
	}								\
	ASSERT(len < size);						\
}
402
403#include <sys/dmu.h>
404
/*
 * ARC_BUFC_METADATA for blocks above level 0 or whose type is metadata
 * (per DMU_OT_IS_METADATA); ARC_BUFC_DATA otherwise.  Both ARC_BUFC_*
 * names and DMU_OT_IS_METADATA are defined outside this header.
 */
#define	BP_GET_BUFC_TYPE(bp)						\
	(((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \
	ARC_BUFC_METADATA : ARC_BUFC_DATA)
408
/* Import type selector; enumerators take the default values 0 and 1. */
typedef enum spa_import_type {
	SPA_IMPORT_EXISTING,
	SPA_IMPORT_ASSEMBLE
} spa_import_type_t;
413
414/* state manipulation functions */
415extern int spa_open(const char *pool, spa_t **, void *tag);
416extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
417    nvlist_t *policy, nvlist_t **config);
418extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
419    size_t buflen);
420extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
421    const char *history_str, nvlist_t *zplprops);
422extern int spa_import_rootpool(char *devpath, char *devid);
423extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
424    uint64_t flags);
425extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
426extern int spa_destroy(char *pool);
427extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
428    boolean_t hardforce);
429extern int spa_reset(char *pool);
430extern void spa_async_request(spa_t *spa, int flag);
431extern void spa_async_unrequest(spa_t *spa, int flag);
432extern void spa_async_suspend(spa_t *spa);
433extern void spa_async_resume(spa_t *spa);
434extern spa_t *spa_inject_addref(char *pool);
435extern void spa_inject_delref(spa_t *spa);
436extern void spa_scan_stat_init(spa_t *spa);
437extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
438
/*
 * SPA_ASYNC_ request bits.  Each is a distinct single bit, so they can be
 * ORed together; presumably these are the `flag' values taken by
 * spa_async_request() and spa_async_unrequest().
 */
#define	SPA_ASYNC_CONFIG_UPDATE	0x01
#define	SPA_ASYNC_REMOVE	0x02
#define	SPA_ASYNC_PROBE		0x04
#define	SPA_ASYNC_RESILVER_DONE	0x08
#define	SPA_ASYNC_RESILVER	0x10
#define	SPA_ASYNC_AUTOEXPAND	0x20
#define	SPA_ASYNC_REMOVE_DONE	0x40
#define	SPA_ASYNC_REMOVE_STOP	0x80

/*
 * Controls the behavior of spa_vdev_remove().
 */
#define	SPA_REMOVE_UNSPARE	0x01
#define	SPA_REMOVE_DONE		0x02
453
454/* device manipulation */
455extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
456extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
457    int replacing);
458extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
459    int replace_done);
460extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
461extern boolean_t spa_vdev_remove_active(spa_t *spa);
462extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
463extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
464extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
465    nvlist_t *props, boolean_t exp);
466
467/* spare state (which is global across all pools) */
468extern void spa_spare_add(vdev_t *vd);
469extern void spa_spare_remove(vdev_t *vd);
470extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
471extern void spa_spare_activate(vdev_t *vd);
472
473/* L2ARC state (which is global across all pools) */
474extern void spa_l2cache_add(vdev_t *vd);
475extern void spa_l2cache_remove(vdev_t *vd);
476extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
477extern void spa_l2cache_activate(vdev_t *vd);
478extern void spa_l2cache_drop(spa_t *spa);
479
480/* scanning */
481extern int spa_scan(spa_t *spa, pool_scan_func_t func);
482extern int spa_scan_stop(spa_t *spa);
483
484/* spa syncing */
485extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
486extern void spa_sync_allpools(void);
487
/*
 * Sync-pass thresholds.
 *
 * DEFERRED_FREE must be large enough that regular blocks are not
 * deferred.  XXX so can't we change it back to 1?
 */
#define	SYNC_PASS_DEFERRED_FREE	2	/* defer frees after this pass */
#define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
#define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
495
496/* spa namespace global mutex */
497extern kmutex_t spa_namespace_lock;
498
499/*
500 * SPA configuration functions in spa_config.c
501 */
502
503#define	SPA_CONFIG_UPDATE_POOL	0
504#define	SPA_CONFIG_UPDATE_VDEVS	1
505
506extern void spa_config_sync(spa_t *, boolean_t, boolean_t);
507extern void spa_config_load(void);
508extern nvlist_t *spa_all_configs(uint64_t *);
509extern void spa_config_set(spa_t *spa, nvlist_t *config);
510extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
511    int getstats);
512extern void spa_config_update(spa_t *spa, int what);
513
514/*
515 * Miscellaneous SPA routines in spa_misc.c
516 */
517
518/* Namespace manipulation */
519extern spa_t *spa_lookup(const char *name);
520extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
521extern void spa_remove(spa_t *spa);
522extern spa_t *spa_next(spa_t *prev);
523
524/* Refcount functions */
525extern void spa_open_ref(spa_t *spa, void *tag);
526extern void spa_close(spa_t *spa, void *tag);
527extern boolean_t spa_refcount_zero(spa_t *spa);
528
/*
 * SCL_ lock bits.  Each lock is a distinct single bit so that several can
 * be named at once by ORing them together.  SCL_LOCKS is the number of
 * lock bits and SCL_ALL is the mask covering all of them.
 */
#define	SCL_NONE	0x00
#define	SCL_CONFIG	0x01
#define	SCL_STATE	0x02
#define	SCL_L2ARC	0x04		/* hack until L2ARC 2.0 */
#define	SCL_ALLOC	0x08
#define	SCL_ZIO		0x10
#define	SCL_FREE	0x20
#define	SCL_VDEV	0x40
#define	SCL_LOCKS	7
#define	SCL_ALL		((1 << SCL_LOCKS) - 1)
#define	SCL_STATE_ALL	(SCL_STATE | SCL_L2ARC | SCL_ZIO)
540
541/* Pool configuration locks */
542extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
543extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
544extern void spa_config_exit(spa_t *spa, int locks, void *tag);
545extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
546
547/* Pool vdev add/remove lock */
548extern uint64_t spa_vdev_enter(spa_t *spa);
549extern uint64_t spa_vdev_config_enter(spa_t *spa);
550extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
551    int error, char *tag);
552extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
553
554/* Pool vdev state change lock */
555extern void spa_vdev_state_enter(spa_t *spa, int oplock);
556extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
557
/*
 * Log state.  SPA_LOG_UNKNOWN is explicitly zero; the remaining states
 * follow consecutively.
 */
typedef enum spa_log_state {
	SPA_LOG_UNKNOWN = 0,	/* unknown log state */
	SPA_LOG_MISSING,	/* missing log(s) */
	SPA_LOG_CLEAR,		/* clear the log(s) */
	SPA_LOG_GOOD		/* log(s) are good */
} spa_log_state_t;
565
566extern spa_log_state_t spa_get_log_state(spa_t *spa);
567extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
568extern int spa_offline_log(spa_t *spa);
569
570/* Log claim callback */
571extern void spa_claim_notify(zio_t *zio);
572
573/* Accessor functions */
574extern boolean_t spa_shutting_down(spa_t *spa);
575extern struct dsl_pool *spa_get_dsl(spa_t *spa);
576extern boolean_t spa_is_initializing(spa_t *spa);
577extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
578extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
579extern void spa_altroot(spa_t *, char *, size_t);
580extern int spa_sync_pass(spa_t *spa);
581extern char *spa_name(spa_t *spa);
582extern uint64_t spa_guid(spa_t *spa);
583extern uint64_t spa_load_guid(spa_t *spa);
584extern uint64_t spa_last_synced_txg(spa_t *spa);
585extern uint64_t spa_first_txg(spa_t *spa);
586extern uint64_t spa_syncing_txg(spa_t *spa);
587extern uint64_t spa_version(spa_t *spa);
588extern pool_state_t spa_state(spa_t *spa);
589extern spa_load_state_t spa_load_state(spa_t *spa);
590extern uint64_t spa_freeze_txg(spa_t *spa);
591extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
592extern uint64_t spa_get_dspace(spa_t *spa);
593extern void spa_update_dspace(spa_t *spa);
594extern uint64_t spa_version(spa_t *spa);
595extern boolean_t spa_deflate(spa_t *spa);
596extern metaslab_class_t *spa_normal_class(spa_t *spa);
597extern metaslab_class_t *spa_log_class(spa_t *spa);
598extern int spa_max_replication(spa_t *spa);
599extern int spa_prev_software_version(spa_t *spa);
600extern int spa_busy(void);
601extern uint8_t spa_get_failmode(spa_t *spa);
602extern boolean_t spa_suspended(spa_t *spa);
603extern uint64_t spa_bootfs(spa_t *spa);
604extern uint64_t spa_delegation(spa_t *spa);
605extern objset_t *spa_meta_objset(spa_t *spa);
606
607/* Miscellaneous support routines */
608extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
609extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
610extern int spa_rename(const char *oldname, const char *newname);
611extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
612extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
613extern char *spa_strdup(const char *);
614extern void spa_strfree(char *);
615extern uint64_t spa_get_random(uint64_t range);
616extern uint64_t spa_generate_guid(spa_t *spa);
617extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
618extern void spa_freeze(spa_t *spa);
619extern int spa_change_guid(spa_t *spa);
620extern void spa_upgrade(spa_t *spa, uint64_t version);
621extern void spa_evict_all(void);
622extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
623    boolean_t l2cache);
624extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
625extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
626extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
627extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
628extern boolean_t spa_has_slogs(spa_t *spa);
629extern boolean_t spa_is_root(spa_t *spa);
630extern boolean_t spa_writeable(spa_t *spa);
631
632extern int spa_mode(spa_t *spa);
633extern uint64_t strtonum(const char *str, char **nptr);
634
/* history logging: kind of history record (values 0, 1 and 2) */
typedef enum history_log_type {
	LOG_CMD_POOL_CREATE,
	LOG_CMD_NORMAL,
	LOG_INTERNAL
} history_log_type_t;
641
642typedef struct history_arg {
643	char *ha_history_str;
644	history_log_type_t ha_log_type;
645	history_internal_events_t ha_event;
646	char *ha_zone;
647	uid_t ha_uid;
648} history_arg_t;
649
650extern char *spa_his_ievent_table[];
651
652extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
653extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
654    char *his_buf);
655extern int spa_history_log(spa_t *spa, const char *his_buf,
656    history_log_type_t what);
657extern void spa_history_log_internal(history_internal_events_t event,
658    spa_t *spa, dmu_tx_t *tx, const char *fmt, ...);
659extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
660
661/* error handling */
662struct zbookmark;
663extern void spa_log_error(spa_t *spa, zio_t *zio);
664extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
665    zio_t *zio, uint64_t stateoroffset, uint64_t length);
666extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
667extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
668extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
669extern uint64_t spa_get_errlog_size(spa_t *spa);
670extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
671extern void spa_errlog_rotate(spa_t *spa);
672extern void spa_errlog_drain(spa_t *spa);
673extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
674extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
675
676/* vdev cache */
677extern void vdev_cache_stat_init(void);
678extern void vdev_cache_stat_fini(void);
679
680/* Initialization and termination */
681extern void spa_init(int flags);
682extern void spa_fini(void);
683extern void spa_boot_init();
684
685/* properties */
686extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
687extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
688extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
689extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
690
691/* asynchronous event notification */
692extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
693
/*
 * dprintf_bp(bp, fmt, ...): debug print of a message followed by the
 * formatted block pointer.
 *
 * With ZFS_DEBUG defined, and only when ZFS_DEBUG_DPRINTF is set in
 * zfs_flags, a BP_SPRINTF_LEN-byte buffer is allocated, filled by
 * sprintf_blkptr(), printed through dprintf() after the caller's message,
 * and freed.  At least one argument must follow fmt, because __VA_ARGS__
 * is followed by a comma-separated argument in the expansion.
 *
 * Without ZFS_DEBUG the macro expands to nothing.
 */
#ifdef ZFS_DEBUG
#define	dprintf_bp(bp, fmt, ...) do {				\
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { 			\
	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
	sprintf_blkptr(__blkbuf, (bp));				\
	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
	} \
_NOTE(CONSTCOND) } while (0)
#else
#define	dprintf_bp(bp, fmt, ...)
#endif
706
707extern boolean_t spa_debug_enabled(spa_t *spa);
708#define	spa_dbgmsg(spa, ...)			\
709{						\
710	if (spa_debug_enabled(spa))		\
711		zfs_dbgmsg(__VA_ARGS__);	\
712}
713
714extern int spa_mode_global;			/* mode, e.g. FREAD | FWRITE */
715
716#ifdef	__cplusplus
717}
718#endif
719
720#endif	/* _SYS_SPA_H */
721