xref: /illumos-gate/usr/src/uts/common/fs/zfs/sys/dnode.h (revision fa9e4066f08beec538e775443c5be79dd423fcab)
1*fa9e4066Sahrens /*
2*fa9e4066Sahrens  * CDDL HEADER START
3*fa9e4066Sahrens  *
4*fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5*fa9e4066Sahrens  * Common Development and Distribution License, Version 1.0 only
6*fa9e4066Sahrens  * (the "License").  You may not use this file except in compliance
7*fa9e4066Sahrens  * with the License.
8*fa9e4066Sahrens  *
9*fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
11*fa9e4066Sahrens  * See the License for the specific language governing permissions
12*fa9e4066Sahrens  * and limitations under the License.
13*fa9e4066Sahrens  *
14*fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
15*fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
17*fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
18*fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
19*fa9e4066Sahrens  *
20*fa9e4066Sahrens  * CDDL HEADER END
21*fa9e4066Sahrens  */
22*fa9e4066Sahrens /*
23*fa9e4066Sahrens  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*fa9e4066Sahrens  * Use is subject to license terms.
25*fa9e4066Sahrens  */
26*fa9e4066Sahrens 
27*fa9e4066Sahrens #ifndef	_SYS_DNODE_H
28*fa9e4066Sahrens #define	_SYS_DNODE_H
29*fa9e4066Sahrens 
30*fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
31*fa9e4066Sahrens 
32*fa9e4066Sahrens #include <sys/zfs_context.h>
33*fa9e4066Sahrens #include <sys/avl.h>
34*fa9e4066Sahrens #include <sys/spa.h>
35*fa9e4066Sahrens #include <sys/txg.h>
36*fa9e4066Sahrens #include <sys/refcount.h>
37*fa9e4066Sahrens #include <sys/dmu_zfetch.h>
38*fa9e4066Sahrens 
39*fa9e4066Sahrens #ifdef	__cplusplus
40*fa9e4066Sahrens extern "C" {
41*fa9e4066Sahrens #endif
42*fa9e4066Sahrens 
/*
 * Flags.
 *
 * NOTE(review): presumably the values accepted by the `flag' argument of
 * dnode_hold_impl(); confirm against the implementation.
 */
#define	DNODE_MUST_BE_ALLOCATED	1
#define	DNODE_MUST_BE_FREE	2
48*fa9e4066Sahrens 
/*
 * Fixed constants.  Every *SHIFT value is the base-2 logarithm of the
 * quantity it describes; DNODE_CORE_SIZE is a plain byte count.
 */
#define	DNODE_SHIFT		9	/* 512 bytes */
#define	DN_MIN_INDBLKSHIFT	10	/* 1k */
#define	DN_MAX_INDBLKSHIFT	14	/* 16k */
#define	DNODE_BLOCK_SHIFT	14	/* 16k */
#define	DNODE_CORE_SIZE		64	/* 64 bytes for dnode sans blkptrs */
#define	DN_MAX_OBJECT_SHIFT	48	/* 2^48, i.e. 256 Ti (zfs_fid_t limit) */
#define	DN_MAX_OFFSET_SHIFT	64	/* 2^64 bytes in a dnode */
59*fa9e4066Sahrens 
/*
 * Derived constants.  Everything below is computed from the fixed constants
 * in this header plus SPA_BLKPTRSHIFT and SPA_MINBLOCKSHIFT, which this
 * header does not define (NOTE(review): presumably from <sys/spa.h>).
 */

/* Size of one dnode in bytes. */
#define	DNODE_SIZE	(1 << DNODE_SHIFT)
/* Block pointers that fit in a dnode once the fixed core is subtracted. */
#define	DN_MAX_NBLKPTR	((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
/* Bytes left for bonus data after the core and a single block pointer. */
#define	DN_MAX_BONUSLEN	(DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))

/* log2 and count of the dnodes held by one DNODE_BLOCK_SHIFT-sized block. */
#define	DNODES_PER_BLOCK_SHIFT	(DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define	DNODES_PER_BLOCK	(1ULL << DNODES_PER_BLOCK_SHIFT)
/* log2 of the block pointers held by one maximum-size indirect block. */
#define	DNODES_PER_LEVEL_SHIFT	(DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)

/*
 * One level, plus the object-number bits that remain beyond a single block
 * of dnodes divided by the bits resolved per indirect level.  The division
 * is an integer division, so it truncates.
 */
#define	DN_META_DNODE_LEVELS	\
	(1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT -	\
	DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT)

/* The +2 here is a cheesy way to round up */
#define	DN_MAX_LEVELS	(2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
	(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))

/*
 * DN_MAX_NBLKPTR shifted left by the bits covered by one block of dnodes
 * plus (DN_META_DNODE_LEVELS - 1) indirect levels.  The cast keeps the
 * shift in 64-bit arithmetic.
 */
#define	DN_MAX_OBJECT		\
	((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT +	\
	(DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT))
82*fa9e4066Sahrens 
/*
 * Address of the bonus data inside a dnode_phys_t.
 *
 * Only one blkptr_t is declared ahead of dn_bonus, so the computation starts
 * at dn_bonus and skips (dn_nblkptr - 1) * sizeof (blkptr_t) bytes, i.e. the
 * space taken by every block pointer beyond the first.
 *
 * `dnp' is evaluated twice; do not pass an expression with side effects.
 */
#define	DN_BONUS(dnp)	((void*)((dnp)->dn_bonus + \
	(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
85*fa9e4066Sahrens 
/*
 * EPB (presumably "entries per block"): 2^(blkshift - typeshift), i.e. how
 * many items of size 2^typeshift fit in a block of size 2^blkshift.
 *
 * Both arguments are parenthesized so that compound expressions such as
 * EPB(shift, a + b) expand with the intended precedence.
 */
#define	EPB(blkshift, typeshift)	(1 << ((blkshift) - (typeshift)))
87*fa9e4066Sahrens 
/*
 * Forward declarations: this header only uses pointers to these types, so
 * their full definitions are not needed here.
 */
struct dmu_buf_impl;
struct objset_impl;
struct zio;
91*fa9e4066Sahrens 
/*
 * Values for dnode_t's dn_dirtyctx member.
 *
 * NOTE(review): presumably records whether the dnode was first dirtied in
 * open context or in syncing context; confirm against the users of
 * dn_dirtyctx.
 */
enum dnode_dirtycontext {
	DN_UNDIRTIED,
	DN_DIRTY_OPEN,
	DN_DIRTY_SYNC
};
97*fa9e4066Sahrens 
98*fa9e4066Sahrens typedef struct dnode_phys {
99*fa9e4066Sahrens 	uint8_t dn_type;		/* dmu_object_type_t */
100*fa9e4066Sahrens 	uint8_t dn_indblkshift;		/* ln2(indirect block size) */
101*fa9e4066Sahrens 	uint8_t dn_nlevels;		/* 1=dn_blkptr->data blocks */
102*fa9e4066Sahrens 	uint8_t dn_nblkptr;		/* length of dn_blkptr */
103*fa9e4066Sahrens 	uint8_t dn_bonustype;		/* type of data in bonus buffer */
104*fa9e4066Sahrens 	uint8_t	dn_checksum;		/* ZIO_CHECKSUM type */
105*fa9e4066Sahrens 	uint8_t	dn_compress;		/* ZIO_COMPRESS type */
106*fa9e4066Sahrens 	uint8_t dn_pad1[1];
107*fa9e4066Sahrens 	uint16_t dn_datablkszsec;	/* data block size in 512b sectors */
108*fa9e4066Sahrens 	uint16_t dn_bonuslen;		/* length of dn_bonus */
109*fa9e4066Sahrens 	uint8_t dn_pad2[4];
110*fa9e4066Sahrens 
111*fa9e4066Sahrens 	/* accounting is protected by dn_dirty_mtx */
112*fa9e4066Sahrens 	uint64_t dn_maxblkid;		/* largest allocated block ID */
113*fa9e4066Sahrens 	uint64_t dn_secphys;		/* 512b sectors of disk space used */
114*fa9e4066Sahrens 
115*fa9e4066Sahrens 	uint64_t dn_pad3[4];
116*fa9e4066Sahrens 
117*fa9e4066Sahrens 	blkptr_t dn_blkptr[1];
118*fa9e4066Sahrens 	uint8_t dn_bonus[DN_MAX_BONUSLEN];
119*fa9e4066Sahrens } dnode_phys_t;
120*fa9e4066Sahrens 
121*fa9e4066Sahrens typedef struct dnode {
122*fa9e4066Sahrens 	/*
123*fa9e4066Sahrens 	 * lock ordering:
124*fa9e4066Sahrens 	 *
125*fa9e4066Sahrens 	 * db_mtx > dn_dirty_mtx
126*fa9e4066Sahrens 	 * 	dbuf_syncdone
127*fa9e4066Sahrens 	 *
128*fa9e4066Sahrens 	 * dn_struct_rwlock/r > dn_dirty_mtx
129*fa9e4066Sahrens 	 * 	dmu_object_info
130*fa9e4066Sahrens 	 *
131*fa9e4066Sahrens 	 * dn_struct_rwlock/r > db_mtx > dn_dirty_mtx
132*fa9e4066Sahrens 	 * 	dbuf_dirty
133*fa9e4066Sahrens 	 * 	dbuf_setdirty
134*fa9e4066Sahrens 	 *
135*fa9e4066Sahrens 	 * dn_struct_rwlock/w > db_mtx > dn_mtx
136*fa9e4066Sahrens 	 * 	dnode_increase_indirection -> dbuf_find
137*fa9e4066Sahrens 	 * 	dbuf_hold_impl
138*fa9e4066Sahrens 	 * 	dnode_set_bonus
139*fa9e4066Sahrens 	 *
140*fa9e4066Sahrens 	 * dn_struct_rwlock/w > dn_mtx
141*fa9e4066Sahrens 	 * 	dnode_increase_indirection
142*fa9e4066Sahrens 	 *
143*fa9e4066Sahrens 	 * dn_dirty_mtx > dn_mtx
144*fa9e4066Sahrens 	 * 	dnode_buf_pageout
145*fa9e4066Sahrens 	 *
146*fa9e4066Sahrens 	 * db_mtx > dn_mtx
147*fa9e4066Sahrens 	 * 	dbuf_create
148*fa9e4066Sahrens 	 */
149*fa9e4066Sahrens 
150*fa9e4066Sahrens 	/*
151*fa9e4066Sahrens 	 * dn_struct_rwlock protects the structure of the dnode.
152*fa9e4066Sahrens 	 * In particular, it protects the number of levels of indirection.
153*fa9e4066Sahrens 	 */
154*fa9e4066Sahrens 	krwlock_t dn_struct_rwlock;
155*fa9e4066Sahrens 
156*fa9e4066Sahrens 	/*
157*fa9e4066Sahrens 	 * Our link on dataset's dd_dnodes list.
158*fa9e4066Sahrens 	 * Protected by dd_accounting_mtx.
159*fa9e4066Sahrens 	 */
160*fa9e4066Sahrens 	list_node_t dn_link;
161*fa9e4066Sahrens 
162*fa9e4066Sahrens 	/* immutable: */
163*fa9e4066Sahrens 	struct objset_impl *dn_objset;
164*fa9e4066Sahrens 	uint64_t dn_object;
165*fa9e4066Sahrens 	struct dmu_buf_impl *dn_dbuf;
166*fa9e4066Sahrens 	dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
167*fa9e4066Sahrens 
168*fa9e4066Sahrens 	/*
169*fa9e4066Sahrens 	 * Copies of stuff in dn_phys.  They're valid here even before
170*fa9e4066Sahrens 	 * the dnode is first synced.
171*fa9e4066Sahrens 	 */
172*fa9e4066Sahrens 	dmu_object_type_t dn_type;	/* object type (immutable) */
173*fa9e4066Sahrens 	uint8_t dn_bonustype;		/* bonus type (immutable) */
174*fa9e4066Sahrens 	uint16_t dn_bonuslen;		/* bonus length (immutable) */
175*fa9e4066Sahrens 	uint8_t dn_nblkptr;		/* number of blkptrs (immutable) */
176*fa9e4066Sahrens 	uint8_t dn_datablkshift;	/* zero if blksz not power of 2! */
177*fa9e4066Sahrens 	uint32_t dn_datablksz;		/* in bytes */
178*fa9e4066Sahrens 	uint16_t dn_datablkszsec;	/* in 512b sectors */
179*fa9e4066Sahrens 
180*fa9e4066Sahrens 	uint8_t dn_checksum;		/* ZIO_CHECKSUM type */
181*fa9e4066Sahrens 	uint8_t dn_compress;		/* ZIO_COMPRESS type */
182*fa9e4066Sahrens 
183*fa9e4066Sahrens 	/*
184*fa9e4066Sahrens 	 * The following are kept up-to-date in the *open* context, the syncing
185*fa9e4066Sahrens 	 * context should only pay attention to the dn_next_* values.
186*fa9e4066Sahrens 	 */
187*fa9e4066Sahrens 	uint8_t dn_nlevels;
188*fa9e4066Sahrens 	uint8_t dn_indblkshift;
189*fa9e4066Sahrens 
190*fa9e4066Sahrens 	uint8_t dn_next_nlevels[TXG_SIZE];
191*fa9e4066Sahrens 	uint8_t dn_next_indblkshift[TXG_SIZE];
192*fa9e4066Sahrens 
193*fa9e4066Sahrens 	/* protected by os_lock: */
194*fa9e4066Sahrens 	uint32_t dn_dirtyblksz[TXG_SIZE];	/* dirty block size in bytes */
195*fa9e4066Sahrens 	list_node_t dn_dirty_link[TXG_SIZE];	/* next on dataset's dirty */
196*fa9e4066Sahrens 
197*fa9e4066Sahrens 	/* protected by dn_mtx: */
198*fa9e4066Sahrens 	kmutex_t dn_mtx;
199*fa9e4066Sahrens 	list_t dn_dirty_dbufs[TXG_SIZE];
200*fa9e4066Sahrens 	uint64_t dn_maxblkid;
201*fa9e4066Sahrens 	avl_tree_t dn_ranges[TXG_SIZE];
202*fa9e4066Sahrens 	uint64_t dn_allocated_txg;
203*fa9e4066Sahrens 	uint64_t dn_free_txg;
204*fa9e4066Sahrens 	uint64_t dn_assigned_txg;
205*fa9e4066Sahrens 	struct dmu_tx *dn_assigned_tx;		/* if only one tx cares */
206*fa9e4066Sahrens 	kcondvar_t dn_notxholds;
207*fa9e4066Sahrens 	enum dnode_dirtycontext dn_dirtyctx;
208*fa9e4066Sahrens 	uint8_t *dn_dirtyctx_firstset;		/* dbg: contents meaningless */
209*fa9e4066Sahrens 
210*fa9e4066Sahrens 	/* protected by own devices */
211*fa9e4066Sahrens 	refcount_t dn_tx_holds;
212*fa9e4066Sahrens 	refcount_t dn_holds;
213*fa9e4066Sahrens 
214*fa9e4066Sahrens 	kmutex_t dn_dbufs_mtx;
215*fa9e4066Sahrens 	list_t dn_dbufs;		/* linked list of descendent dbuf_t's */
216*fa9e4066Sahrens 	kcondvar_t dn_evicted;		/* a child dbuf has been evicted */
217*fa9e4066Sahrens 
218*fa9e4066Sahrens 	/*
219*fa9e4066Sahrens 	 * Performance hack: whenever we have a hold on the bonus buffer of a
220*fa9e4066Sahrens 	 * ZAP object, we will also have a hold on db0.  This will keep the
221*fa9e4066Sahrens 	 * meta-data for a micro-zap object cached as long as the znode for the
222*fa9e4066Sahrens 	 * object is in the znode cache.
223*fa9e4066Sahrens 	 */
224*fa9e4066Sahrens 	struct dmu_buf_impl *dn_db0;
225*fa9e4066Sahrens 
226*fa9e4066Sahrens 	/* holds prefetch structure */
227*fa9e4066Sahrens 	struct zfetch	dn_zfetch;
228*fa9e4066Sahrens } dnode_t;
229*fa9e4066Sahrens 
230*fa9e4066Sahrens typedef struct free_range {
231*fa9e4066Sahrens 	avl_node_t fr_node;
232*fa9e4066Sahrens 	uint64_t fr_blkid;
233*fa9e4066Sahrens 	uint64_t fr_nblks;
234*fa9e4066Sahrens } free_range_t;
235*fa9e4066Sahrens 
236*fa9e4066Sahrens dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
237*fa9e4066Sahrens     uint64_t object);
238*fa9e4066Sahrens void dnode_special_close(dnode_t *dn);
239*fa9e4066Sahrens 
240*fa9e4066Sahrens dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref);
241*fa9e4066Sahrens dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
242*fa9e4066Sahrens     void *ref);
243*fa9e4066Sahrens void dnode_add_ref(dnode_t *dn, void *ref);
244*fa9e4066Sahrens void dnode_rele(dnode_t *dn, void *ref);
245*fa9e4066Sahrens void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
246*fa9e4066Sahrens int dnode_sync(dnode_t *dn, int level, struct zio *zio, dmu_tx_t *tx);
247*fa9e4066Sahrens void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
248*fa9e4066Sahrens     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
249*fa9e4066Sahrens void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
250*fa9e4066Sahrens     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
251*fa9e4066Sahrens void dnode_free(dnode_t *dn, dmu_tx_t *tx);
252*fa9e4066Sahrens void dnode_byteswap(dnode_phys_t *dnp);
253*fa9e4066Sahrens void dnode_buf_byteswap(void *buf, size_t size);
254*fa9e4066Sahrens void dnode_verify(dnode_t *dn);
255*fa9e4066Sahrens int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
256*fa9e4066Sahrens uint64_t dnode_current_max_length(dnode_t *dn);
257*fa9e4066Sahrens uint64_t dnode_max_nonzero_offset(dnode_t *dn);
258*fa9e4066Sahrens void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
259*fa9e4066Sahrens void dnode_clear_range(dnode_t *dn, uint64_t blkid,
260*fa9e4066Sahrens     uint64_t nblks, dmu_tx_t *tx);
261*fa9e4066Sahrens void dnode_diduse_space(dnode_t *dn, int64_t space);
262*fa9e4066Sahrens void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
263*fa9e4066Sahrens void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx);
264*fa9e4066Sahrens uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
265*fa9e4066Sahrens void dnode_init(void);
266*fa9e4066Sahrens void dnode_fini(void);
267*fa9e4066Sahrens int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
268*fa9e4066Sahrens     uint64_t blkfill);
269*fa9e4066Sahrens 
#ifdef ZFS_DEBUG

/*
 * dprintf_dnode(dn, fmt, ...): debug printf tagged with a dnode's object
 * number.
 *
 * When ZFS_DEBUG_DPRINTF is set in zfs_flags, the message is passed to
 * dprintf_ds() for dn->dn_objset->os_dsl_dataset, prefixed with
 * "obj=<object> ".  The object number is printed in decimal, except that
 * DMU_META_DNODE_OBJECT is printed as "mdn".
 *
 * The prefix is joined to fmt by adjacent string-literal concatenation, so
 * fmt must be a string literal.  At least one variadic argument is required
 * because __VA_ARGS__ follows a comma.  `dn' is evaluated more than once.
 *
 * The object number is cast to u_longlong_t (unsigned), hence "%llu".
 */
#define	dprintf_dnode(dn, fmt, ...) do { \
	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
	char __db_buf[32]; \
	uint64_t __db_obj = (dn)->dn_object; \
	if (__db_obj == DMU_META_DNODE_OBJECT) \
		(void) strcpy(__db_buf, "mdn"); \
	else \
		(void) snprintf(__db_buf, sizeof (__db_buf), "%llu", \
		    (u_longlong_t)__db_obj);\
	dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
	    __db_buf, __VA_ARGS__); \
	} \
_NOTE(CONSTCOND) } while (0)

#else

/* Debugging compiled out: expands to nothing; arguments are not evaluated. */
#define	dprintf_dnode(dn, fmt, ...)

#endif
296*fa9e4066Sahrens 
297*fa9e4066Sahrens #ifdef	__cplusplus
298*fa9e4066Sahrens }
299*fa9e4066Sahrens #endif
300*fa9e4066Sahrens 
301*fa9e4066Sahrens #endif	/* _SYS_DNODE_H */
302