xref: /illumos-gate/usr/src/uts/common/fs/zfs/sys/arc_impl.h (revision 9e3493cb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2019, Joyent, Inc.
24  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
27  * Copyright (c) 2020, George Amanakis. All rights reserved.
28  */
29 
30 #ifndef	_SYS_ARC_IMPL_H
31 #define	_SYS_ARC_IMPL_H
32 
33 #include <sys/arc.h>
34 #include <sys/multilist.h>
35 
36 #ifdef __cplusplus
37 extern "C" {
38 #endif
39 
40 /*
41  * Note that buffers can be in one of 6 states:
42  *	ARC_anon	- anonymous (discussed below)
43  *	ARC_mru		- recently used, currently cached
44  *	ARC_mru_ghost	- recently used, no longer in cache
45  *	ARC_mfu		- frequently used, currently cached
46  *	ARC_mfu_ghost	- frequently used, no longer in cache
47  *	ARC_l2c_only	- exists in L2ARC but not other states
48  * When there are no active references to the buffers, they are
49  * linked onto a list in one of these arc states.  These are
50  * the only buffers that can be evicted or deleted.  Within each
51  * state there are multiple lists, one for meta-data and one for
52  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
53  * etc.) is tracked separately so that it can be managed more
54  * explicitly: favored over data, limited explicitly.
55  *
56  * Anonymous buffers are buffers that are not associated with
57  * a DVA.  These are buffers that hold dirty block copies
58  * before they are written to stable storage.  By definition,
59  * they are "ref'd" and are considered part of arc_mru
60  * that cannot be freed.  Generally, they will acquire a DVA
61  * as they are written and migrate onto the arc_mru list.
62  *
63  * The ARC_l2c_only state is for buffers that are in the second
64  * level ARC but no longer in any of the ARC_m* lists.  The second
65  * level ARC itself may also contain buffers that are in any of
66  * the ARC_m* states - meaning that a buffer can exist in two
67  * places.  The reason for the ARC_l2c_only state is to keep the
68  * buffer header in the hash table, so that reads that hit the
69  * second level ARC benefit from these fast lookups.
70  */
71 
72 typedef struct arc_state {
73 	/*
74 	 * lists of evictable buffers, one multilist per buffer content type
75 	 */
76 	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
77 	/*
78 	 * total amount of evictable data in this state, per content type
79 	 */
80 	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
81 	/*
82 	 * total amount of data in this state; this includes: evictable,
83 	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
84 	 */
85 	zfs_refcount_t arcs_size;
86 
87 	arc_state_type_t arcs_state;	/* arc_state_type_t tag for this state */
88 } arc_state_t;
89 
90 typedef struct arc_callback arc_callback_t;
91 /* One read-done callback record; records chain through acb_next. */
92 struct arc_callback {
93 	void			*acb_private;	/* presumably opaque caller data */
94 	arc_read_done_func_t	*acb_done;	/* read-done callback */
95 	arc_buf_t		*acb_buf;
96 	boolean_t		acb_encrypted;
97 	boolean_t		acb_compressed;
98 	boolean_t		acb_noauth;
99 	zbookmark_phys_t	acb_zb;
100 	zio_t			*acb_zio_dummy;
101 	zio_t			*acb_zio_head;
102 	arc_callback_t		*acb_next;	/* next record in the chain */
103 };
104 
105 typedef struct arc_write_callback arc_write_callback_t;
106 /* Callbacks (all arc_write_done_func_t) and buffer for one ARC write. */
107 struct arc_write_callback {
108 	void			*awcb_private;	/* presumably opaque caller data */
109 	arc_write_done_func_t	*awcb_ready;
110 	arc_write_done_func_t	*awcb_children_ready;
111 	arc_write_done_func_t	*awcb_physdone;
112 	arc_write_done_func_t	*awcb_done;
113 	arc_buf_t		*awcb_buf;	/* buffer this write refers to */
114 };
115 
116 /*
117  * ARC buffers are separated into multiple structs as a memory saving measure:
118  *   - Common fields struct, always defined, and embedded within it:
119  *       - L2-only fields, always allocated but undefined when not in L2ARC
120  *       - L1-only fields, only allocated when in L1ARC
121  *
122  *           Buffer in L1                     Buffer only in L2
123  *    +------------------------+          +------------------------+
124  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
125  *    |                        |          |                        |
126  *    |                        |          |                        |
127  *    |                        |          |                        |
128  *    +------------------------+          +------------------------+
129  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
130  *    | (undefined if L1-only) |          |                        |
131  *    +------------------------+          +------------------------+
132  *    | l1arc_buf_hdr_t        |
133  *    |                        |
134  *    |                        |
135  *    |                        |
136  *    |                        |
137  *    +------------------------+
138  *
139  * Because it's possible for the L2ARC to become extremely large, we can wind
140  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
141  * is minimized by only allocating the fields necessary for an L1-cached buffer
142  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
143  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
144  * words in pointers. arc_hdr_realloc() is used to switch a header between
145  * these two allocation states.
146  */
147 typedef struct l1arc_buf_hdr {
148 	kmutex_t		b_freeze_lock;	/* NOTE(review): presumably guards b_freeze_cksum -- confirm */
149 	zio_cksum_t		*b_freeze_cksum;
150 #ifdef ZFS_DEBUG
151 	/*
152 	 * Used for debugging with kmem_flags - by allocating and freeing
153 	 * b_thawed when the buffer is thawed, we get a record of the stack
154 	 * trace that thawed it.
155 	 */
156 	void			*b_thawed;
157 #endif
158 
159 	arc_buf_t		*b_buf;		/* NOTE(review): presumably head of the hdr's arc_buf_t list */
160 	uint32_t		b_bufcnt;	/* NOTE(review): presumably the length of that list -- confirm */
161 	/* for waiting on writes to complete */
162 	kcondvar_t		b_cv;
163 	uint8_t			b_byteswap;
164 
165 	/* protected by arc state mutex */
166 	arc_state_t		*b_state;
167 	multilist_node_t	b_arc_node;
168 
169 	/* updated atomically */
170 	clock_t			b_arc_access;
171 
172 	/* self protecting */
173 	zfs_refcount_t		b_refcnt;
174 
175 	arc_callback_t		*b_acb;		/* arc_callback_t chain (linked via acb_next) */
176 	abd_t			*b_pabd;	/* hdr data; sized by arcstat_compressed_size */
177 } l1arc_buf_hdr_t;
178 
179 typedef enum l2arc_dev_hdr_flags_t {	/* flag bits stored in dh_flags */
180 	L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)	/* mirror of l2ad_first */
181 } l2arc_dev_hdr_flags_t;
182 
183 /*
184  * Pointer to a persistent L2ARC log block (see dh_start_lbps, lb_prev_lbp).
185  */
186 typedef struct l2arc_log_blkptr {
187 	/*
188 	 * Offset of log block within the device, in bytes
189 	 */
190 	uint64_t	lbp_daddr;
191 	/*
192 	 * Aligned payload size (in bytes) of the log block
193 	 */
194 	uint64_t	lbp_payload_asize;
195 	/*
196 	 * Offset in bytes of the first buffer in the payload
197 	 */
198 	uint64_t	lbp_payload_start;
199 	/*
200 	 * lbp_prop packs the following fields (see the L2BLK_* macros):
201 	 *	* logical size (in bytes)
202 	 *	* aligned (after compression) size (in bytes)
203 	 *	* compression algorithm (we always LZ4-compress l2arc logs)
204 	 *	* checksum algorithm (used for lbp_cksum)
205 	 */
206 	uint64_t	lbp_prop;
207 	zio_cksum_t	lbp_cksum;	/* checksum of log */
208 } l2arc_log_blkptr_t;
209 
210 /*
211  * The persistent L2ARC device header; exactly SPA_MINBLOCKSIZE bytes.
212  * Byte order of magic determines whether 64-bit bswap of fields is necessary.
213  */
214 typedef struct l2arc_dev_hdr_phys {
215 	uint64_t	dh_magic;	/* L2ARC_DEV_HDR_MAGIC */
216 	uint64_t	dh_version;	/* Persistent L2ARC version */
217 
218 	/*
219 	 * Global L2ARC device state and metadata.
220 	 */
221 	uint64_t	dh_spa_guid;
222 	uint64_t	dh_vdev_guid;
223 	uint64_t	dh_log_entries;		/* mirror of l2ad_log_entries */
224 	uint64_t	dh_evict;		/* evicted offset in bytes */
225 	uint64_t	dh_flags;		/* l2arc_dev_hdr_flags_t */
226 	/*
227 	 * Used in zdb.c for determining if a log block is valid, in the same
228 	 * way that l2arc_rebuild() does.
229 	 */
230 	uint64_t	dh_start;		/* mirror of l2ad_start */
231 	uint64_t	dh_end;			/* mirror of l2ad_end */
232 	/*
233 	 * Start of log block chain. [0] -> newest log, [1] -> one older (used
234 	 * for initiating prefetch).
235 	 */
236 	l2arc_log_blkptr_t	dh_start_lbps[2];
237 	/*
238 	 * Aligned size of all log blocks as accounted by vdev_space_update().
239 	 */
240 	uint64_t	dh_lb_asize;		/* mirror of l2ad_lb_asize */
241 	uint64_t	dh_lb_count;		/* mirror of l2ad_lb_count */
242 	const uint64_t		dh_pad[32];	/* pad struct to 512 bytes */
243 	zio_eck_t		dh_tail;
244 } l2arc_dev_hdr_phys_t;
245 CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
246 
247 /*
248  * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
249  */
250 typedef struct l2arc_log_ent_phys {
251 	dva_t			le_dva;		/* dva of buffer */
252 	uint64_t		le_birth;	/* birth txg of buffer */
253 	/*
254 	 * le_prop packs the following fields (see the L2BLK_* macros):
255 	 *	* logical size (in bytes)
256 	 *	* physical (compressed) size (in bytes)
257 	 *	* compression algorithm
258 	 *	* object type (used to restore arc_buf_contents_t)
259 	 *	* protected status (used for encryption)
260 	 *	* prefetch status (used in l2arc_read_done())
261 	 */
262 	uint64_t		le_prop;
263 	uint64_t		le_daddr;	/* buf location on l2dev */
264 	/*
265 	 * We pad the size of each entry to a power of 2 so that the size of
266 	 * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
267 	 * because of the L2BLK_SET_*SIZE macros.
268 	 */
269 	const uint64_t		le_pad[3];	/* pad to 64 bytes	 */
270 } l2arc_log_ent_phys_t;
271 
272 #define	L2ARC_LOG_BLK_MAX_ENTRIES	(1022)	/* length of lb_entries[] */
273 
274 /*
275  * A log block of up to L2ARC_LOG_BLK_MAX_ENTRIES ARC buffer log entries,
276  * chained into the persistent L2ARC metadata linked list. Byte order of
277  * magic determines whether 64-bit bswap of fields is necessary.
278  */
279 typedef struct l2arc_log_blk_phys {
280 	uint64_t		lb_magic;	/* L2ARC_LOG_BLK_MAGIC */
281 	/*
282 	 * There are 2 chains (headed by dh_start_lbps[2]), and this field
283 	 * points back to the previous block in this chain. We alternate
284 	 * which chain we append to, so they are time-wise and offset-wise
285 	 * interleaved, but that is an optimization rather than for
286 	 * correctness.
287 	 */
288 	l2arc_log_blkptr_t	lb_prev_lbp;	/* pointer to prev log block */
289 	/*
290 	 * Pad header section (lb_magic + lb_prev_lbp + lb_pad) to 128 bytes
291 	 */
292 	uint64_t		lb_pad[7];
293 	/* Payload */
294 	l2arc_log_ent_phys_t	lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
295 } l2arc_log_blk_phys_t;				/* 64K total */
296 /*
297  * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
298  * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
299  */
300 CTASSERT(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
301     1ULL << SPA_MINBLOCKSHIFT));
302 CTASSERT(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
303 CTASSERT(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
304 
305 /*
306  * These structures hold in-flight abd buffers for log blocks as they're being
307  * written to the L2ARC device.
308  */
309 typedef struct l2arc_lb_abd_buf {
310 	abd_t		*abd;	/* abd of one in-flight log block */
311 	list_node_t	node;	/* list linkage (presumably l2wcb_abd_list) */
312 } l2arc_lb_abd_buf_t;
313 
314 /*
315  * These structures hold pointers to log blocks present on the L2ARC device.
316  */
317 typedef struct l2arc_lb_ptr_buf {
318 	l2arc_log_blkptr_t	*lb_ptr;	/* pointer to one log block */
319 	list_node_t		node;	/* list linkage (presumably l2ad_lbptr_list) */
320 } l2arc_lb_ptr_buf_t;
321 
322 /* Macros for getting and setting the fields packed into le_prop and lbp_prop */
323 #define	L2BLK_GET_LSIZE(field)	\
324 	BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
325 #define	L2BLK_SET_LSIZE(field, x)	\
326 	BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
327 #define	L2BLK_GET_PSIZE(field)	\
328 	BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
329 #define	L2BLK_SET_PSIZE(field, x)	\
330 	BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
331 #define	L2BLK_GET_COMPRESS(field)	\
332 	BF64_GET((field), 32, SPA_COMPRESSBITS)
333 #define	L2BLK_SET_COMPRESS(field, x)	\
334 	BF64_SET((field), 32, SPA_COMPRESSBITS, x)
335 #define	L2BLK_GET_PREFETCH(field)	BF64_GET((field), 39, 1)
336 #define	L2BLK_SET_PREFETCH(field, x)	BF64_SET((field), 39, 1, x)
337 #define	L2BLK_GET_CHECKSUM(field)	BF64_GET((field), 40, 8)
338 #define	L2BLK_SET_CHECKSUM(field, x)	BF64_SET((field), 40, 8, x)
339 #define	L2BLK_GET_TYPE(field)		BF64_GET((field), 48, 8)
340 #define	L2BLK_SET_TYPE(field, x)	BF64_SET((field), 48, 8, x)
341 #define	L2BLK_GET_PROTECTED(field)	BF64_GET((field), 56, 1)
342 #define	L2BLK_SET_PROTECTED(field, x)	BF64_SET((field), 56, 1, x)
343 #define	L2BLK_GET_STATE(field)		BF64_GET((field), 57, 4)
344 #define	L2BLK_SET_STATE(field, x)	BF64_SET((field), 57, 4, x)
345 
346 #define	PTR_SWAP(x, y)	/* swap two pointer lvalues; x and y are each evaluated twice */ \
347 	do {			\
348 		void *ptr_swap_tmp = (x); /* distinctive name: a caller's "tmp" is not captured */ \
349 		(x) = (y);	\
350 		(y) = ptr_swap_tmp;	\
351 		_NOTE(CONSTCOND)\
352 	} while (0)
353 
354 #define	L2ARC_DEV_HDR_MAGIC	0x5a46534341434845LLU	/* dh_magic; ASCII: "ZFSCACHE" */
355 #define	L2ARC_LOG_BLK_MAGIC	0x4c4f47424c4b4844LLU	/* lb_magic; ASCII: "LOGBLKHD" */
356 
357 /*
358  * L2ARC Internals: in-core state kept for one L2ARC device.
359  */
360 typedef struct l2arc_dev {
361 	vdev_t			*l2ad_vdev;	/* vdev */
362 	spa_t			*l2ad_spa;	/* spa */
363 	uint64_t		l2ad_hand;	/* next write location */
364 	uint64_t		l2ad_start;	/* first addr on device */
365 	uint64_t		l2ad_end;	/* last addr on device */
366 	boolean_t		l2ad_first;	/* first sweep through */
367 	boolean_t		l2ad_writing;	/* currently writing */
368 	kmutex_t		l2ad_mtx;	/* lock for buffer list */
369 	list_t			l2ad_buflist;	/* buffer list */
370 	list_node_t		l2ad_node;	/* device list node */
371 	zfs_refcount_t		l2ad_alloc;	/* allocated bytes */
372 	/*
373 	 * Persistence-related state
374 	 */
375 	l2arc_dev_hdr_phys_t	*l2ad_dev_hdr;	/* persistent device header */
376 	uint64_t		l2ad_dev_hdr_asize; /* aligned hdr size */
377 	l2arc_log_blk_phys_t	l2ad_log_blk;	/* currently open log block (embedded by value) */
378 	int			l2ad_log_ent_idx; /* index into cur log blk */
379 	/* Number of bytes in current log block's payload */
380 	uint64_t		l2ad_log_blk_payload_asize;
381 	/*
382 	 * Offset (in bytes) of the first buffer in current log block's
383 	 * payload.
384 	 */
385 	uint64_t		l2ad_log_blk_payload_start;
386 	/* Flag indicating whether a rebuild is scheduled or is going on */
387 	boolean_t		l2ad_rebuild;
388 	boolean_t		l2ad_rebuild_cancel;	/* NOTE(review): presumably requests that a rebuild stop -- confirm */
389 	boolean_t		l2ad_rebuild_began;	/* NOTE(review): presumably set once a rebuild has started -- confirm */
390 	uint64_t		l2ad_log_entries;   /* entries per log blk  */
391 	uint64_t		l2ad_evict;	 /* evicted offset in bytes */
392 	/* List of pointers to log blocks present in the L2ARC device */
393 	list_t			l2ad_lbptr_list;
394 	/*
395 	 * Aligned size of all log blocks as accounted by vdev_space_update().
396 	 */
397 	zfs_refcount_t		l2ad_lb_asize;
398 	/*
399 	 * Number of log blocks present on the device.
400 	 */
401 	zfs_refcount_t		l2ad_lb_count;
402 } l2arc_dev_t;
403 
404 /*
405  * Encrypted blocks will need to be stored encrypted on the L2ARC
406  * disk as they appear in the main pool. In order for this to work we
407  * need to pass around the encryption parameters so they can be used
408  * to write data to the L2ARC. This struct is only defined in the
409  * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
410  * flag set.
411  */
412 typedef struct arc_buf_hdr_crypt {
413 	abd_t		*b_rabd;		/* raw encrypted data */
414 	dmu_object_type_t	b_ot;		/* object type */
415 	uint32_t		b_ebufcnt;	/* number of encrypted buffers */
416 
417 	/* dsobj for looking up encryption key for l2arc encryption */
418 	uint64_t		b_dsobj;	/* for looking up key */
419 
420 	/* encryption parameters */
421 	uint8_t		b_salt[ZIO_DATA_SALT_LEN];
422 	uint8_t		b_iv[ZIO_DATA_IV_LEN];
423 
424 	/*
425 	 * Technically this could be removed since we will always be able to
426 	 * get the mac from the bp when we need it. However, it is inconvenient
427 	 * for callers of arc code to have to pass a bp in all the time. This
428 	 * also allows us to assert that L2ARC data is properly encrypted to
429 	 * match the data in the main storage pool.
430 	 */
431 	uint8_t		b_mac[ZIO_DATA_MAC_LEN];
432 } arc_buf_hdr_crypt_t;
433 
434 typedef struct l2arc_buf_hdr {
435 	/* protected by arc_buf_hdr mutex */
436 	l2arc_dev_t		*b_dev;		/* L2ARC device */
437 	uint64_t		b_daddr;	/* disk address, offset byte */
438 
439 	arc_state_type_t	b_arcs_state;	/* an ARC state type; NOTE(review): confirm when it is recorded */
440 	list_node_t		b_l2node;	/* list linkage (presumably l2ad_buflist) */
441 } l2arc_buf_hdr_t;
442 
443 typedef struct l2arc_write_callback {
444 	l2arc_dev_t	*l2wcb_dev;		/* device info */
445 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
446 	/* in-flight list of log blocks (presumably l2arc_lb_abd_buf_t nodes) */
447 	list_t		l2wcb_abd_list;
448 } l2arc_write_callback_t;
449 
450 struct arc_buf_hdr {
451 	/* protected by hash lock */
452 	dva_t			b_dva;
453 	uint64_t		b_birth;
454 
455 	arc_buf_contents_t	b_type;
456 	arc_buf_hdr_t		*b_hash_next;	/* presumably the next hdr on the same hash chain */
457 	arc_flags_t		b_flags;
458 
459 	/*
460 	 * This field stores the size of the data buffer after
461 	 * compression, and is set in the arc's zio completion handlers.
462 	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
463 	 *
464 	 * While the block pointers can store up to 32MB in their psize
465 	 * field, we can only store up to 32MB minus 512B. This is due
466 	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
467 	 * a field of zeros represents 512B in the bp). We can't use a
468 	 * bias of 1 since we need to reserve a psize of zero, here, to
469 	 * represent holes and embedded blocks.
470 	 *
471 	 * This isn't a problem in practice, since the maximum size of a
472 	 * buffer is limited to 16MB, so we never need to store 32MB in
473 	 * this field.
474 	 */
475 	uint16_t		b_psize;
476 
477 	/*
478 	 * This field stores the size of the data buffer before
479 	 * compression, and cannot change once set. It is in units
480 	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes).
481 	 */
482 	uint16_t		b_lsize;	/* immutable */
483 	uint64_t		b_spa;		/* immutable */
484 
485 	/* L2ARC fields. Undefined when not in L2ARC. */
486 	l2arc_buf_hdr_t		b_l2hdr;
487 	/* L1ARC fields. Undefined when in l2arc_only state */
488 	l1arc_buf_hdr_t		b_l1hdr;
489 	/*
490 	 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
491 	 * is set and the L1 header exists.
492 	 */
493 	arc_buf_hdr_crypt_t b_crypt_hdr;
494 };
495 
496 typedef struct arc_stats {
497 	kstat_named_t arcstat_hits;
498 	kstat_named_t arcstat_misses;
499 	kstat_named_t arcstat_demand_data_hits;
500 	kstat_named_t arcstat_demand_data_misses;
501 	kstat_named_t arcstat_demand_metadata_hits;
502 	kstat_named_t arcstat_demand_metadata_misses;
503 	kstat_named_t arcstat_prefetch_data_hits;
504 	kstat_named_t arcstat_prefetch_data_misses;
505 	kstat_named_t arcstat_prefetch_metadata_hits;
506 	kstat_named_t arcstat_prefetch_metadata_misses;
507 	kstat_named_t arcstat_mru_hits;
508 	kstat_named_t arcstat_mru_ghost_hits;
509 	kstat_named_t arcstat_mfu_hits;
510 	kstat_named_t arcstat_mfu_ghost_hits;
511 	kstat_named_t arcstat_deleted;
512 	/*
513 	 * Number of buffers that could not be evicted because the hash lock
514 	 * was held by another thread.  The lock may not necessarily be held
515 	 * by something using the same buffer, since hash locks are shared
516 	 * by multiple buffers.
517 	 */
518 	kstat_named_t arcstat_mutex_miss;
519 	/*
520 	 * Number of buffers skipped when updating the access state due to the
521 	 * header having already been released after acquiring the hash lock.
522 	 */
523 	kstat_named_t arcstat_access_skip;
524 	/*
525 	 * Number of buffers skipped because they have I/O in progress, are
526 	 * indirect prefetch buffers that have not lived long enough, or are
527 	 * not from the spa we're trying to evict from.
528 	 */
529 	kstat_named_t arcstat_evict_skip;
530 	/*
531 	 * Number of times arc_evict_state() was unable to evict enough
532 	 * buffers to reach its target amount.
533 	 */
534 	kstat_named_t arcstat_evict_not_enough;
535 	kstat_named_t arcstat_evict_l2_cached;
536 	kstat_named_t arcstat_evict_l2_eligible;
537 	kstat_named_t arcstat_evict_l2_eligible_mfu;
538 	kstat_named_t arcstat_evict_l2_eligible_mru;
539 	kstat_named_t arcstat_evict_l2_ineligible;
540 	kstat_named_t arcstat_evict_l2_skip;
541 	kstat_named_t arcstat_hash_elements;
542 	kstat_named_t arcstat_hash_elements_max;
543 	kstat_named_t arcstat_hash_collisions;
544 	kstat_named_t arcstat_hash_chains;
545 	kstat_named_t arcstat_hash_chain_max;
546 	kstat_named_t arcstat_p;	/* target size of MRU (arc_p) */
547 	kstat_named_t arcstat_c;	/* target size of cache (arc_c) */
548 	kstat_named_t arcstat_c_min;	/* min target cache size (arc_c_min) */
549 	kstat_named_t arcstat_c_max;	/* max target cache size (arc_c_max) */
550 	/* Not updated directly; only synced in arc_kstat_update. */
551 	kstat_named_t arcstat_size;
552 	/*
553 	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
554 	 * Note that the compressed bytes may match the uncompressed bytes
555 	 * if the block is either not compressed or compressed arc is disabled.
556 	 */
557 	kstat_named_t arcstat_compressed_size;
558 	/*
559 	 * Uncompressed size of the data stored in b_pabd. If compressed
560 	 * arc is disabled then this value will be identical to the stat
561 	 * above.
562 	 */
563 	kstat_named_t arcstat_uncompressed_size;
564 	/*
565 	 * Number of bytes stored in all the arc_buf_t's. This is classified
566 	 * as "overhead" since this data is typically short-lived and will
567 	 * be evicted from the arc when it becomes unreferenced unless the
568 	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
569 	 * values have been set (see comment in dbuf.c for more information).
570 	 */
571 	kstat_named_t arcstat_overhead_size;
572 	/*
573 	 * Number of bytes consumed by internal ARC structures necessary
574 	 * for tracking purposes; these structures are not actually
575 	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
576 	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
577 	 * caches), and arc_buf_t structures (allocated via arc_buf_t
578 	 * cache).
579 	 * Not updated directly; only synced in arc_kstat_update.
580 	 */
581 	kstat_named_t arcstat_hdr_size;
582 	/*
583 	 * Number of bytes consumed by ARC buffers of type equal to
584 	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
585 	 * on disk user data (e.g. plain file contents).
586 	 * Not updated directly; only synced in arc_kstat_update.
587 	 */
588 	kstat_named_t arcstat_data_size;
589 	/*
590 	 * Number of bytes consumed by ARC buffers of type equal to
591 	 * ARC_BUFC_METADATA. This is generally consumed by buffers
592 	 * backing on disk data that is used for internal ZFS
593 	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
594 	 * Not updated directly; only synced in arc_kstat_update.
595 	 */
596 	kstat_named_t arcstat_metadata_size;
597 	/*
598 	 * Number of bytes consumed by various buffers and structures
599 	 * not actually backed with ARC buffers. This includes bonus
600 	 * buffers (allocated directly via zio_buf_* functions),
601 	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
602 	 * cache), and dnode_t structures (allocated via dnode_t cache).
603 	 * Not updated directly; only synced in arc_kstat_update.
604 	 */
605 	kstat_named_t arcstat_other_size;
606 	/*
607 	 * Total number of bytes consumed by ARC buffers residing in the
608 	 * arc_anon state. This includes *all* buffers in the arc_anon
609 	 * state; e.g. data, metadata, evictable, and unevictable buffers
610 	 * are all included in this value.
611 	 * Not updated directly; only synced in arc_kstat_update.
612 	 */
613 	kstat_named_t arcstat_anon_size;
614 	/*
615 	 * Number of bytes consumed by ARC buffers that meet the
616 	 * following criteria: backing buffers of type ARC_BUFC_DATA,
617 	 * residing in the arc_anon state, and are eligible for eviction
618 	 * (e.g. have no outstanding holds on the buffer).
619 	 * Not updated directly; only synced in arc_kstat_update.
620 	 */
621 	kstat_named_t arcstat_anon_evictable_data;
622 	/*
623 	 * Number of bytes consumed by ARC buffers that meet the
624 	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
625 	 * residing in the arc_anon state, and are eligible for eviction
626 	 * (e.g. have no outstanding holds on the buffer).
627 	 * Not updated directly; only synced in arc_kstat_update.
628 	 */
629 	kstat_named_t arcstat_anon_evictable_metadata;
630 	/*
631 	 * Total number of bytes consumed by ARC buffers residing in the
632 	 * arc_mru state. This includes *all* buffers in the arc_mru
633 	 * state; e.g. data, metadata, evictable, and unevictable buffers
634 	 * are all included in this value.
635 	 * Not updated directly; only synced in arc_kstat_update.
636 	 */
637 	kstat_named_t arcstat_mru_size;
638 	/*
639 	 * Number of bytes consumed by ARC buffers that meet the
640 	 * following criteria: backing buffers of type ARC_BUFC_DATA,
641 	 * residing in the arc_mru state, and are eligible for eviction
642 	 * (e.g. have no outstanding holds on the buffer).
643 	 * Not updated directly; only synced in arc_kstat_update.
644 	 */
645 	kstat_named_t arcstat_mru_evictable_data;
646 	/*
647 	 * Number of bytes consumed by ARC buffers that meet the
648 	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
649 	 * residing in the arc_mru state, and are eligible for eviction
650 	 * (e.g. have no outstanding holds on the buffer).
651 	 * Not updated directly; only synced in arc_kstat_update.
652 	 */
653 	kstat_named_t arcstat_mru_evictable_metadata;
654 	/*
655 	 * Total number of bytes that *would have been* consumed by ARC
656 	 * buffers in the arc_mru_ghost state. The key thing to note
657 	 * here, is the fact that this size doesn't actually indicate
658 	 * RAM consumption. The ghost lists only consist of headers and
659 	 * don't actually have ARC buffers linked off of these headers.
660 	 * Thus, *if* the headers had associated ARC buffers, these
661 	 * buffers *would have* consumed this number of bytes.
662 	 * Not updated directly; only synced in arc_kstat_update.
663 	 */
664 	kstat_named_t arcstat_mru_ghost_size;
665 	/*
666 	 * Number of bytes that *would have been* consumed by ARC
667 	 * buffers that are eligible for eviction, of type
668 	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
669 	 * Not updated directly; only synced in arc_kstat_update.
670 	 */
671 	kstat_named_t arcstat_mru_ghost_evictable_data;
672 	/*
673 	 * Number of bytes that *would have been* consumed by ARC
674 	 * buffers that are eligible for eviction, of type
675 	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
676 	 * Not updated directly; only synced in arc_kstat_update.
677 	 */
678 	kstat_named_t arcstat_mru_ghost_evictable_metadata;
679 	/*
680 	 * Total number of bytes consumed by ARC buffers residing in the
681 	 * arc_mfu state. This includes *all* buffers in the arc_mfu
682 	 * state; e.g. data, metadata, evictable, and unevictable buffers
683 	 * are all included in this value.
684 	 * Not updated directly; only synced in arc_kstat_update.
685 	 */
686 	kstat_named_t arcstat_mfu_size;
687 	/*
688 	 * Number of bytes consumed by ARC buffers that are eligible for
689 	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
690 	 * state.
691 	 * Not updated directly; only synced in arc_kstat_update.
692 	 */
693 	kstat_named_t arcstat_mfu_evictable_data;
694 	/*
695 	 * Number of bytes consumed by ARC buffers that are eligible for
696 	 * eviction, of type ARC_BUFC_METADATA, and reside in the
697 	 * arc_mfu state.
698 	 * Not updated directly; only synced in arc_kstat_update.
699 	 */
700 	kstat_named_t arcstat_mfu_evictable_metadata;
701 	/*
702 	 * Total number of bytes that *would have been* consumed by ARC
703 	 * buffers in the arc_mfu_ghost state. See the comment above
704 	 * arcstat_mru_ghost_size for more details.
705 	 * Not updated directly; only synced in arc_kstat_update.
706 	 */
707 	kstat_named_t arcstat_mfu_ghost_size;
708 	/*
709 	 * Number of bytes that *would have been* consumed by ARC
710 	 * buffers that are eligible for eviction, of type
711 	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
712 	 * Not updated directly; only synced in arc_kstat_update.
713 	 */
714 	kstat_named_t arcstat_mfu_ghost_evictable_data;
715 	/*
716 	 * Number of bytes that *would have been* consumed by ARC
717 	 * buffers that are eligible for eviction, of type
718 	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
719 	 * Not updated directly; only synced in arc_kstat_update.
720 	 */
721 	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
722 	kstat_named_t arcstat_l2_hits;
723 	kstat_named_t arcstat_l2_misses;
724 	/*
725 	 * Allocated size (in bytes) of L2ARC cached buffers by ARC state.
726 	 */
727 	kstat_named_t arcstat_l2_prefetch_asize;
728 	kstat_named_t arcstat_l2_mru_asize;
729 	kstat_named_t arcstat_l2_mfu_asize;
730 	/*
731 	 * Allocated size (in bytes) of L2ARC cached buffers by buffer content
732 	 * type.
733 	 */
734 	kstat_named_t arcstat_l2_bufc_data_asize;
735 	kstat_named_t arcstat_l2_bufc_metadata_asize;
736 	kstat_named_t arcstat_l2_feeds;
737 	kstat_named_t arcstat_l2_rw_clash;
738 	kstat_named_t arcstat_l2_read_bytes;
739 	kstat_named_t arcstat_l2_write_bytes;
740 	kstat_named_t arcstat_l2_writes_sent;
741 	kstat_named_t arcstat_l2_writes_done;
742 	kstat_named_t arcstat_l2_writes_error;
743 	kstat_named_t arcstat_l2_writes_lock_retry;
744 	kstat_named_t arcstat_l2_evict_lock_retry;
745 	kstat_named_t arcstat_l2_evict_reading;
746 	kstat_named_t arcstat_l2_evict_l1cached;
747 	kstat_named_t arcstat_l2_free_on_write;
748 	kstat_named_t arcstat_l2_abort_lowmem;
749 	kstat_named_t arcstat_l2_cksum_bad;
750 	kstat_named_t arcstat_l2_io_error;
751 	kstat_named_t arcstat_l2_lsize;
752 	kstat_named_t arcstat_l2_psize;
753 	/* Not updated directly; only synced in arc_kstat_update. */
754 	kstat_named_t arcstat_l2_hdr_size;
755 	/*
756 	 * Number of L2ARC log blocks written. These are used for restoring the
757 	 * L2ARC. Updated during writing of L2ARC log blocks.
758 	 */
759 	kstat_named_t arcstat_l2_log_blk_writes;
760 	/*
761 	 * Moving average of the aligned size of the L2ARC log blocks, in
762 	 * bytes. Updated during L2ARC rebuild and during writing of L2ARC
763 	 * log blocks.
764 	 */
765 	kstat_named_t arcstat_l2_log_blk_avg_asize;
766 	/* Aligned size of L2ARC log blocks on L2ARC devices. */
767 	kstat_named_t arcstat_l2_log_blk_asize;
768 	/* Number of L2ARC log blocks present on L2ARC devices. */
769 	kstat_named_t arcstat_l2_log_blk_count;
770 	/*
771 	 * Moving average of the aligned size of L2ARC restored data, in bytes,
772 	 * to the aligned size of their metadata in L2ARC, in bytes.
773 	 * Updated during L2ARC rebuild and during writing of L2ARC log blocks.
774 	 */
775 	kstat_named_t arcstat_l2_data_to_meta_ratio;
776 	/*
777 	 * Number of times the L2ARC rebuild was successful for an L2ARC device.
778 	 */
779 	kstat_named_t arcstat_l2_rebuild_success;
780 	/*
781 	 * Number of times the L2ARC rebuild failed because the device header
782 	 * was in an unsupported format or corrupted.
783 	 */
784 	kstat_named_t arcstat_l2_rebuild_abort_unsupported;
785 	/*
786 	 * Number of times the L2ARC rebuild failed because of IO errors
787 	 * while reading a log block.
788 	 */
789 	kstat_named_t arcstat_l2_rebuild_abort_io_errors;
790 	/*
791 	 * Number of times the L2ARC rebuild failed because of IO errors when
792 	 * reading the device header.
793 	 */
794 	kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
795 	/*
796 	 * Number of L2ARC log blocks which failed to be restored due to
797 	 * checksum errors.
798 	 */
799 	kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
800 	/*
801 	 * Number of times the L2ARC rebuild was aborted due to low system
802 	 * memory.
803 	 */
804 	kstat_named_t arcstat_l2_rebuild_abort_lowmem;
805 	/* Logical size of L2ARC restored data, in bytes. */
806 	kstat_named_t arcstat_l2_rebuild_size;
807 	/* Aligned size of L2ARC restored data, in bytes. */
808 	kstat_named_t arcstat_l2_rebuild_asize;
809 	/*
810 	 * Number of L2ARC log entries (buffers) that were successfully
811 	 * restored in ARC.
812 	 */
813 	kstat_named_t arcstat_l2_rebuild_bufs;
814 	/*
815 	 * Number of L2ARC log entries (buffers) already cached in ARC. These
816 	 * were not restored again.
817 	 */
818 	kstat_named_t arcstat_l2_rebuild_bufs_precached;
819 	/*
820 	 * Number of L2ARC log blocks that were restored successfully. Each
821 	 * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
822 	 */
823 	kstat_named_t arcstat_l2_rebuild_log_blks;
824 	kstat_named_t arcstat_memory_throttle_count;
825 	/* Not updated directly; only synced in arc_kstat_update. */
826 	kstat_named_t arcstat_meta_used;
827 	kstat_named_t arcstat_meta_limit;	/* max size for metadata (arc_meta_limit) */
828 	kstat_named_t arcstat_meta_max;	/* max size of metadata (arc_meta_max) */
829 	kstat_named_t arcstat_meta_min;	/* min size for metadata (arc_meta_min) */
830 	kstat_named_t arcstat_async_upgrade_sync;
831 	kstat_named_t arcstat_demand_hit_predictive_prefetch;
832 	kstat_named_t arcstat_demand_hit_prescient_prefetch;
833 } arc_stats_t;
834 
835 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)	/* the stat's ui64 value, as an lvalue */
836 /* Add val to the named stat's ui64 value via atomic_add_64(). */
837 #define	ARCSTAT_INCR(stat, val) \
838 	atomic_add_64(&arc_stats.stat.value.ui64, (val))
839 
840 #define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)	/* stat += 1 */
841 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)	/* stat -= 1 */
842 
843 /*
844  * There are several ARC variables that are critical to export as kstats --
845  * but we don't want to have to grovel around in the kstat whenever we wish to
846  * manipulate them.  For these variables, we therefore define them to be in
847  * terms of the statistic variable.  This ensures that we are not introducing
848  * the possibility of inconsistency by having shadow copies of the variables,
849  * while still allowing the code to be readable.
850  */
851 #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
852 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
853 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
854 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
855 #define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
856 #define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
857 #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
858 
859 /* compressed size of entire arc */
860 #define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
861 /* uncompressed size of entire arc */
862 #define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
863 /* number of bytes in the arc from arc_buf_t's (see arcstat_overhead_size) */
864 #define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)
865 
866 extern arc_stats_t arc_stats;	/* the object the ARCSTAT*() macros operate on */
867 
868 /* used in zdb.c; dev's l2arc_dev_t is not const, lbp is read-only */
869 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
870     const l2arc_log_blkptr_t *lbp);
871 
872 #ifdef __cplusplus
873 }
874 #endif
875 
876 #endif /* _SYS_ARC_IMPL_H */
877